2014-04-22 21:28:57

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 00/17] nfs: support multiple requests per page

This patchset changes the read and write paths to be more flexible in dealing
with requests that are not page aligned. Until now there was a 1:1 mapping
of struct nfs_page (referred to as "nfs requests") to struct page, which
limited the client to page aligned I/O in several pNFS scenarios.

This patchset allows multiple requests per page, loosely following
the approach taken with struct buffer_head (part of kernel bio interface).

With this patchset the client now supports:
- non-page-aligned O_DIRECT I/O to DSes (instead of reverting to MDS)
- arbitrary pnfs layout segment boundaries
- arbitrary pnfs filelayout stripe sizes

This patchset also includes a lot of cleanup - notably we no longer need
a separate code path to support rsize/wsize < PAGE_SIZE.

This new approach opens the door to many optimizations, such as not having to
flush a page on a non-contiguous write, but for the time being we are focusing
on correctness -- this patchset touches the read and write path for *all*
versions of NFS!

This has been tested against v2, v3, v4.0 and v4.1 (no pnfs) servers with
different rsize/wsize settings, and against pynfs filelayout servers hacked to
have non page aligned stripe sizes.

I had some code review already (with changes applied) and we've been testing
this pretty extensively for the last month+ - focusing mostly on v2, v3, v4.x
(no pnfs).

The patchset applies against Trond's testing branch, but should also include
the fix I posted earlier today: "pnfs: fix race in filelayout commit path"
as the race seems to be easier to hit with this patchset applied.

I'm pretty sure I didn't break anything in the object and block layouts, but
some extra attention there would be helpful.

I plan on sharing some performance numbers once I'm able to run some nfsometer
workloads. Expect them soon.

-dros

Weston Andros Adamson (17):
nfs: clean up PG_* flags
nfs: remove unused arg from nfs_create_request
nfs: modify pg_test interface to return size_t
nfs: call nfs_can_coalesce_requests for every req
nfs: add support for multiple nfs reqs per page
nfs: page group syncing in read path
nfs: page group syncing in write path
nfs: page group support in nfs_mark_uptodate
pnfs: clean up filelayout_alloc_commit_info
nfs: allow coalescing of subpage requests
nfs: chain calls to pg_test
nfs: use > 1 request to handle bsize < PAGE_SIZE
nfs: remove list of [rw]data from pgio header
pnfs: support multiple verfs per direct req
pnfs: allow non page aligned pnfs layout segments
pnfs: filelayout: support non page aligned layouts
nfs: support page groups in nfs_read_completion

fs/nfs/blocklayout/blocklayout.c | 16 ++-
fs/nfs/direct.c | 93 ++++++++++---
fs/nfs/nfs4filelayout.c | 121 +++++++++--------
fs/nfs/objlayout/objio_osd.c | 20 ++-
fs/nfs/pagelist.c | 284 +++++++++++++++++++++++++++++++++------
fs/nfs/pnfs.c | 77 +++++------
fs/nfs/pnfs.h | 3 +-
fs/nfs/read.c | 149 +++++++-------------
fs/nfs/write.c | 217 +++++++++++++++---------------
include/linux/nfs.h | 5 +-
include/linux/nfs_page.h | 32 +++--
include/linux/nfs_xdr.h | 37 ++---
12 files changed, 650 insertions(+), 404 deletions(-)

--
1.8.5.2 (Apple Git-48)



2014-04-22 21:29:14

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 11/17] nfs: chain calls to pg_test

Now that pg_test can change the size of the request (by returning a non-zero
size smaller than the request), pg_test functions that call other
pg_test functions must return the minimum of the result - or 0 if any fail.

Also clean up the logic of some pg_test functions so that all checks are
for conditions where coalescing is not possible.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 27 ++++++++++++++-------------
fs/nfs/objlayout/objio_osd.c | 12 ++++++++----
fs/nfs/pnfs.c | 15 ++++++++++-----
3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 3b32c95..cfd76bd 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -930,26 +930,27 @@ static size_t
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
+ unsigned int size;
u64 p_stripe, r_stripe;
u32 stripe_unit;

- if (!pnfs_generic_pg_test(pgio, prev, req) ||
- !nfs_generic_pg_test(pgio, prev, req))
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
return 0;

- if (!prev)
- return req->wb_bytes;
+ if (prev) {
+ p_stripe = (u64)req_offset(prev);
+ r_stripe = (u64)req_offset(req);
+ stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;

- p_stripe = (u64)req_offset(prev);
- r_stripe = (u64)req_offset(req);
- stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);

- do_div(p_stripe, stripe_unit);
- do_div(r_stripe, stripe_unit);
-
- if (p_stripe == r_stripe)
- return req->wb_bytes;
- return 0;
+ if (p_stripe != r_stripe)
+ return 0;
+ }
+ return min(size, req->wb_bytes);
}

static void
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c20352a..31de29e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -571,13 +571,17 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req)
{
- if (!pnfs_generic_pg_test(pgio, prev, req))
+ unsigned int size;
+
+ size = pnfs_generic_pg_test(pgio, prev, req);
+
+ if (!size)
return 0;

- if (pgio->pg_count + req->wb_bytes <=
+ if (pgio->pg_count + req->wb_bytes >
(unsigned long)pgio->pg_layout_private)
- return req->wb_bytes;
- return 0;
+ return 0;
+ return min(size, req->wb_bytes);
}

static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6201bf6..7c89385 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1469,8 +1469,12 @@ size_t
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_lseg == NULL)
- return nfs_generic_pg_test(pgio, prev, req);
+ unsigned int size;
+
+ size = nfs_generic_pg_test(pgio, prev, req);
+
+ if (!size)
+ return 0;

/*
* Test if a nfs_page is fully contained in the pnfs_layout_range.
@@ -1486,10 +1490,11 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
* first byte that lies outside the pnfs_layout_range. FIXME?
*
*/
- if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
+ if (req_offset(req) >= end_offset(pgio->pg_lseg->pls_range.offset,
pgio->pg_lseg->pls_range.length))
- return req->wb_bytes;
- return 0;
+ return 0;
+
+ return min(size, req->wb_bytes);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

--
1.8.5.2 (Apple Git-48)


2014-04-23 13:37:09

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 11/17] nfs: chain calls to pg_test

On Apr 23, 2014, at 8:20 AM, Boaz Harrosh <[email protected]> wrote:

> On 04/23/2014 12:29 AM, Weston Andros Adamson wrote:
>> Now that pg_test can change the size of the request (by returning a non-zero
>> size smaller than the request), pg_test functions that call other
>> pg_test functions must return the minimum of the result - or 0 if any fail.
>>
>> Also clean up the logic of some pg_test functions so that all checks are
>> for conditions where coalescing is not possible.
>>
>> Signed-off-by: Weston Andros Adamson <[email protected]>
>> ---
>> fs/nfs/nfs4filelayout.c | 27 ++++++++++++++-------------
>> fs/nfs/objlayout/objio_osd.c | 12 ++++++++----
>> fs/nfs/pnfs.c | 15 ++++++++++-----
>> 3 files changed, 32 insertions(+), 22 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
>> index 3b32c95..cfd76bd 100644
>> --- a/fs/nfs/nfs4filelayout.c
>> +++ b/fs/nfs/nfs4filelayout.c
>> @@ -930,26 +930,27 @@ static size_t
>> filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
>> struct nfs_page *req)
>> {
>> + unsigned int size;
>> u64 p_stripe, r_stripe;
>> u32 stripe_unit;
>>
>> - if (!pnfs_generic_pg_test(pgio, prev, req) ||
>> - !nfs_generic_pg_test(pgio, prev, req))
>> + /* calls nfs_generic_pg_test */
>> + size = pnfs_generic_pg_test(pgio, prev, req);
>> + if (!size)
>> return 0;
>>
>> - if (!prev)
>> - return req->wb_bytes;
>> + if (prev) {
>> + p_stripe = (u64)req_offset(prev);
>> + r_stripe = (u64)req_offset(req);
>> + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
>>
>> - p_stripe = (u64)req_offset(prev);
>> - r_stripe = (u64)req_offset(req);
>> - stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
>> + do_div(p_stripe, stripe_unit);
>> + do_div(r_stripe, stripe_unit);
>>
>> - do_div(p_stripe, stripe_unit);
>> - do_div(r_stripe, stripe_unit);
>> -
>> - if (p_stripe == r_stripe)
>> - return req->wb_bytes;
>> - return 0;
>> + if (p_stripe != r_stripe)
>> + return 0;
>> + }
>> + return min(size, req->wb_bytes);
>> }
>>
>> static void
>> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
>> index c20352a..31de29e 100644
>> --- a/fs/nfs/objlayout/objio_osd.c
>> +++ b/fs/nfs/objlayout/objio_osd.c
>> @@ -571,13 +571,17 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
>> static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
>> struct nfs_page *prev, struct nfs_page *req)
>> {
>> - if (!pnfs_generic_pg_test(pgio, prev, req))
>> + unsigned int size;
>> +
>> + size = pnfs_generic_pg_test(pgio, prev, req);
>> +
>> + if (!size)
>> return 0;
>>
>> - if (pgio->pg_count + req->wb_bytes <=
>> + if (pgio->pg_count + req->wb_bytes >
>> (unsigned long)pgio->pg_layout_private)
>> - return req->wb_bytes;
>> - return 0;
>> + return 0;
>
> objio_osd can enjoy the new facility by returning the
> remainder here:
>
> max_io = (unsigned long)pgio->pg_layout_private;
>
> wb_bytes = min(size, req->wb_bytes);
>
> if (pgio->pg_count + req->wb_bytes > max_io)
> wb_bytes = max_io - pgio->pg_count;
>
> return wb_bytes;
>
> Which reminds me that this code sucks and I need to fix it. I will do
> so after you send your changes.
>
> [I promise to test these guys soon. Can you please put them on a public tree?]

Yes, I'll put something up on linux-nfs.org.

Thanks!
-dros

>
> Thanks
> Boaz
>
>> + return min(size, req->wb_bytes);
>> }
>>
>> static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 6201bf6..7c89385 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1469,8 +1469,12 @@ size_t
>> pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
>> struct nfs_page *req)
>> {
>> - if (pgio->pg_lseg == NULL)
>> - return nfs_generic_pg_test(pgio, prev, req);
>> + unsigned int size;
>> +
>> + size = nfs_generic_pg_test(pgio, prev, req);
>> +
>> + if (!size)
>> + return 0;
>>
>> /*
>> * Test if a nfs_page is fully contained in the pnfs_layout_range.
>> @@ -1486,10 +1490,11 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
>> * first byte that lies outside the pnfs_layout_range. FIXME?
>> *
>> */
>> - if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
>> + if (req_offset(req) >= end_offset(pgio->pg_lseg->pls_range.offset,
>> pgio->pg_lseg->pls_range.length))
>> - return req->wb_bytes;
>> - return 0;
>> + return 0;
>> +
>> + return min(size, req->wb_bytes);
>> }
>> EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);


2014-04-22 21:29:24

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 17/17] nfs: support page groups in nfs_read_completion

nfs_read_completion relied on the fact that there was a 1:1 mapping
of page to nfs_request, but this has now changed.

Regions not covered by a request have already been zeroed elsewhere.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/read.c | 24 +++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c6b7dd0..e3613e2 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -182,7 +182,6 @@ static void nfs_page_group_set_uptodate(struct nfs_page *req)
SetPageUptodate(req->wb_page);
}

-/* Note io was page aligned */
static void nfs_read_completion(struct nfs_pgio_header *hdr)
{
unsigned long bytes = 0;
@@ -192,14 +191,25 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct page *page = req->wb_page;
+ unsigned long start = req->wb_pgbase;
+ unsigned long end = req->wb_pgbase + req->wb_bytes;

if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
- if (bytes > hdr->good_bytes)
- zero_user(page, 0, PAGE_SIZE);
- else if (hdr->good_bytes - bytes < PAGE_SIZE)
- zero_user_segment(page,
- hdr->good_bytes & ~PAGE_MASK,
- PAGE_SIZE);
+ /* note: regions of the page not covered by a
+ * request are zeroed in nfs_readpage_async /
+ * readpage_async_filler */
+ if (bytes > hdr->good_bytes) {
+ /* nothing in this request was good, so zero
+ * the full extent of the request */
+ zero_user_segment(page, start, end);
+
+ } else if (hdr->good_bytes - bytes < req->wb_bytes) {
+ /* part of this request has good bytes, but
+ * not all. zero the bad bytes */
+ start += hdr->good_bytes - bytes;
+ WARN_ON(start < req->wb_pgbase);
+ zero_user_segment(page, start, end);
+ }
}
bytes += req->wb_bytes;
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:20

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 15/17] pnfs: allow non page aligned pnfs layout segments

Remove alignment checks that would revert to MDS and change pg_test
to return the max amount left in the segment (or other pg_test call)
up to size of passed request, or 0 if no space is left.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/pnfs.c | 25 ++++++++++---------------
1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3b3ec46..7bd3bdb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1388,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r

WARN_ON_ONCE(pgio->pg_lseg != NULL);

- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
-
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
else
@@ -1417,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
WARN_ON_ONCE(pgio->pg_lseg != NULL);

- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_write_mds(pgio);
- return;
- }
-
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
@@ -1470,9 +1460,9 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
unsigned int size;
+ u64 end;

size = nfs_generic_pg_test(pgio, prev, req);
-
if (!size)
return 0;

@@ -1490,11 +1480,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
* first byte that lies outside the pnfs_layout_range. FIXME?
*
*/
- if (req_offset(req) >= end_offset(pgio->pg_lseg->pls_range.offset,
- pgio->pg_lseg->pls_range.length))
- return 0;
+ if (pgio->pg_lseg) {
+ end = end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length);
+ WARN_ON_ONCE(req_offset(req) > end);
+ if (req_offset(req) >= end)
+ return 0;
+ size = min((unsigned int)(end - req_offset(req)), size);
+ }

- return min(size, req->wb_bytes);
+ return size;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:09

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 07/17] nfs: page group syncing in write path

Operations that modify state for a whole page must be synchronized across
all requests within a page group. In the write path, this is calling
end_page_writeback and removing the head request from an inode.
Both of these operations should not be called until all requests
in a page group have reached the point where they would call them.

This patch should have no effect yet since all page groups currently
have one request, but will come into play when pg_test functions are
modified to split pages into sub-page regions.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/write.c | 29 ++++++++++++++++++++---------
include/linux/nfs_page.h | 2 ++
2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d1453f2..c6f6449 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -258,12 +258,15 @@ static void nfs_set_page_writeback(struct page *page)
}
}

-static void nfs_end_page_writeback(struct page *page)
+static void nfs_end_page_writeback(struct nfs_page *req)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = page_file_mapping(req->wb_page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);

- end_page_writeback(page);
+ if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+ return;
+
+ end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
@@ -452,12 +455,20 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct inode *inode = req->wb_context->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *head;
+
+ if (!nfs_page_group_sync_on_bit(req, PG_REMOVE))
+ return;
+
+ /* always operate on the *head* of the page group (it's what was
+ referenced in _add_request) */
+ head = req->wb_head;

spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(req->wb_page))) {
- set_page_private(req->wb_page, 0);
- ClearPagePrivate(req->wb_page);
- clear_bit(PG_MAPPED, &req->wb_flags);
+ if (likely(!PageSwapCache(head->wb_page))) {
+ set_page_private(head->wb_page, 0);
+ ClearPagePrivate(head->wb_page);
+ clear_bit(PG_MAPPED, &head->wb_flags);
}
nfsi->npages--;
spin_unlock(&inode->i_lock);
@@ -654,7 +665,7 @@ remove_req:
nfs_inode_remove_request(req);
next:
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
nfs_release_request(req);
}
@@ -1128,7 +1139,7 @@ static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
nfs_release_request(req);
}

diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 56b1f1c..41ce262 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -30,6 +30,8 @@ enum {
PG_TEARDOWN, /* page group sync for destroy */
PG_UNLOCKPAGE, /* page group sync bit in read path */
PG_UPTODATE, /* page group sync bit in read path */
+ PG_WB_END, /* page group sync bit in write path */
+ PG_REMOVE, /* page group sync bit in write path */
};

struct nfs_inode;
--
1.8.5.2 (Apple Git-48)


2014-04-24 15:23:23

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Apr 24, 2014, at 10:50 AM, Jeff Layton <[email protected]> wrote:

> On Tue, 22 Apr 2014 17:29:13 -0400
> Weston Andros Adamson <[email protected]> wrote:
>
>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
>> that all reference the same page. This gives nfs read and write paths
>> the ability to account for sub-page regions independently. This
>> somewhat follows the design of struct buffer_head's sub-page
>> accounting.
>>
>> Only "head" requests are ever added/removed from the inode list in
>> the buffered write path. "head" and "sub" requests are treated the
>> same through the read path and the rest of the write/commit path.
>> Requests are given an extra reference across the life of the list.
>>
>> Page groups are never rejoined after being split. If the read/write
>> request fails and the client falls back to another path (ie revert
>> to MDS in PNFS case), the already split requests are pushed through
>> the recoalescing code again, which may split them further and then
>> coalesce them into properly sized requests on the wire. Fragmentation
>> shouldn't be a problem with the current design, because we flush all
>> requests in page group when a non-contiguous request is added, so
>> the only time resplitting should occur is on a resend of a read or
>> write.
>>
>> This patch lays the groundwork for sub-page splitting, but does not
>> actually do any splitting. For now all page groups have one request
>> as pg_test functions don't yet split pages. There are several related
>> patches that are needed to support multiple requests per page group.
>>
>> Signed-off-by: Weston Andros Adamson <[email protected]>
>> ---
>> fs/nfs/direct.c | 7 +-
>> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
>> fs/nfs/read.c | 4 +-
>> fs/nfs/write.c | 12 ++-
>> include/linux/nfs_page.h | 12 ++-
>> 5 files changed, 231 insertions(+), 22 deletions(-)
>>
>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>> index a0c30c5..9d968ca 100644
>> --- a/fs/nfs/direct.c
>> +++ b/fs/nfs/direct.c
>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
>> struct nfs_page *req;
>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>> /* XXX do we need to do the eof zeroing found in async_filler? */
>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>> pgbase, req_len);
>> if (IS_ERR(req)) {
>> result = PTR_ERR(req);
>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
>> struct nfs_page *req;
>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>
>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>> pgbase, req_len);
>> if (IS_ERR(req)) {
>> result = PTR_ERR(req);
>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>> spin_unlock(&dreq->lock);
>>
>> while (!list_empty(&hdr->pages)) {
>> + bool do_destroy = true;
>> +
>> req = nfs_list_entry(hdr->pages.next);
>> nfs_list_remove_request(req);
>> switch (bit) {
>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>> case NFS_IOHDR_NEED_COMMIT:
>> kref_get(&req->wb_kref);
>> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
>> + do_destroy = false;
>> }
>> nfs_unlock_and_release_request(req);
>> }
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index ac4fb64..8cb8e14 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -26,6 +26,8 @@
>>
>> static struct kmem_cache *nfs_page_cachep;
>>
>> +static void nfs_free_request(struct nfs_page *);
>> +
>> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
>> {
>> p->npages = pagecount;
>> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>> return __nfs_iocounter_wait(c);
>> }
>>
>> +/*
>> + * nfs_page_group_lock - lock the head of the page group
>> + * @req - request in group that is to be locked
>> + *
>> + * this lock must be held if modifying the page group list
>> + */
>> +void
>> +nfs_page_group_lock(struct nfs_page *req)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> + int err = -EAGAIN;
>> +
>> + WARN_ON_ONCE(head != head->wb_head);
>> +
>> + while (err)
>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
>> + nfs_wait_bit_killable, TASK_KILLABLE);
>> +}
>> +
>> +/*
>> + * nfs_page_group_unlock - unlock the head of the page group
>> + * @req - request in group that is to be unlocked
>> + */
>> +void
>> +nfs_page_group_unlock(struct nfs_page *req)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> +
>> + WARN_ON_ONCE(head != head->wb_head);
>> +
>> + smp_mb__before_clear_bit();
>> + clear_bit(PG_HEADLOCK, &head->wb_flags);
>> + smp_mb__after_clear_bit();
>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
>> +}
>> +
>> +/*
>> + * nfs_page_group_sync_on_bit_locked
>> + *
>> + * must be called with page group lock held
>> + */
>> +static bool
>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> + struct nfs_page *tmp;
>> +
>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
>> +
>> + tmp = req->wb_this_page;
>> + while (tmp != req) {
>> + if (!test_bit(bit, &tmp->wb_flags))
>> + return false;
>> + tmp = tmp->wb_this_page;
>> + }
>> +
>> + /* true! reset all bits */
>> + tmp = req;
>> + do {
>> + clear_bit(bit, &tmp->wb_flags);
>> + tmp = tmp->wb_this_page;
>> + } while (tmp != req);
>> +
>> + return true;
>> +}
>> +
>> +/*
>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
>> + * return true if the bit is set for all requests in page group
>> + * @req - request in page group
>> + * @bit - PG_* bit that is used to sync page group
>> + */
>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
>> +{
>> + bool ret;
>> +
>> + nfs_page_group_lock(req);
>> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
>> + nfs_page_group_unlock(req);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * nfs_page_group_init - Initialize the page group linkage for @req
>> + * @req - a new nfs request
>> + * @prev - the previous request in page group, or NULL if @req is the first
>> + * or only request in the group (the head).
>> + */
>> +static inline void
>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
>> +{
>> + WARN_ON_ONCE(prev == req);
>> +
>> + if (!prev) {
>> + req->wb_head = req;
>> + req->wb_this_page = req;
>> + } else {
>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
>> + req->wb_head = prev->wb_head;
>> + req->wb_this_page = prev->wb_this_page;
>> + prev->wb_this_page = req;
>> + }
>> +}
>> +
>> +/*
>> + * nfs_page_group_destroy - sync the destruction of page groups
>> + * @req - request that no longer needs the page group
>> + *
>> + * releases the page group reference from each member once all
>> + * members have called this function.
>> + */
>> +static void
>> +nfs_page_group_destroy(struct kref *kref)
>> +{
>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>> + struct nfs_page *tmp, *next;
>> +
>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
>> + return;
>> +
>> + tmp = req;
>> + do {
>> + next = tmp->wb_this_page;
>> + /* unlink and free */
>> + tmp->wb_this_page = tmp;
>> + tmp->wb_head = tmp;
>> + nfs_free_request(tmp);
>> + tmp = next;
>> + } while (tmp != req);
>> +}
>> +
>> /**
>> * nfs_create_request - Create an NFS read/write request.
>> * @ctx: open context to use
>> * @page: page to write
>> + * @last: last nfs request created for this page group or NULL if head
>> * @offset: starting offset within the page for the write
>> * @count: number of bytes to read/write
>> *
>> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>> */
>> struct nfs_page *
>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>> - unsigned int offset, unsigned int count)
>> + struct nfs_page *last, unsigned int offset,
>> + unsigned int count)
>> {
>> struct nfs_page *req;
>> struct nfs_lock_context *l_ctx;
>> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>> req->wb_bytes = count;
>> req->wb_context = get_nfs_open_context(ctx);
>> kref_init(&req->wb_kref);
>> + nfs_page_group_init(req, last);
>> return req;
>> }
>>
>> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
>> }
>> }
>>
>> -
>> /**
>> * nfs_release_request - Release the count on an NFS read/write request
>> * @req: request to release
>> *
>> * Note: Should never be called with the spinlock held!
>> */
>> -static void nfs_free_request(struct kref *kref)
>> +static void nfs_free_request(struct nfs_page *req)
>> {
>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>> + WARN_ON_ONCE(req->wb_this_page != req);
>> +
>> + /* extra debug: make sure no sync bits are still set */
>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>>
>> /* Release struct file and open context */
>> nfs_clear_request(req);
>> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>>
>> void nfs_release_request(struct nfs_page *req)
>> {
>> - kref_put(&req->wb_kref, nfs_free_request);
>> + kref_put(&req->wb_kref, nfs_page_group_destroy);
>> }
>>
>> static int nfs_wait_bit_uninterruptible(void *word)
>> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>> * @desc: destination io descriptor
>> * @req: request
>> *
>> + * This may split a request into subrequests which are all part of the
>> + * same page group.
>> + *
>> * Returns true if the request 'req' was successfully coalesced into the
>> * existing list of pages 'desc'.
>> */
>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
>> struct nfs_page *req)
>> {
>> - while (!nfs_pageio_do_add_request(desc, req)) {
>> - desc->pg_moreio = 1;
>> - nfs_pageio_doio(desc);
>> - if (desc->pg_error < 0)
>> - return 0;
>> - desc->pg_moreio = 0;
>> - if (desc->pg_recoalesce)
>> - return 0;
>> - }
>> + struct nfs_page *subreq;
>> + unsigned int bytes_left = 0;
>> + unsigned int offset, pgbase;
>> +
>> + nfs_page_group_lock(req);
>> +
>> + subreq = req;
>> + bytes_left = subreq->wb_bytes;
>> + offset = subreq->wb_offset;
>> + pgbase = subreq->wb_pgbase;
>> +
>> + do {
>> + if (!nfs_pageio_do_add_request(desc, subreq)) {
>> + /* make sure pg_test call(s) did nothing */
>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
>> + WARN_ON_ONCE(subreq->wb_offset != offset);
>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
>> +
>> + nfs_page_group_unlock(req);
>> + desc->pg_moreio = 1;
>> + nfs_pageio_doio(desc);
>> + if (desc->pg_error < 0)
>> + return 0;
>> + desc->pg_moreio = 0;
>> + if (desc->pg_recoalesce)
>> + return 0;
>> + /* retry add_request for this subreq */
>> + nfs_page_group_lock(req);
>> + continue;
>> + }
>> +
>> + /* check for buggy pg_test call(s) */
>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
>> + WARN_ON_ONCE(subreq->wb_bytes == 0);
>> +
>> + bytes_left -= subreq->wb_bytes;
>> + offset += subreq->wb_bytes;
>> + pgbase += subreq->wb_bytes;
>> +
>> + if (bytes_left) {
>> + subreq = nfs_create_request(req->wb_context,
>> + req->wb_page,
>> + subreq, pgbase, bytes_left);
>> + nfs_lock_request(subreq);
>> + subreq->wb_offset = offset;
>> + subreq->wb_index = req->wb_index;
>> + }
>> + } while (bytes_left > 0);
>> +
>> + nfs_page_group_unlock(req);
>> return 1;
>> }
>>
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index 95a0855..ee0a3cd 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>> len = nfs_page_length(page);
>> if (len == 0)
>> return nfs_return_empty_page(page);
>> - new = nfs_create_request(ctx, page, 0, len);
>> + new = nfs_create_request(ctx, page, NULL, 0, len);
>> if (IS_ERR(new)) {
>> unlock_page(page);
>> return PTR_ERR(new);
>> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
>> if (len == 0)
>> return nfs_return_empty_page(page);
>>
>> - new = nfs_create_request(desc->ctx, page, 0, len);
>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
>> if (IS_ERR(new))
>> goto out_error;
>>
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index ca20ec7..d1453f2 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
>> }
>> nfsi->npages--;
>> spin_unlock(&inode->i_lock);
>> - nfs_release_request(req);
>> + nfs_release_request(head);
>> }
>>
>> static void
>> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
>> {
>> struct nfs_commit_info cinfo;
>> unsigned long bytes = 0;
>> + bool do_destroy;
>>
>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
>> goto out;
>> @@ -654,6 +655,7 @@ remove_req:
>> next:
>> nfs_unlock_request(req);
>> nfs_end_page_writeback(req->wb_page);
>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
>> nfs_release_request(req);
>> }
>> out:
>> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
>> if (req == NULL)
>> goto out_unlock;
>>
>> + /* should be handled by nfs_flush_incompatible */
>> + WARN_ON_ONCE(req->wb_head != req);
>> + WARN_ON_ONCE(req->wb_this_page != req);
>> +
>> rqend = req->wb_offset + req->wb_bytes;
>> /*
>> * Tell the caller to flush out the request if
>> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
>> req = nfs_try_to_update_request(inode, page, offset, bytes);
>> if (req != NULL)
>> goto out;
>> - req = nfs_create_request(ctx, page, offset, bytes);
>> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
>> if (IS_ERR(req))
>> goto out;
>> nfs_inode_add_request(inode, req);
>> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>> return 0;
>> l_ctx = req->wb_lock_context;
>> do_flush = req->wb_page != page || req->wb_context != ctx;
>> + /* for now, flush if more than 1 request in page_group */
>> + do_flush |= req->wb_this_page != req;
>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
>> do_flush |= l_ctx->lockowner.l_owner != current->files
>> || l_ctx->lockowner.l_pid != current->tgid;
>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>> index 214e098..1fb161b 100644
>> --- a/include/linux/nfs_page.h
>> +++ b/include/linux/nfs_page.h
>> @@ -26,6 +26,8 @@ enum {
>> PG_MAPPED, /* page private set for buffered io */
>> PG_CLEAN, /* write succeeded */
>> PG_COMMIT_TO_DS, /* used by pnfs layouts */
>> + PG_HEADLOCK, /* page group lock of wb_head */
>> + PG_TEARDOWN, /* page group sync for destroy */
>> };
>>
>> struct nfs_inode;
>> @@ -41,6 +43,8 @@ struct nfs_page {
>> struct kref wb_kref; /* reference count */
>> unsigned long wb_flags;
>> struct nfs_write_verifier wb_verf; /* Commit cookie */
>> + struct nfs_page *wb_this_page; /* list of reqs for this page */
>> + struct nfs_page *wb_head; /* head pointer for req list */
>
> Hmm ok, so to make sure I understand...
>
> So page->private will point to the "head" req (struct page_private).

Only in the buffered write case. Page->private is not set for read path / direct i/o path.

> Then we'll have a singly-linked list of reqs hanging off of
> wb_this_page. Is that right?
>
> If so, then it seems like it would be clearer to use a standard
> list_head here. If you need to get to the wb_head, you could always do
> something like this:
>
> list_first_entry(&req->wb_page->wb_this_page);

Well, wb_page is a struct page and doesn't have wb_this_page (which is in struct
nfs_page), but I see where you're going with this.

A strategy like this only works if we always have page->private pointing to the head
request. We chose not to go that way because it messes with the buffered
write path's setting / clearing of page private which interacts with the swappable
nfs pages code that everyone seems to be afraid to touch ;)

So we decided to go this route (not messing with page_private) as a first step - we
certainly could add it later, but the current approach makes things less complex.

>
> ...and could even turn that into a macro or static inline for some
> syntactic sugar. It's a little more pointer chasing to find the head,
> but it seems like that would be clearer than using yet another
> linked-list implementation.

So, I'm not against using list_head.. I didn't go that route initially because I was:

1) following the buffer_head example, which rolls its own list

2) trying to grow nfs_page as little as possible - but we might have room within
the allocator bucket it currently lives in?

3) not sure list_head is suitable for a circular list (I haven't ever looked into it).

and until we have a way to find the head request (via page private, etc) without
walking the circular list (chicken / egg problem needing to grab head lock before walking
list to find the head to lock it), we'll still need the head pointer.

Thoughts?

-dros

>
>> };
>>
>> struct nfs_pageio_descriptor;
>> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
>>
>> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
>> struct page *page,
>> + struct nfs_page *last,
>> unsigned int offset,
>> unsigned int count);
>> -extern void nfs_release_request(struct nfs_page *req);
>> +extern void nfs_release_request(struct nfs_page *);
>>
>>
>> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
>> struct nfs_page *req);
>> extern int nfs_wait_on_request(struct nfs_page *);
>> extern void nfs_unlock_request(struct nfs_page *req);
>> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
>> +extern void nfs_unlock_and_release_request(struct nfs_page *);
>> +extern void nfs_page_group_lock(struct nfs_page *);
>> +extern void nfs_page_group_unlock(struct nfs_page *);
>> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>>
>> /*
>> * Lock the page of an asynchronous request
>
>
> --
> Jeff Layton <[email protected]>


2014-04-22 21:29:00

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 02/17] nfs: remove unused arg from nfs_create_request

@inode is passed but not used.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/direct.c | 6 ++----
fs/nfs/pagelist.c | 4 +---
fs/nfs/read.c | 5 ++---
fs/nfs/write.c | 2 +-
include/linux/nfs_page.h | 1 -
5 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b8797ae..a0c30c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -380,8 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
/* XXX do we need to do the eof zeroing found in async_filler? */
- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i],
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
@@ -750,8 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i],
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2ffebf2..ecd34b7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -136,7 +136,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
/**
* nfs_create_request - Create an NFS read/write request.
* @ctx: open context to use
- * @inode: inode to which the request is attached
* @page: page to write
* @offset: starting offset within the page for the write
* @count: number of bytes to read/write
@@ -146,8 +145,7 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
* User should ensure it is safe to sleep in this function.
*/
struct nfs_page *
-nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page,
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
unsigned int offset, unsigned int count)
{
struct nfs_page *req;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 411aedd..95a0855 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(ctx, inode, page, 0, len);
+ new = nfs_create_request(ctx, page, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
@@ -592,7 +592,6 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *new;
unsigned int len;
int error;
@@ -601,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
if (len == 0)
return nfs_return_empty_page(page);

- new = nfs_create_request(desc->ctx, inode, page, 0, len);
+ new = nfs_create_request(desc->ctx, page, 0, len);
if (IS_ERR(new))
goto out_error;

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index cd7c651..ca20ec7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -819,7 +819,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, inode, page, offset, bytes);
+ req = nfs_create_request(ctx, page, offset, bytes);
if (IS_ERR(req))
goto out;
nfs_inode_add_request(inode, req);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 93c7293..905809d 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -73,7 +73,6 @@ struct nfs_pageio_descriptor {
#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags))

extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
- struct inode *inode,
struct page *page,
unsigned int offset,
unsigned int count);
--
1.8.5.2 (Apple Git-48)


2014-04-24 17:23:54

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Apr 24, 2014, at 12:52 PM, Jeff Layton <[email protected]> wrote:

> On Thu, 24 Apr 2014 12:15:08 -0400
> Weston Andros Adamson <[email protected]> wrote:
>
>> On Apr 24, 2014, at 11:45 AM, Jeff Layton <[email protected]> wrote:
>>
>>> On Thu, 24 Apr 2014 11:23:19 -0400
>>> Weston Andros Adamson <[email protected]> wrote:
>>>
>>>> On Apr 24, 2014, at 10:50 AM, Jeff Layton <[email protected]> wrote:
>>>>
>>>>> On Tue, 22 Apr 2014 17:29:13 -0400
>>>>> Weston Andros Adamson <[email protected]> wrote:
>>>>>
>>>>>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
>>>>>> that all reference the same page. This gives nfs read and write paths
>>>>>> the ability to account for sub-page regions independently. This
>>>>>> somewhat follows the design of struct buffer_head's sub-page
>>>>>> accounting.
>>>>>>
>>>>>> Only "head" requests are ever added/removed from the inode list in
>>>>>> the buffered write path. "head" and "sub" requests are treated the
>>>>>> same through the read path and the rest of the write/commit path.
>>>>>> Requests are given an extra reference across the life of the list.
>>>>>>
>>>>>> Page groups are never rejoined after being split. If the read/write
>>>>>> request fails and the client falls back to another path (ie revert
>>>>>> to MDS in PNFS case), the already split requests are pushed through
>>>>>> the recoalescing code again, which may split them further and then
>>>>>> coalesce them into properly sized requests on the wire. Fragmentation
>>>>>> shouldn't be a problem with the current design, because we flush all
>>>>>> requests in page group when a non-contiguous request is added, so
>>>>>> the only time resplitting should occur is on a resend of a read or
>>>>>> write.
>>>>>>
>>>>>> This patch lays the groundwork for sub-page splitting, but does not
>>>>>> actually do any splitting. For now all page groups have one request
>>>>>> as pg_test functions don't yet split pages. There are several related
>>>>>> patches that are needed support multiple requests per page group.
>>>>>>
>>>>>> Signed-off-by: Weston Andros Adamson <[email protected]>
>>>>>> ---
>>>>>> fs/nfs/direct.c | 7 +-
>>>>>> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
>>>>>> fs/nfs/read.c | 4 +-
>>>>>> fs/nfs/write.c | 12 ++-
>>>>>> include/linux/nfs_page.h | 12 ++-
>>>>>> 5 files changed, 231 insertions(+), 22 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>>>>>> index a0c30c5..9d968ca 100644
>>>>>> --- a/fs/nfs/direct.c
>>>>>> +++ b/fs/nfs/direct.c
>>>>>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
>>>>>> struct nfs_page *req;
>>>>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>>>>> /* XXX do we need to do the eof zeroing found in async_filler? */
>>>>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>>>>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>>>>> pgbase, req_len);
>>>>>> if (IS_ERR(req)) {
>>>>>> result = PTR_ERR(req);
>>>>>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
>>>>>> struct nfs_page *req;
>>>>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>>>>>
>>>>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>>>>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>>>>> pgbase, req_len);
>>>>>> if (IS_ERR(req)) {
>>>>>> result = PTR_ERR(req);
>>>>>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>>>>> spin_unlock(&dreq->lock);
>>>>>>
>>>>>> while (!list_empty(&hdr->pages)) {
>>>>>> + bool do_destroy = true;
>>>>>> +
>>>>>> req = nfs_list_entry(hdr->pages.next);
>>>>>> nfs_list_remove_request(req);
>>>>>> switch (bit) {
>>>>>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>>>>> case NFS_IOHDR_NEED_COMMIT:
>>>>>> kref_get(&req->wb_kref);
>>>>>> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
>>>>>> + do_destroy = false;
>>>>>> }
>>>>>> nfs_unlock_and_release_request(req);
>>>>>> }
>>>>>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>>>>>> index ac4fb64..8cb8e14 100644
>>>>>> --- a/fs/nfs/pagelist.c
>>>>>> +++ b/fs/nfs/pagelist.c
>>>>>> @@ -26,6 +26,8 @@
>>>>>>
>>>>>> static struct kmem_cache *nfs_page_cachep;
>>>>>>
>>>>>> +static void nfs_free_request(struct nfs_page *);
>>>>>> +
>>>>>> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
>>>>>> {
>>>>>> p->npages = pagecount;
>>>>>> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>>>>> return __nfs_iocounter_wait(c);
>>>>>> }
>>>>>>
>>>>>> +/*
>>>>>> + * nfs_page_group_lock - lock the head of the page group
>>>>>> + * @req - request in group that is to be locked
>>>>>> + *
>>>>>> + * this lock must be held if modifying the page group list
>>>>>> + */
>>>>>> +void
>>>>>> +nfs_page_group_lock(struct nfs_page *req)
>>>>>> +{
>>>>>> + struct nfs_page *head = req->wb_head;
>>>>>> + int err = -EAGAIN;
>>>>>> +
>>>>>> + WARN_ON_ONCE(head != head->wb_head);
>>>>>> +
>>>>>> + while (err)
>>>>>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
>>>>>> + nfs_wait_bit_killable, TASK_KILLABLE);
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * nfs_page_group_unlock - unlock the head of the page group
>>>>>> + * @req - request in group that is to be unlocked
>>>>>> + */
>>>>>> +void
>>>>>> +nfs_page_group_unlock(struct nfs_page *req)
>>>>>> +{
>>>>>> + struct nfs_page *head = req->wb_head;
>>>>>> +
>>>>>> + WARN_ON_ONCE(head != head->wb_head);
>>>>>> +
>>>>>> + smp_mb__before_clear_bit();
>>>>>> + clear_bit(PG_HEADLOCK, &head->wb_flags);
>>>>>> + smp_mb__after_clear_bit();
>>>>>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * nfs_page_group_sync_on_bit_locked
>>>>>> + *
>>>>>> + * must be called with page group lock held
>>>>>> + */
>>>>>> +static bool
>>>>>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
>>>>>> +{
>>>>>> + struct nfs_page *head = req->wb_head;
>>>>>> + struct nfs_page *tmp;
>>>>>> +
>>>>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
>>>>>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
>>>>>> +
>>>>>> + tmp = req->wb_this_page;
>>>>>> + while (tmp != req) {
>>>>>> + if (!test_bit(bit, &tmp->wb_flags))
>>>>>> + return false;
>>>>>> + tmp = tmp->wb_this_page;
>>>>>> + }
>>>>>> +
>>>>>> + /* true! reset all bits */
>>>>>> + tmp = req;
>>>>>> + do {
>>>>>> + clear_bit(bit, &tmp->wb_flags);
>>>>>> + tmp = tmp->wb_this_page;
>>>>>> + } while (tmp != req);
>>>>>> +
>>>>>> + return true;
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
>>>>>> + * return true if the bit is set for all requests in page group
>>>>>> + * @req - request in page group
>>>>>> + * @bit - PG_* bit that is used to sync page group
>>>>>> + */
>>>>>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
>>>>>> +{
>>>>>> + bool ret;
>>>>>> +
>>>>>> + nfs_page_group_lock(req);
>>>>>> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
>>>>>> + nfs_page_group_unlock(req);
>>>>>> +
>>>>>> + return ret;
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * nfs_page_group_init - Initialize the page group linkage for @req
>>>>>> + * @req - a new nfs request
>>>>>> + * @prev - the previous request in page group, or NULL if @req is the first
>>>>>> + * or only request in the group (the head).
>>>>>> + */
>>>>>> +static inline void
>>>>>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
>>>>>> +{
>>>>>> + WARN_ON_ONCE(prev == req);
>>>>>> +
>>>>>> + if (!prev) {
>>>>>> + req->wb_head = req;
>>>>>> + req->wb_this_page = req;
>>>>>> + } else {
>>>>>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
>>>>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
>>>>>> + req->wb_head = prev->wb_head;
>>>>>> + req->wb_this_page = prev->wb_this_page;
>>>>>> + prev->wb_this_page = req;
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * nfs_page_group_destroy - sync the destruction of page groups
>>>>>> + * @req - request that no longer needs the page group
>>>>>> + *
>>>>>> + * releases the page group reference from each member once all
>>>>>> + * members have called this function.
>>>>>> + */
>>>>>> +static void
>>>>>> +nfs_page_group_destroy(struct kref *kref)
>>>>>> +{
>>>>>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>>>>> + struct nfs_page *tmp, *next;
>>>>>> +
>>>>>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
>>>>>> + return;
>>>>>> +
>>>>>> + tmp = req;
>>>>>> + do {
>>>>>> + next = tmp->wb_this_page;
>>>>>> + /* unlink and free */
>>>>>> + tmp->wb_this_page = tmp;
>>>>>> + tmp->wb_head = tmp;
>>>>>> + nfs_free_request(tmp);
>>>>>> + tmp = next;
>>>>>> + } while (tmp != req);
>>>>>> +}
>>>>>> +
>>>>>> /**
>>>>>> * nfs_create_request - Create an NFS read/write request.
>>>>>> * @ctx: open context to use
>>>>>> * @page: page to write
>>>>>> + * @last: last nfs request created for this page group or NULL if head
>>>>>> * @offset: starting offset within the page for the write
>>>>>> * @count: number of bytes to read/write
>>>>>> *
>>>>>> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>>>>> */
>>>>>> struct nfs_page *
>>>>>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>>>>> - unsigned int offset, unsigned int count)
>>>>>> + struct nfs_page *last, unsigned int offset,
>>>>>> + unsigned int count)
>>>>>> {
>>>>>> struct nfs_page *req;
>>>>>> struct nfs_lock_context *l_ctx;
>>>>>> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>>>>> req->wb_bytes = count;
>>>>>> req->wb_context = get_nfs_open_context(ctx);
>>>>>> kref_init(&req->wb_kref);
>>>>>> + nfs_page_group_init(req, last);
>>>>>> return req;
>>>>>> }
>>>>>>
>>>>>> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
>>>>>> }
>>>>>> }
>>>>>>
>>>>>> -
>>>>>> /**
>>>>>> * nfs_release_request - Release the count on an NFS read/write request
>>>>>> * @req: request to release
>>>>>> *
>>>>>> * Note: Should never be called with the spinlock held!
>>>>>> */
>>>>>> -static void nfs_free_request(struct kref *kref)
>>>>>> +static void nfs_free_request(struct nfs_page *req)
>>>>>> {
>>>>>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>>>>> + WARN_ON_ONCE(req->wb_this_page != req);
>>>>>> +
>>>>>> + /* extra debug: make sure no sync bits are still set */
>>>>>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
>>>>>> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
>>>>>> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
>>>>>> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
>>>>>> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>>>>>>
>>>>>> /* Release struct file and open context */
>>>>>> nfs_clear_request(req);
>>>>>> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>>>>>>
>>>>>> void nfs_release_request(struct nfs_page *req)
>>>>>> {
>>>>>> - kref_put(&req->wb_kref, nfs_free_request);
>>>>>> + kref_put(&req->wb_kref, nfs_page_group_destroy);
>>>>>> }
>>>>>>
>>>>>> static int nfs_wait_bit_uninterruptible(void *word)
>>>>>> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>>>>>> * @desc: destination io descriptor
>>>>>> * @req: request
>>>>>> *
>>>>>> + * This may split a request into subrequests which are all part of the
>>>>>> + * same page group.
>>>>>> + *
>>>>>> * Returns true if the request 'req' was successfully coalesced into the
>>>>>> * existing list of pages 'desc'.
>>>>>> */
>>>>>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
>>>>>> struct nfs_page *req)
>>>>>> {
>>>>>> - while (!nfs_pageio_do_add_request(desc, req)) {
>>>>>> - desc->pg_moreio = 1;
>>>>>> - nfs_pageio_doio(desc);
>>>>>> - if (desc->pg_error < 0)
>>>>>> - return 0;
>>>>>> - desc->pg_moreio = 0;
>>>>>> - if (desc->pg_recoalesce)
>>>>>> - return 0;
>>>>>> - }
>>>>>> + struct nfs_page *subreq;
>>>>>> + unsigned int bytes_left = 0;
>>>>>> + unsigned int offset, pgbase;
>>>>>> +
>>>>>> + nfs_page_group_lock(req);
>>>>>> +
>>>>>> + subreq = req;
>>>>>> + bytes_left = subreq->wb_bytes;
>>>>>> + offset = subreq->wb_offset;
>>>>>> + pgbase = subreq->wb_pgbase;
>>>>>> +
>>>>>> + do {
>>>>>> + if (!nfs_pageio_do_add_request(desc, subreq)) {
>>>>>> + /* make sure pg_test call(s) did nothing */
>>>>>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
>>>>>> + WARN_ON_ONCE(subreq->wb_offset != offset);
>>>>>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
>>>>>> +
>>>>>> + nfs_page_group_unlock(req);
>>>>>> + desc->pg_moreio = 1;
>>>>>> + nfs_pageio_doio(desc);
>>>>>> + if (desc->pg_error < 0)
>>>>>> + return 0;
>>>>>> + desc->pg_moreio = 0;
>>>>>> + if (desc->pg_recoalesce)
>>>>>> + return 0;
>>>>>> + /* retry add_request for this subreq */
>>>>>> + nfs_page_group_lock(req);
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + /* check for buggy pg_test call(s) */
>>>>>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
>>>>>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
>>>>>> + WARN_ON_ONCE(subreq->wb_bytes == 0);
>>>>>> +
>>>>>> + bytes_left -= subreq->wb_bytes;
>>>>>> + offset += subreq->wb_bytes;
>>>>>> + pgbase += subreq->wb_bytes;
>>>>>> +
>>>>>> + if (bytes_left) {
>>>>>> + subreq = nfs_create_request(req->wb_context,
>>>>>> + req->wb_page,
>>>>>> + subreq, pgbase, bytes_left);
>>>>>> + nfs_lock_request(subreq);
>>>>>> + subreq->wb_offset = offset;
>>>>>> + subreq->wb_index = req->wb_index;
>>>>>> + }
>>>>>> + } while (bytes_left > 0);
>>>>>> +
>>>>>> + nfs_page_group_unlock(req);
>>>>>> return 1;
>>>>>> }
>>>>>>
>>>>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>>>>> index 95a0855..ee0a3cd 100644
>>>>>> --- a/fs/nfs/read.c
>>>>>> +++ b/fs/nfs/read.c
>>>>>> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>>>>> len = nfs_page_length(page);
>>>>>> if (len == 0)
>>>>>> return nfs_return_empty_page(page);
>>>>>> - new = nfs_create_request(ctx, page, 0, len);
>>>>>> + new = nfs_create_request(ctx, page, NULL, 0, len);
>>>>>> if (IS_ERR(new)) {
>>>>>> unlock_page(page);
>>>>>> return PTR_ERR(new);
>>>>>> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
>>>>>> if (len == 0)
>>>>>> return nfs_return_empty_page(page);
>>>>>>
>>>>>> - new = nfs_create_request(desc->ctx, page, 0, len);
>>>>>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
>>>>>> if (IS_ERR(new))
>>>>>> goto out_error;
>>>>>>
>>>>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>>>>> index ca20ec7..d1453f2 100644
>>>>>> --- a/fs/nfs/write.c
>>>>>> +++ b/fs/nfs/write.c
>>>>>> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
>>>>>> }
>>>>>> nfsi->npages--;
>>>>>> spin_unlock(&inode->i_lock);
>>>>>> - nfs_release_request(req);
>>>>>> + nfs_release_request(head);
>>>>>> }
>>>>>>
>>>>>> static void
>>>>>> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
>>>>>> {
>>>>>> struct nfs_commit_info cinfo;
>>>>>> unsigned long bytes = 0;
>>>>>> + bool do_destroy;
>>>>>>
>>>>>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
>>>>>> goto out;
>>>>>> @@ -654,6 +655,7 @@ remove_req:
>>>>>> next:
>>>>>> nfs_unlock_request(req);
>>>>>> nfs_end_page_writeback(req->wb_page);
>>>>>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
>>>>>> nfs_release_request(req);
>>>>>> }
>>>>>> out:
>>>>>> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
>>>>>> if (req == NULL)
>>>>>> goto out_unlock;
>>>>>>
>>>>>> + /* should be handled by nfs_flush_incompatible */
>>>>>> + WARN_ON_ONCE(req->wb_head != req);
>>>>>> + WARN_ON_ONCE(req->wb_this_page != req);
>>>>>> +
>>>>>> rqend = req->wb_offset + req->wb_bytes;
>>>>>> /*
>>>>>> * Tell the caller to flush out the request if
>>>>>> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
>>>>>> req = nfs_try_to_update_request(inode, page, offset, bytes);
>>>>>> if (req != NULL)
>>>>>> goto out;
>>>>>> - req = nfs_create_request(ctx, page, offset, bytes);
>>>>>> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
>>>>>> if (IS_ERR(req))
>>>>>> goto out;
>>>>>> nfs_inode_add_request(inode, req);
>>>>>> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>>>>>> return 0;
>>>>>> l_ctx = req->wb_lock_context;
>>>>>> do_flush = req->wb_page != page || req->wb_context != ctx;
>>>>>> + /* for now, flush if more than 1 request in page_group */
>>>>>> + do_flush |= req->wb_this_page != req;
>>>>>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
>>>>>> do_flush |= l_ctx->lockowner.l_owner != current->files
>>>>>> || l_ctx->lockowner.l_pid != current->tgid;
>>>>>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>>>>>> index 214e098..1fb161b 100644
>>>>>> --- a/include/linux/nfs_page.h
>>>>>> +++ b/include/linux/nfs_page.h
>>>>>> @@ -26,6 +26,8 @@ enum {
>>>>>> PG_MAPPED, /* page private set for buffered io */
>>>>>> PG_CLEAN, /* write succeeded */
>>>>>> PG_COMMIT_TO_DS, /* used by pnfs layouts */
>>>>>> + PG_HEADLOCK, /* page group lock of wb_head */
>>>>>> + PG_TEARDOWN, /* page group sync for destroy */
>>>>>> };
>>>>>>
>>>>>> struct nfs_inode;
>>>>>> @@ -41,6 +43,8 @@ struct nfs_page {
>>>>>> struct kref wb_kref; /* reference count */
>>>>>> unsigned long wb_flags;
>>>>>> struct nfs_write_verifier wb_verf; /* Commit cookie */
>>>>>> + struct nfs_page *wb_this_page; /* list of reqs for this page */
>>>>>> + struct nfs_page *wb_head; /* head pointer for req list */
>>>>>
>>>>> Hmm ok, so to make sure I understand...
>>>>>
>>>>> So page->private will point to the "head" req (struct page_private).
>>>>
>>>> Only in the buffered write case. Page->private is not set for read path / direct i/o path.
>>>>
>>>>> Then we'll have a singly-linked list of reqs hanging off of
>>>>> wb_this_page. Is that right?
>>>>>
>>>>> If so, then it seems like it would be clearer to use a standard
>>>>> list_head here. If you need to get to the wb_head, you could always do
>>>>> something like this:
>>>>>
>>>>> list_first_entry(&req->wb_page->wb_this_page);
>>>>
>>>> Well, wb_page is a struct page and doesn't have wb_this_page (which is in struct
>>>> nfs_page), but I see where you're going with this.
>>>>
>>>
>>> Doh, right! Sorry, I threw that together in haste, but you get the
>>> idea. I was thinking you could go back to the page and dereference
>>> ->private.
>>>
>>>> A strategy like this only works if we always have page->private pointing to the head
>>>> request. We chose not to go that way because it messes with the buffered
>>>> write path's setting / clearing of page private which interacts with the swappable
>>>> nfs pages code that everyone seems to be afraid to touch ;)
>>>>
>>>> So we decided to go this route (not messing with page_private) as a first step - we
>>>> certainly could add it later, but the current approach makes things less complex.
>>>>
>>>
>>> Ok, that makes sense. Thanks...
>>>
>>>>>
>>>>> ...and could even turn that into a macro or static inline for some
>>>>> syntactic sugar. It's a little more pointer chasing to find the head,
>>>>> but it seems like that would be clearer than using yet another
>>>>> linked-list implementation.
>>>>
>>>> So, I'm not against using list_head.. I didn't go that route initially because I was:
>>>>
>>>> 1) following the buffer_head example, which rolls its own list
>>>>
>>>
>>> I wouldn't be surprised if the buffer_head code predates the standard
>>> linked-list macros, so that probably explains why they did it that way.
>>> The file locking code has a similar construct in inode->i_flock list.
>>
>> AFAIK the sub-page functionality was added somewhat recently.
>>
>>>
>>>> 2) trying to grow nfs_page as little as possible - but we might have room within
>>>> the allocator bucket it currently lives in?
>>>>
>>>
>>> nfs_page comes out of a dedicated slabcache, so that probably won't be the case.
>>
>> Ah, right!
>>
>>>
>>>> 3) not sure list_head is suitable for a circular list (I haven't ever looked into it).
>>>>
>>>> and until we have a way to find the head request (via page private, etc) without
>>>> walking the circular list (chicken / egg problem needing to grab head lock before walking
>>>> list to find the head to lock it), we'll still need the head pointer.
>>>>
>>>> Thoughts?
>>>>
>>>> -dros
>>>>
>>>
>>> If you can't rely on page->private pointing to the request, then that
>>> does make it tough to do what I was suggesting. struct list_head lists
>>> are doubly-linked and circular by nature, so that does seem to be a
>>> natural fit for what you're trying to do.
>>
>> Oh I see -- you're totally right about list_head being circular, one just has
>> to call for_each on whatever head they wish to start from.
>>
>>>
>>> The only problem is that struct list_head is two pointers instead of
>>> one, so it's not going to be as space-efficient as what you're doing
>>> here. If that's a large concern then you may have no choice but to do
>>> this after all.
>>
>> Right. How much do we care about an extra pointer here? It seems to me
>> that we should try to keep it as small as possible - I know Trond has been unwilling
>> to add members to rpc_task (for example) unless absolutely necessary and there will
>> be at least one (if not more) nfs_page structures per rpc_task.
>>
>
> Well there are potentially a lot of these structs, so an extra pointer
> in each adds up.

Indeed.

>
> In fact, if only the head req is ever on the per-inode list, then I
> guess the wb_list is unused for sub requests, right? That might be an
> opportunity for space savings too -- you could union wb_head and
> wb_list, and use a wb_flag to indicate which is valid?

There isn't actually an inode list, even though I think I mentioned something like that
recently ;)

The write path uses nfs_inode_(add|remove)_request to:

- hold an extra reference to handle handoff between write list and commit list.
- handle setting / clearing page_private for swappable page semantics
- per inode page counting book keeping.

wb_list is used on sub requests exactly like head requests - for keeping them on
read/write/commit lists for passing through pgio layer.

-dros

>
>> One immediate takeaway: I need to add much better comments about this.
>>
>> As far as eventually removing the wb_head pointer, it gets really ugly to do without
>> changing the buffered write path (and swappable page semantics) because page_group
>> operations happen *after* nfs_inode_remove_request() clears page_private (syncing the
>> destruction of the page group). This means that nfs_release_request and
>> nfs_unlock_and_release_request will both have to be passed a previously cached head
>> pointer. yuck.
>>
>
> Ahh right -- that is tricky then. I'd have to ponder that a bit more...
> --
> Jeff Layton <[email protected]>


2014-04-23 14:17:01

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On 04/22/2014 05:29 PM, Weston Andros Adamson wrote:
> Since the ability to split pages into subpage requests has been added,
> nfs_pgio_header->rpc_list only ever has one wdata/rdata.
>
> Signed-off-by: Weston Andros Adamson <[email protected]>
> ---
> fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
> fs/nfs/read.c | 35 +++++------------------------------
> fs/nfs/write.c | 38 +++++++-------------------------------
> include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
> 4 files changed, 45 insertions(+), 104 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 7c89385..3b3ec46 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
> }
>
> static void
> -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
> +pnfs_do_write(struct nfs_pageio_descriptor *desc,
> + struct nfs_pgio_header *hdr, int how)
> {
> - struct nfs_write_data *data;
> + struct nfs_write_data *data = hdr->data.write;
> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
> struct pnfs_layout_segment *lseg = desc->pg_lseg;
> + enum pnfs_try_status trypnfs;
>
> desc->pg_lseg = NULL;
> - while (!list_empty(head)) {
> - enum pnfs_try_status trypnfs;
> -
> - data = list_first_entry(head, struct nfs_write_data, list);
> - list_del_init(&data->list);
> -
> - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
> - if (trypnfs == PNFS_NOT_ATTEMPTED)
> - pnfs_write_through_mds(desc, data);
> - }
> + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
> + if (trypnfs == PNFS_NOT_ATTEMPTED)
> + pnfs_write_through_mds(desc, data);
> pnfs_put_lseg(lseg);
> }
>
> @@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
> pnfs_put_lseg(desc->pg_lseg);
> desc->pg_lseg = NULL;
> } else
> - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
> + pnfs_do_write(desc, hdr, desc->pg_ioflags);
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> return ret;
> @@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
> }
>
> static void
> -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
> +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
> {
> - struct nfs_read_data *data;
> + struct nfs_read_data *data = hdr->data.read;
> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
> struct pnfs_layout_segment *lseg = desc->pg_lseg;
> + enum pnfs_try_status trypnfs;
>
> desc->pg_lseg = NULL;
> - while (!list_empty(head)) {
> - enum pnfs_try_status trypnfs;
> -
> - data = list_first_entry(head, struct nfs_read_data, list);
> - list_del_init(&data->list);
> -
> - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
> - if (trypnfs == PNFS_NOT_ATTEMPTED)
> - pnfs_read_through_mds(desc, data);
> - }
> + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
> + if (trypnfs == PNFS_NOT_ATTEMPTED)
> + pnfs_read_through_mds(desc, data);
> pnfs_put_lseg(lseg);
> }
>
> @@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
> pnfs_put_lseg(desc->pg_lseg);
> desc->pg_lseg = NULL;
> } else
> - pnfs_do_multiple_reads(desc, &hdr->rpc_list);
> + pnfs_do_read(desc, hdr);
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> return ret;
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index daeff0c..c6b7dd0 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
> struct nfs_pgio_header *hdr = &rhdr->header;
>
> INIT_LIST_HEAD(&hdr->pages);
> - INIT_LIST_HEAD(&hdr->rpc_list);
> spin_lock_init(&hdr->lock);
> atomic_set(&hdr->refcnt, 0);
> }
> @@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
> return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
> }
>
> -static int
> -nfs_do_multiple_reads(struct list_head *head,
> - const struct rpc_call_ops *call_ops)
> -{
> - struct nfs_read_data *data;
> - int ret = 0;
> -
> - while (!list_empty(head)) {
> - int ret2;
> -
> - data = list_first_entry(head, struct nfs_read_data, list);
> - list_del_init(&data->list);
> -
> - ret2 = nfs_do_read(data, call_ops);
> - if (ret == 0)
> - ret = ret2;
> - }
> - return ret;
> -}
> -
> static void
> nfs_async_read_error(struct list_head *head)
> {
> @@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
> struct nfs_pgio_header *hdr)
> {
> set_bit(NFS_IOHDR_REDO, &hdr->flags);
> - while (!list_empty(&hdr->rpc_list)) {
> - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
> - struct nfs_read_data, list);
> - list_del(&data->list);
> - nfs_readdata_release(data);
> - }
> + nfs_readdata_release(hdr->data.read);
> + hdr->data.read = NULL;
> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
> }
>
> @@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
> }
>
> nfs_read_rpcsetup(data, desc->pg_count, 0);
> - list_add(&data->list, &hdr->rpc_list);
> + WARN_ON_ONCE(hdr->data.read);
> + hdr->data.read = data;
> desc->pg_rpc_callops = &nfs_read_common_ops;
> return 0;
> }
> @@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
> atomic_inc(&hdr->refcnt);
> ret = nfs_generic_pagein(desc, hdr);
> if (ret == 0)
> - ret = nfs_do_multiple_reads(&hdr->rpc_list,
> - desc->pg_rpc_callops);
> + ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> return ret;
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index f40db93..cd24a14 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
>
> memset(p, 0, sizeof(*p));
> INIT_LIST_HEAD(&hdr->pages);
> - INIT_LIST_HEAD(&hdr->rpc_list);
> spin_lock_init(&hdr->lock);
> atomic_set(&hdr->refcnt, 0);
> hdr->verf = &p->verf;
> @@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
> return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
> }
>
> -static int nfs_do_multiple_writes(struct list_head *head,
> - const struct rpc_call_ops *call_ops,
> - int how)
> -{
> - struct nfs_write_data *data;
> - int ret = 0;
> -
> - while (!list_empty(head)) {
> - int ret2;
> -
> - data = list_first_entry(head, struct nfs_write_data, list);
> - list_del_init(&data->list);
> -
> - ret2 = nfs_do_write(data, call_ops, how);
> - if (ret == 0)
> - ret = ret2;
> - }
> - return ret;
> -}
> -
> /* If a nfs_flush_* function fails, it should remove reqs from @head and
> * call this on each, which will prepare them to be retried on next
> * writeback using standard nfs.
> @@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
> struct nfs_pgio_header *hdr)
> {
> set_bit(NFS_IOHDR_REDO, &hdr->flags);
> - while (!list_empty(&hdr->rpc_list)) {
> - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
> - struct nfs_write_data, list);
> - list_del(&data->list);
> - nfs_writedata_release(data);
> - }
> + nfs_writedata_release(hdr->data.write);
> + hdr->data.write = NULL;
> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
> }
>
> @@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>
> /* Set up the argument struct */
> nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
> - list_add(&data->list, &hdr->rpc_list);
> + WARN_ON_ONCE(hdr->data.write);
> + hdr->data.write = data;
> desc->pg_rpc_callops = &nfs_write_common_ops;
> return 0;
> }
> @@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
> atomic_inc(&hdr->refcnt);
> ret = nfs_generic_flush(desc, hdr);
> if (ret == 0)
> - ret = nfs_do_multiple_writes(&hdr->rpc_list,
> - desc->pg_rpc_callops,
> - desc->pg_ioflags);
> + ret = nfs_do_write(hdr->data.write,
> + desc->pg_rpc_callops,
> + desc->pg_ioflags);
> if (atomic_dec_and_test(&hdr->refcnt))
> hdr->completion_ops->completion(hdr);
> return ret;
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 6fb5b23..239274d 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1266,7 +1266,6 @@ struct nfs_page_array {
>
> struct nfs_read_data {
> struct nfs_pgio_header *header;
> - struct list_head list;
> struct rpc_task task;
> struct nfs_fattr fattr; /* fattr storage */
> struct nfs_readargs args;
> @@ -1278,6 +1277,20 @@ struct nfs_read_data {
> struct nfs_client *ds_clp; /* pNFS data server */
> };
>
> +struct nfs_write_data {
> + struct nfs_pgio_header *header;
> + struct rpc_task task;
> + struct nfs_fattr fattr;
> + struct nfs_writeverf verf;
> + struct nfs_writeargs args; /* argument struct */
> + struct nfs_writeres res; /* result struct */
> + unsigned long timestamp; /* For lease renewal */
> + int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
> + __u64 mds_offset; /* Filelayout dense stripe */
> + struct nfs_page_array pages;
> + struct nfs_client *ds_clp; /* pNFS data server */
> +};
> +
> /* used as flag bits in nfs_pgio_header */
> enum {
> NFS_IOHDR_ERROR = 0,
> @@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
> struct inode *inode;
> struct rpc_cred *cred;
> struct list_head pages;
> - struct list_head rpc_list;
> + union {
> + struct nfs_read_data *read;
> + struct nfs_write_data *write;
> + } data;

The first 5 patches in my series makes it so we can share all of these structs. Would it be useful to put those in first?

Anna

> atomic_t refcnt;
> struct nfs_page *req;
> struct nfs_writeverf *verf;
> @@ -1315,21 +1331,6 @@ struct nfs_read_header {
> struct nfs_read_data rpc_data;
> };
>
> -struct nfs_write_data {
> - struct nfs_pgio_header *header;
> - struct list_head list;
> - struct rpc_task task;
> - struct nfs_fattr fattr;
> - struct nfs_writeverf verf;
> - struct nfs_writeargs args; /* argument struct */
> - struct nfs_writeres res; /* result struct */
> - unsigned long timestamp; /* For lease renewal */
> - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
> - __u64 mds_offset; /* Filelayout dense stripe */
> - struct nfs_page_array pages;
> - struct nfs_client *ds_clp; /* pNFS data server */
> -};
> -
> struct nfs_write_header {
> struct nfs_pgio_header header;
> struct nfs_write_data rpc_data;


2014-04-22 21:29:06

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 06/17] nfs: page group syncing in read path

Operations that modify state for a whole page must be syncronized across
all requests within a page group. In the read path, this is calling
unlock_page and SetPageUptodate. Both of these functions should not be
called until all requests in a page group have reached the point where
they would call them.

This patch should have no effect yet since all page groups currently
have one request, but will come into play when pg_test functions are
modified to split pages into sub-page regions.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/read.c | 22 +++++++++++++++++-----
include/linux/nfs_page.h | 2 ++
2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ee0a3cd..c774810 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -158,10 +158,16 @@ static void nfs_readpage_release(struct nfs_page *req)
{
struct inode *d_inode = req->wb_context->dentry->d_inode;

- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes,
+ (long long)req_offset(req));

- unlock_page(req->wb_page);
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }

dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -171,6 +177,12 @@ static void nfs_readpage_release(struct nfs_page *req)
nfs_release_request(req);
}

+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+ SetPageUptodate(req->wb_page);
+}
+
/* Note io was page aligned */
static void nfs_read_completion(struct nfs_pgio_header *hdr)
{
@@ -193,9 +205,9 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
bytes += req->wb_bytes;
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (bytes <= hdr->good_bytes)
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
} else
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
nfs_list_remove_request(req);
nfs_readpage_release(req);
}
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 1fb161b..56b1f1c 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -28,6 +28,8 @@ enum {
PG_COMMIT_TO_DS, /* used by pnfs layouts */
PG_HEADLOCK, /* page group lock of wb_head */
PG_TEARDOWN, /* page group sync for destroy */
+ PG_UNLOCKPAGE, /* page group sync bit in read path */
+ PG_UPTODATE, /* page group sync bit in read path */
};

struct nfs_inode;
--
1.8.5.2 (Apple Git-48)


2014-04-23 17:51:52

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On 04/23/2014 01:44 PM, Weston Andros Adamson wrote:
> On Apr 23, 2014, at 10:36 AM, Anna Schumaker <[email protected]> wrote:
>
>> On 04/23/2014 10:31 AM, Weston Andros Adamson wrote:
>>> On Apr 23, 2014, at 10:16 AM, Anna Schumaker <[email protected]> wrote:
>>>
>>>> On 04/22/2014 05:29 PM, Weston Andros Adamson wrote:
>>>>> Since the ability to split pages into subpage requests has been added,
>>>>> nfs_pgio_header->rpc_list only ever has one wdata/rdata.
>>>>>
>>>>> Signed-off-by: Weston Andros Adamson <[email protected]>
>>>>> ---
>>>>> fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
>>>>> fs/nfs/read.c | 35 +++++------------------------------
>>>>> fs/nfs/write.c | 38 +++++++-------------------------------
>>>>> include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
>>>>> 4 files changed, 45 insertions(+), 104 deletions(-)
>>>>>
>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>> index 7c89385..3b3ec46 100644
>>>>> --- a/fs/nfs/pnfs.c
>>>>> +++ b/fs/nfs/pnfs.c
>>>>> @@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>>>>> }
>>>>>
>>>>> static void
>>>>> -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
>>>>> +pnfs_do_write(struct nfs_pageio_descriptor *desc,
>>>>> + struct nfs_pgio_header *hdr, int how)
>>>>> {
>>>>> - struct nfs_write_data *data;
>>>>> + struct nfs_write_data *data = hdr->data.write;
>>>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>>>> + enum pnfs_try_status trypnfs;
>>>>>
>>>>> desc->pg_lseg = NULL;
>>>>> - while (!list_empty(head)) {
>>>>> - enum pnfs_try_status trypnfs;
>>>>> -
>>>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>>>> - list_del_init(&data->list);
>>>>> -
>>>>> - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>>> - pnfs_write_through_mds(desc, data);
>>>>> - }
>>>>> + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>>> + pnfs_write_through_mds(desc, data);
>>>>> pnfs_put_lseg(lseg);
>>>>> }
>>>>>
>>>>> @@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>>>> pnfs_put_lseg(desc->pg_lseg);
>>>>> desc->pg_lseg = NULL;
>>>>> } else
>>>>> - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
>>>>> + pnfs_do_write(desc, hdr, desc->pg_ioflags);
>>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>>> hdr->completion_ops->completion(hdr);
>>>>> return ret;
>>>>> @@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
>>>>> }
>>>>>
>>>>> static void
>>>>> -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
>>>>> +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
>>>>> {
>>>>> - struct nfs_read_data *data;
>>>>> + struct nfs_read_data *data = hdr->data.read;
>>>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>>>> + enum pnfs_try_status trypnfs;
>>>>>
>>>>> desc->pg_lseg = NULL;
>>>>> - while (!list_empty(head)) {
>>>>> - enum pnfs_try_status trypnfs;
>>>>> -
>>>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>>>> - list_del_init(&data->list);
>>>>> -
>>>>> - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>>> - pnfs_read_through_mds(desc, data);
>>>>> - }
>>>>> + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>>> + pnfs_read_through_mds(desc, data);
>>>>> pnfs_put_lseg(lseg);
>>>>> }
>>>>>
>>>>> @@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>>>> pnfs_put_lseg(desc->pg_lseg);
>>>>> desc->pg_lseg = NULL;
>>>>> } else
>>>>> - pnfs_do_multiple_reads(desc, &hdr->rpc_list);
>>>>> + pnfs_do_read(desc, hdr);
>>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>>> hdr->completion_ops->completion(hdr);
>>>>> return ret;
>>>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>>>> index daeff0c..c6b7dd0 100644
>>>>> --- a/fs/nfs/read.c
>>>>> +++ b/fs/nfs/read.c
>>>>> @@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
>>>>> struct nfs_pgio_header *hdr = &rhdr->header;
>>>>>
>>>>> INIT_LIST_HEAD(&hdr->pages);
>>>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>>>> spin_lock_init(&hdr->lock);
>>>>> atomic_set(&hdr->refcnt, 0);
>>>>> }
>>>>> @@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
>>>>> return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
>>>>> }
>>>>>
>>>>> -static int
>>>>> -nfs_do_multiple_reads(struct list_head *head,
>>>>> - const struct rpc_call_ops *call_ops)
>>>>> -{
>>>>> - struct nfs_read_data *data;
>>>>> - int ret = 0;
>>>>> -
>>>>> - while (!list_empty(head)) {
>>>>> - int ret2;
>>>>> -
>>>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>>>> - list_del_init(&data->list);
>>>>> -
>>>>> - ret2 = nfs_do_read(data, call_ops);
>>>>> - if (ret == 0)
>>>>> - ret = ret2;
>>>>> - }
>>>>> - return ret;
>>>>> -}
>>>>> -
>>>>> static void
>>>>> nfs_async_read_error(struct list_head *head)
>>>>> {
>>>>> @@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
>>>>> struct nfs_pgio_header *hdr)
>>>>> {
>>>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>>>> - while (!list_empty(&hdr->rpc_list)) {
>>>>> - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
>>>>> - struct nfs_read_data, list);
>>>>> - list_del(&data->list);
>>>>> - nfs_readdata_release(data);
>>>>> - }
>>>>> + nfs_readdata_release(hdr->data.read);
>>>>> + hdr->data.read = NULL;
>>>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>>>> }
>>>>>
>>>>> @@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
>>>>> }
>>>>>
>>>>> nfs_read_rpcsetup(data, desc->pg_count, 0);
>>>>> - list_add(&data->list, &hdr->rpc_list);
>>>>> + WARN_ON_ONCE(hdr->data.read);
>>>>> + hdr->data.read = data;
>>>>> desc->pg_rpc_callops = &nfs_read_common_ops;
>>>>> return 0;
>>>>> }
>>>>> @@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>>>> atomic_inc(&hdr->refcnt);
>>>>> ret = nfs_generic_pagein(desc, hdr);
>>>>> if (ret == 0)
>>>>> - ret = nfs_do_multiple_reads(&hdr->rpc_list,
>>>>> - desc->pg_rpc_callops);
>>>>> + ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
>>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>>> hdr->completion_ops->completion(hdr);
>>>>> return ret;
>>>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>>>> index f40db93..cd24a14 100644
>>>>> --- a/fs/nfs/write.c
>>>>> +++ b/fs/nfs/write.c
>>>>> @@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
>>>>>
>>>>> memset(p, 0, sizeof(*p));
>>>>> INIT_LIST_HEAD(&hdr->pages);
>>>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>>>> spin_lock_init(&hdr->lock);
>>>>> atomic_set(&hdr->refcnt, 0);
>>>>> hdr->verf = &p->verf;
>>>>> @@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
>>>>> return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
>>>>> }
>>>>>
>>>>> -static int nfs_do_multiple_writes(struct list_head *head,
>>>>> - const struct rpc_call_ops *call_ops,
>>>>> - int how)
>>>>> -{
>>>>> - struct nfs_write_data *data;
>>>>> - int ret = 0;
>>>>> -
>>>>> - while (!list_empty(head)) {
>>>>> - int ret2;
>>>>> -
>>>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>>>> - list_del_init(&data->list);
>>>>> -
>>>>> - ret2 = nfs_do_write(data, call_ops, how);
>>>>> - if (ret == 0)
>>>>> - ret = ret2;
>>>>> - }
>>>>> - return ret;
>>>>> -}
>>>>> -
>>>>> /* If a nfs_flush_* function fails, it should remove reqs from @head and
>>>>> * call this on each, which will prepare them to be retried on next
>>>>> * writeback using standard nfs.
>>>>> @@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
>>>>> struct nfs_pgio_header *hdr)
>>>>> {
>>>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>>>> - while (!list_empty(&hdr->rpc_list)) {
>>>>> - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
>>>>> - struct nfs_write_data, list);
>>>>> - list_del(&data->list);
>>>>> - nfs_writedata_release(data);
>>>>> - }
>>>>> + nfs_writedata_release(hdr->data.write);
>>>>> + hdr->data.write = NULL;
>>>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>>>> }
>>>>>
>>>>> @@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>>>>
>>>>> /* Set up the argument struct */
>>>>> nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
>>>>> - list_add(&data->list, &hdr->rpc_list);
>>>>> + WARN_ON_ONCE(hdr->data.write);
>>>>> + hdr->data.write = data;
>>>>> desc->pg_rpc_callops = &nfs_write_common_ops;
>>>>> return 0;
>>>>> }
>>>>> @@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>>>> atomic_inc(&hdr->refcnt);
>>>>> ret = nfs_generic_flush(desc, hdr);
>>>>> if (ret == 0)
>>>>> - ret = nfs_do_multiple_writes(&hdr->rpc_list,
>>>>> - desc->pg_rpc_callops,
>>>>> - desc->pg_ioflags);
>>>>> + ret = nfs_do_write(hdr->data.write,
>>>>> + desc->pg_rpc_callops,
>>>>> + desc->pg_ioflags);
>>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>>> hdr->completion_ops->completion(hdr);
>>>>> return ret;
>>>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>>>>> index 6fb5b23..239274d 100644
>>>>> --- a/include/linux/nfs_xdr.h
>>>>> +++ b/include/linux/nfs_xdr.h
>>>>> @@ -1266,7 +1266,6 @@ struct nfs_page_array {
>>>>>
>>>>> struct nfs_read_data {
>>>>> struct nfs_pgio_header *header;
>>>>> - struct list_head list;
>>>>> struct rpc_task task;
>>>>> struct nfs_fattr fattr; /* fattr storage */
>>>>> struct nfs_readargs args;
>>>>> @@ -1278,6 +1277,20 @@ struct nfs_read_data {
>>>>> struct nfs_client *ds_clp; /* pNFS data server */
>>>>> };
>>>>>
>>>>> +struct nfs_write_data {
>>>>> + struct nfs_pgio_header *header;
>>>>> + struct rpc_task task;
>>>>> + struct nfs_fattr fattr;
>>>>> + struct nfs_writeverf verf;
>>>>> + struct nfs_writeargs args; /* argument struct */
>>>>> + struct nfs_writeres res; /* result struct */
>>>>> + unsigned long timestamp; /* For lease renewal */
>>>>> + int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
>>>>> + __u64 mds_offset; /* Filelayout dense stripe */
>>>>> + struct nfs_page_array pages;
>>>>> + struct nfs_client *ds_clp; /* pNFS data server */
>>>>> +};
>>>>> +
>>>>> /* used as flag bits in nfs_pgio_header */
>>>>> enum {
>>>>> NFS_IOHDR_ERROR = 0,
>>>>> @@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
>>>>> struct inode *inode;
>>>>> struct rpc_cred *cred;
>>>>> struct list_head pages;
>>>>> - struct list_head rpc_list;
>>>>> + union {
>>>>> + struct nfs_read_data *read;
>>>>> + struct nfs_write_data *write;
>>>>> + } data;
>>>> The first 5 patches in my series makes it so we can share all of these structs. Would it be useful to put those in first?
>>>>
>>>> Anna
>>>>
>>> Yes, I think it makes sense to stage most (if not all) of your patches first then merge my patches in.
>>>
>>> I think I'll just give it a shot and see how bad it is. I need to post a rebased version of my patchset anyway,
>>> so I'll see if I can also rebase on top of your changes.
>>>
>>> Any objections?
>> No objections! As a reminder, I'm based off of Trond's [testing] branch with two extra pageio cleanups from Christoph. Shoot me an email if you need help!
>>
>> Anna
> Great news - the merge was pretty easy!
>
> I ended up merging by hand - doing “git am --3way” on each patch so I could ensure
> that they each build cleanly. When there were conflicts, I was able to compare the
> old patch to the newly rebased patch to make sure I didn’t miss anything.
>
> This exercise also helped me find a few problems with my patchset ;)
>
> Now it’s time to test! I’ll share my branch on a public repo and repost my patchset
> soon.

Great! I'm glad it went smoothly!


>
> -dros
>
>>>>> atomic_t refcnt;
>>>>> struct nfs_page *req;
>>>>> struct nfs_writeverf *verf;
>>>>> @@ -1315,21 +1331,6 @@ struct nfs_read_header {
>>>>> struct nfs_read_data rpc_data;
>>>>> };
>>>>>
>>>>> -struct nfs_write_data {
>>>>> - struct nfs_pgio_header *header;
>>>>> - struct list_head list;
>>>>> - struct rpc_task task;
>>>>> - struct nfs_fattr fattr;
>>>>> - struct nfs_writeverf verf;
>>>>> - struct nfs_writeargs args; /* argument struct */
>>>>> - struct nfs_writeres res; /* result struct */
>>>>> - unsigned long timestamp; /* For lease renewal */
>>>>> - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
>>>>> - __u64 mds_offset; /* Filelayout dense stripe */
>>>>> - struct nfs_page_array pages;
>>>>> - struct nfs_client *ds_clp; /* pNFS data server */
>>>>> -};
>>>>> -
>>>>> struct nfs_write_header {
>>>>> struct nfs_pgio_header header;
>>>>> struct nfs_write_data rpc_data;


2014-04-22 21:29:10

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 08/17] nfs: page group support in nfs_mark_uptodate

Change how nfs_mark_uptodate checks to see if writes cover a whole page.

This patch should have no effect yet since all page groups currently
have one request, but will come into play when pg_test functions are
modified to split pages into sub-page regions.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/write.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 67 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c6f6449..43892e0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -211,18 +211,78 @@ static void nfs_set_pageerror(struct page *page)
nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
}

+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+ struct nfs_page *req;
+
+ WARN_ON_ONCE(head != head->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
+
+ req = head;
+ do {
+ if (page_offset >= req->wb_pgbase &&
+ page_offset < (req->wb_pgbase + req->wb_bytes))
+ return req;
+
+ req = req->wb_this_page;
+ } while (req != head);
+
+ return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @head - head request of page group
+ *
+ * Return true if the page group with head @head covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+ unsigned int pos = 0;
+ unsigned int len = nfs_page_length(req->wb_page);
+
+ nfs_page_group_lock(req);
+
+ do {
+ tmp = nfs_page_group_search_locked(req->wb_head, pos);
+ if (tmp) {
+ /* no way this should happen */
+ WARN_ON_ONCE(tmp->wb_pgbase != pos);
+ pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
+ }
+ } while (tmp && pos < len);
+
+ nfs_page_group_unlock(req);
+ WARN_ON_ONCE(pos > len);
+ return pos == len;
+}
+
/* We can set the PG_uptodate flag if we see that a write request
* covers the full page.
*/
-static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
+static void nfs_mark_uptodate(struct nfs_page *req)
{
- if (PageUptodate(page))
- return;
- if (base != 0)
+ if (PageUptodate(req->wb_page))
return;
- if (count != nfs_page_length(page))
+ if (!nfs_page_group_covers_page(req))
return;
- SetPageUptodate(page);
+ SetPageUptodate(req->wb_page);
}

static int wb_priority(struct writeback_control *wbc)
@@ -854,7 +914,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
return PTR_ERR(req);
/* Update file length */
nfs_grow_file(page, offset, count);
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_mark_uptodate(req);
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
return 0;
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:40:08

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

Oh boy, I posted this with a "cleanup" of page group reference counting,
but this doesn't work with certain file layout stripe sizes :-/

I'll post the older, clunky version (that works) tomorrow if I can't figure this out quickly.

-dros



On Apr 22, 2014, at 5:29 PM, Weston Andros Adamson <[email protected]> wrote:

> Add "page groups" - a circular list of nfs requests (struct nfs_page)
> that all reference the same page. This gives nfs read and write paths
> the ability to account for sub-page regions independently. This
> somewhat follows the design of struct buffer_head's sub-page
> accounting.
>
> Only "head" requests are ever added/removed from the inode list in
> the buffered write path. "head" and "sub" requests are treated the
> same through the read path and the rest of the write/commit path.
> Requests are given an extra reference across the life of the list.
>
> Page groups are never rejoined after being split. If the read/write
> request fails and the client falls back to another path (ie revert
> to MDS in PNFS case), the already split requests are pushed through
> the recoalescing code again, which may split them further and then
> coalesce them into properly sized requests on the wire. Fragmentation
> shouldn't be a problem with the current design, because we flush all
> requests in page group when a non-contiguous request is added, so
> the only time resplitting should occur is on a resend of a read or
> write.
>
> This patch lays the groundwork for sub-page splitting, but does not
> actually do any splitting. For now all page groups have one request
> as pg_test functions don't yet split pages. There are several related
> patches that are needed to support multiple requests per page group.
>
> Signed-off-by: Weston Andros Adamson <[email protected]>
> ---
> fs/nfs/direct.c | 7 +-
> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
> fs/nfs/read.c | 4 +-
> fs/nfs/write.c | 12 ++-
> include/linux/nfs_page.h | 12 ++-
> 5 files changed, 231 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index a0c30c5..9d968ca 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
> struct nfs_page *req;
> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> /* XXX do we need to do the eof zeroing found in async_filler? */
> - req = nfs_create_request(dreq->ctx, pagevec[i],
> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> pgbase, req_len);
> if (IS_ERR(req)) {
> result = PTR_ERR(req);
> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
> struct nfs_page *req;
> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>
> - req = nfs_create_request(dreq->ctx, pagevec[i],
> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> pgbase, req_len);
> if (IS_ERR(req)) {
> result = PTR_ERR(req);
> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> spin_unlock(&dreq->lock);
>
> while (!list_empty(&hdr->pages)) {
> + bool do_destroy = true;
> +
> req = nfs_list_entry(hdr->pages.next);
> nfs_list_remove_request(req);
> switch (bit) {
> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> case NFS_IOHDR_NEED_COMMIT:
> kref_get(&req->wb_kref);
> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
> + do_destroy = false;
> }
> nfs_unlock_and_release_request(req);
> }
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index ac4fb64..8cb8e14 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -26,6 +26,8 @@
>
> static struct kmem_cache *nfs_page_cachep;
>
> +static void nfs_free_request(struct nfs_page *);
> +
> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
> {
> p->npages = pagecount;
> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> return __nfs_iocounter_wait(c);
> }
>
> +/*
> + * nfs_page_group_lock - lock the head of the page group
> + * @req - request in group that is to be locked
> + *
> + * this lock must be held if modifying the page group list
> + */
> +void
> +nfs_page_group_lock(struct nfs_page *req)
> +{
> + struct nfs_page *head = req->wb_head;
> + int err = -EAGAIN;
> +
> + WARN_ON_ONCE(head != head->wb_head);
> +
> + while (err)
> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
> + nfs_wait_bit_killable, TASK_KILLABLE);
> +}
> +
> +/*
> + * nfs_page_group_unlock - unlock the head of the page group
> + * @req - request in group that is to be unlocked
> + */
> +void
> +nfs_page_group_unlock(struct nfs_page *req)
> +{
> + struct nfs_page *head = req->wb_head;
> +
> + WARN_ON_ONCE(head != head->wb_head);
> +
> + smp_mb__before_clear_bit();
> + clear_bit(PG_HEADLOCK, &head->wb_flags);
> + smp_mb__after_clear_bit();
> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
> +}
> +
> +/*
> + * nfs_page_group_sync_on_bit_locked
> + *
> + * must be called with page group lock held
> + */
> +static bool
> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> +{
> + struct nfs_page *head = req->wb_head;
> + struct nfs_page *tmp;
> +
> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
> +
> + tmp = req->wb_this_page;
> + while (tmp != req) {
> + if (!test_bit(bit, &tmp->wb_flags))
> + return false;
> + tmp = tmp->wb_this_page;
> + }
> +
> + /* true! reset all bits */
> + tmp = req;
> + do {
> + clear_bit(bit, &tmp->wb_flags);
> + tmp = tmp->wb_this_page;
> + } while (tmp != req);
> +
> + return true;
> +}
> +
> +/*
> + * nfs_page_group_sync_on_bit - set bit on current request, but only
> + * return true if the bit is set for all requests in page group
> + * @req - request in page group
> + * @bit - PG_* bit that is used to sync page group
> + */
> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
> +{
> + bool ret;
> +
> + nfs_page_group_lock(req);
> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
> + nfs_page_group_unlock(req);
> +
> + return ret;
> +}
> +
> +/*
> + * nfs_page_group_init - Initialize the page group linkage for @req
> + * @req - a new nfs request
> + * @prev - the previous request in page group, or NULL if @req is the first
> + * or only request in the group (the head).
> + */
> +static inline void
> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
> +{
> + WARN_ON_ONCE(prev == req);
> +
> + if (!prev) {
> + req->wb_head = req;
> + req->wb_this_page = req;
> + } else {
> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
> + req->wb_head = prev->wb_head;
> + req->wb_this_page = prev->wb_this_page;
> + prev->wb_this_page = req;
> + }
> +}
> +
> +/*
> + * nfs_page_group_destroy - sync the destruction of page groups
> + * @req - request that no longer needs the page group
> + *
> + * releases the page group reference from each member once all
> + * members have called this function.
> + */
> +static void
> +nfs_page_group_destroy(struct kref *kref)
> +{
> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> + struct nfs_page *tmp, *next;
> +
> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
> + return;
> +
> + tmp = req;
> + do {
> + next = tmp->wb_this_page;
> + /* unlink and free */
> + tmp->wb_this_page = tmp;
> + tmp->wb_head = tmp;
> + nfs_free_request(tmp);
> + tmp = next;
> + } while (tmp != req);
> +}
> +
> /**
> * nfs_create_request - Create an NFS read/write request.
> * @ctx: open context to use
> * @page: page to write
> + * @last: last nfs request created for this page group or NULL if head
> * @offset: starting offset within the page for the write
> * @count: number of bytes to read/write
> *
> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> */
> struct nfs_page *
> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> - unsigned int offset, unsigned int count)
> + struct nfs_page *last, unsigned int offset,
> + unsigned int count)
> {
> struct nfs_page *req;
> struct nfs_lock_context *l_ctx;
> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> req->wb_bytes = count;
> req->wb_context = get_nfs_open_context(ctx);
> kref_init(&req->wb_kref);
> + nfs_page_group_init(req, last);
> return req;
> }
>
> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
> }
> }
>
> -
> /**
> * nfs_release_request - Release the count on an NFS read/write request
> * @req: request to release
> *
> * Note: Should never be called with the spinlock held!
> */
> -static void nfs_free_request(struct kref *kref)
> +static void nfs_free_request(struct nfs_page *req)
> {
> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> + WARN_ON_ONCE(req->wb_this_page != req);
> +
> + /* extra debug: make sure no sync bits are still set */
> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>
> /* Release struct file and open context */
> nfs_clear_request(req);
> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>
> void nfs_release_request(struct nfs_page *req)
> {
> - kref_put(&req->wb_kref, nfs_free_request);
> + kref_put(&req->wb_kref, nfs_page_group_destroy);
> }
>
> static int nfs_wait_bit_uninterruptible(void *word)
> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
> * @desc: destination io descriptor
> * @req: request
> *
> + * This may split a request into subrequests which are all part of the
> + * same page group.
> + *
> * Returns true if the request 'req' was successfully coalesced into the
> * existing list of pages 'desc'.
> */
> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
> struct nfs_page *req)
> {
> - while (!nfs_pageio_do_add_request(desc, req)) {
> - desc->pg_moreio = 1;
> - nfs_pageio_doio(desc);
> - if (desc->pg_error < 0)
> - return 0;
> - desc->pg_moreio = 0;
> - if (desc->pg_recoalesce)
> - return 0;
> - }
> + struct nfs_page *subreq;
> + unsigned int bytes_left = 0;
> + unsigned int offset, pgbase;
> +
> + nfs_page_group_lock(req);
> +
> + subreq = req;
> + bytes_left = subreq->wb_bytes;
> + offset = subreq->wb_offset;
> + pgbase = subreq->wb_pgbase;
> +
> + do {
> + if (!nfs_pageio_do_add_request(desc, subreq)) {
> + /* make sure pg_test call(s) did nothing */
> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
> + WARN_ON_ONCE(subreq->wb_offset != offset);
> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
> +
> + nfs_page_group_unlock(req);
> + desc->pg_moreio = 1;
> + nfs_pageio_doio(desc);
> + if (desc->pg_error < 0)
> + return 0;
> + desc->pg_moreio = 0;
> + if (desc->pg_recoalesce)
> + return 0;
> + /* retry add_request for this subreq */
> + nfs_page_group_lock(req);
> + continue;
> + }
> +
> + /* check for buggy pg_test call(s) */
> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
> + WARN_ON_ONCE(subreq->wb_bytes == 0);
> +
> + bytes_left -= subreq->wb_bytes;
> + offset += subreq->wb_bytes;
> + pgbase += subreq->wb_bytes;
> +
> + if (bytes_left) {
> + subreq = nfs_create_request(req->wb_context,
> + req->wb_page,
> + subreq, pgbase, bytes_left);
> + nfs_lock_request(subreq);
> + subreq->wb_offset = offset;
> + subreq->wb_index = req->wb_index;
> + }
> + } while (bytes_left > 0);
> +
> + nfs_page_group_unlock(req);
> return 1;
> }
>
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 95a0855..ee0a3cd 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> len = nfs_page_length(page);
> if (len == 0)
> return nfs_return_empty_page(page);
> - new = nfs_create_request(ctx, page, 0, len);
> + new = nfs_create_request(ctx, page, NULL, 0, len);
> if (IS_ERR(new)) {
> unlock_page(page);
> return PTR_ERR(new);
> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
> if (len == 0)
> return nfs_return_empty_page(page);
>
> - new = nfs_create_request(desc->ctx, page, 0, len);
> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
> if (IS_ERR(new))
> goto out_error;
>
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index ca20ec7..d1453f2 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> }
> nfsi->npages--;
> spin_unlock(&inode->i_lock);
> - nfs_release_request(req);
> + nfs_release_request(head);
> }
>
> static void
> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
> {
> struct nfs_commit_info cinfo;
> unsigned long bytes = 0;
> + bool do_destroy;
>
> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
> goto out;
> @@ -654,6 +655,7 @@ remove_req:
> next:
> nfs_unlock_request(req);
> nfs_end_page_writeback(req->wb_page);
> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
> nfs_release_request(req);
> }
> out:
> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
> if (req == NULL)
> goto out_unlock;
>
> + /* should be handled by nfs_flush_incompatible */
> + WARN_ON_ONCE(req->wb_head != req);
> + WARN_ON_ONCE(req->wb_this_page != req);
> +
> rqend = req->wb_offset + req->wb_bytes;
> /*
> * Tell the caller to flush out the request if
> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
> req = nfs_try_to_update_request(inode, page, offset, bytes);
> if (req != NULL)
> goto out;
> - req = nfs_create_request(ctx, page, offset, bytes);
> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
> if (IS_ERR(req))
> goto out;
> nfs_inode_add_request(inode, req);
> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
> return 0;
> l_ctx = req->wb_lock_context;
> do_flush = req->wb_page != page || req->wb_context != ctx;
> + /* for now, flush if more than 1 request in page_group */
> + do_flush |= req->wb_this_page != req;
> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
> do_flush |= l_ctx->lockowner.l_owner != current->files
> || l_ctx->lockowner.l_pid != current->tgid;
> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> index 214e098..1fb161b 100644
> --- a/include/linux/nfs_page.h
> +++ b/include/linux/nfs_page.h
> @@ -26,6 +26,8 @@ enum {
> PG_MAPPED, /* page private set for buffered io */
> PG_CLEAN, /* write succeeded */
> PG_COMMIT_TO_DS, /* used by pnfs layouts */
> + PG_HEADLOCK, /* page group lock of wb_head */
> + PG_TEARDOWN, /* page group sync for destroy */
> };
>
> struct nfs_inode;
> @@ -41,6 +43,8 @@ struct nfs_page {
> struct kref wb_kref; /* reference count */
> unsigned long wb_flags;
> struct nfs_write_verifier wb_verf; /* Commit cookie */
> + struct nfs_page *wb_this_page; /* list of reqs for this page */
> + struct nfs_page *wb_head; /* head pointer for req list */
> };
>
> struct nfs_pageio_descriptor;
> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
>
> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
> struct page *page,
> + struct nfs_page *last,
> unsigned int offset,
> unsigned int count);
> -extern void nfs_release_request(struct nfs_page *req);
> +extern void nfs_release_request(struct nfs_page *);
>
>
> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> struct nfs_page *req);
> extern int nfs_wait_on_request(struct nfs_page *);
> extern void nfs_unlock_request(struct nfs_page *req);
> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
> +extern void nfs_unlock_and_release_request(struct nfs_page *);
> +extern void nfs_page_group_lock(struct nfs_page *);
> +extern void nfs_page_group_unlock(struct nfs_page *);
> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>
> /*
> * Lock the page of an asynchronous request
> --
> 1.8.5.2 (Apple Git-48)
>


2014-04-22 21:29:02

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 03/17] nfs: modify pg_test interface to return size_t

This is a step toward allowing pg_test to inform the
coalescing code to reduce the size of requests so they may fit in
whatever scheme the pg_test callback wants to define.

For now, just return the size of the request if there is space, or 0
if there is not. This shouldn't change any behavior as it acts
the same as when the pg_test functions returned bool.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 16 ++++++++++++----
fs/nfs/nfs4filelayout.c | 12 +++++++-----
fs/nfs/objlayout/objio_osd.c | 14 ++++++++++----
fs/nfs/pagelist.c | 22 +++++++++++++++++++---
fs/nfs/pnfs.c | 12 +++++++++---
fs/nfs/pnfs.h | 3 ++-
include/linux/nfs_page.h | 5 +++--
7 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 65d849b..3867976 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
pnfs_generic_pg_init_read(pgio, req);
}

-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, SECTOR_SIZE))
- return false;
+ return 0;

return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
}
}

-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, PAGE_CACHE_SIZE))
- return false;
+ return 0;

return pnfs_generic_pg_test(pgio, prev, req);
}
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index b6fc8c1..dfc7282 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -915,10 +915,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
- * return true : coalesce page
- * return false : don't coalesce page
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
-static bool
+static size_t
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
@@ -927,7 +927,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,

if (!pnfs_generic_pg_test(pgio, prev, req) ||
!nfs_generic_pg_test(pgio, prev, req))
- return false;
+ return 0;

p_stripe = (u64)req_offset(prev);
r_stripe = (u64)req_offset(req);
@@ -936,7 +936,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
do_div(p_stripe, stripe_unit);
do_div(r_stripe, stripe_unit);

- return (p_stripe == r_stripe);
+ if (p_stripe == r_stripe)
+ return req->wb_bytes;
+ return 0;
}

static void
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5457745..c20352a 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -564,14 +564,20 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
return 0;
}

-static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req)
{
if (!pnfs_generic_pg_test(pgio, prev, req))
- return false;
+ return 0;

- return pgio->pg_count + req->wb_bytes <=
- (unsigned long)pgio->pg_layout_private;
+ if (pgio->pg_count + req->wb_bytes <=
+ (unsigned long)pgio->pg_layout_private)
+ return req->wb_bytes;
+ return 0;
}

static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ecd34b7..3c35b9e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -277,7 +277,17 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE);
}

-bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Return 0 if @req cannot be coalesced into @desc, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *prev, struct nfs_page *req)
{
/*
* FIXME: ideally we should be able to coalesce all requests
@@ -289,7 +299,9 @@ bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *pr
if (desc->pg_bsize < PAGE_SIZE)
return 0;

- return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+ if (desc->pg_count + req->wb_bytes <= desc->pg_bsize)
+ return req->wb_bytes;
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);

@@ -354,6 +366,8 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
+ size_t size;
+
if (!nfs_match_open_context(req->wb_context, prev->wb_context))
return false;
if (req->wb_context->dentry->d_inode->i_flock != NULL &&
@@ -365,7 +379,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
return false;
- return pgio->pg_ops->pg_test(pgio, prev, req);
+ size = pgio->pg_ops->pg_test(pgio, prev, req);
+ WARN_ON_ONCE(size && size != req->wb_bytes);
+ return size > 0;
}

/**
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cb53d45..6201bf6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1461,7 +1461,11 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
}

-bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
@@ -1482,8 +1486,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
* first byte that lies outside the pnfs_layout_range. FIXME?
*
*/
- return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
- pgio->pg_lseg->pls_range.length);
+ if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length))
+ return req->wb_bytes;
+ return 0;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0237939..0386d7c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -192,7 +192,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size);
int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
-bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req);
void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 905809d..214e098 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -46,7 +46,8 @@ struct nfs_page {
struct nfs_pageio_descriptor;
struct nfs_pageio_ops {
void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *);
- bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+ size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *,
+ struct nfs_page *);
int (*pg_doio)(struct nfs_pageio_descriptor *);
};

@@ -89,7 +90,7 @@ extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
struct nfs_page *);
extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
-extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev,
struct nfs_page *req);
extern int nfs_wait_on_request(struct nfs_page *);
--
1.8.5.2 (Apple Git-48)


2014-04-24 16:52:51

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Thu, 24 Apr 2014 12:15:08 -0400
Weston Andros Adamson <[email protected]> wrote:

> On Apr 24, 2014, at 11:45 AM, Jeff Layton <[email protected]> wrote:
>
> > On Thu, 24 Apr 2014 11:23:19 -0400
> > Weston Andros Adamson <[email protected]> wrote:
> >
> >> On Apr 24, 2014, at 10:50 AM, Jeff Layton <[email protected]> wrote:
> >>
> >>> On Tue, 22 Apr 2014 17:29:13 -0400
> >>> Weston Andros Adamson <[email protected]> wrote:
> >>>
> >>>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
> >>>> that all reference the same page. This gives nfs read and write paths
> >>>> the ability to account for sub-page regions independently. This
> >>>> somewhat follows the design of struct buffer_head's sub-page
> >>>> accounting.
> >>>>
> >>>> Only "head" requests are ever added/removed from the inode list in
> >>>> the buffered write path. "head" and "sub" requests are treated the
> >>>> same through the read path and the rest of the write/commit path.
> >>>> Requests are given an extra reference across the life of the list.
> >>>>
> >>>> Page groups are never rejoined after being split. If the read/write
> >>>> request fails and the client falls back to another path (ie revert
> >>>> to MDS in PNFS case), the already split requests are pushed through
> >>>> the recoalescing code again, which may split them further and then
> >>>> coalesce them into properly sized requests on the wire. Fragmentation
> >>>> shouldn't be a problem with the current design, because we flush all
> >>>> requests in page group when a non-contiguous request is added, so
> >>>> the only time resplitting should occur is on a resend of a read or
> >>>> write.
> >>>>
> >>>> This patch lays the groundwork for sub-page splitting, but does not
> >>>> actually do any splitting. For now all page groups have one request
> >>>> as pg_test functions don't yet split pages. There are several related
> >>>> patches that are needed support multiple requests per page group.
> >>>>
> >>>> Signed-off-by: Weston Andros Adamson <[email protected]>
> >>>> ---
> >>>> fs/nfs/direct.c | 7 +-
> >>>> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
> >>>> fs/nfs/read.c | 4 +-
> >>>> fs/nfs/write.c | 12 ++-
> >>>> include/linux/nfs_page.h | 12 ++-
> >>>> 5 files changed, 231 insertions(+), 22 deletions(-)
> >>>>
> >>>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> >>>> index a0c30c5..9d968ca 100644
> >>>> --- a/fs/nfs/direct.c
> >>>> +++ b/fs/nfs/direct.c
> >>>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
> >>>> struct nfs_page *req;
> >>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> >>>> /* XXX do we need to do the eof zeroing found in async_filler? */
> >>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
> >>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> >>>> pgbase, req_len);
> >>>> if (IS_ERR(req)) {
> >>>> result = PTR_ERR(req);
> >>>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
> >>>> struct nfs_page *req;
> >>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> >>>>
> >>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
> >>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> >>>> pgbase, req_len);
> >>>> if (IS_ERR(req)) {
> >>>> result = PTR_ERR(req);
> >>>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> >>>> spin_unlock(&dreq->lock);
> >>>>
> >>>> while (!list_empty(&hdr->pages)) {
> >>>> + bool do_destroy = true;
> >>>> +
> >>>> req = nfs_list_entry(hdr->pages.next);
> >>>> nfs_list_remove_request(req);
> >>>> switch (bit) {
> >>>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> >>>> case NFS_IOHDR_NEED_COMMIT:
> >>>> kref_get(&req->wb_kref);
> >>>> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
> >>>> + do_destroy = false;
> >>>> }
> >>>> nfs_unlock_and_release_request(req);
> >>>> }
> >>>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> >>>> index ac4fb64..8cb8e14 100644
> >>>> --- a/fs/nfs/pagelist.c
> >>>> +++ b/fs/nfs/pagelist.c
> >>>> @@ -26,6 +26,8 @@
> >>>>
> >>>> static struct kmem_cache *nfs_page_cachep;
> >>>>
> >>>> +static void nfs_free_request(struct nfs_page *);
> >>>> +
> >>>> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
> >>>> {
> >>>> p->npages = pagecount;
> >>>> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> >>>> return __nfs_iocounter_wait(c);
> >>>> }
> >>>>
> >>>> +/*
> >>>> + * nfs_page_group_lock - lock the head of the page group
> >>>> + * @req - request in group that is to be locked
> >>>> + *
> >>>> + * this lock must be held if modifying the page group list
> >>>> + */
> >>>> +void
> >>>> +nfs_page_group_lock(struct nfs_page *req)
> >>>> +{
> >>>> + struct nfs_page *head = req->wb_head;
> >>>> + int err = -EAGAIN;
> >>>> +
> >>>> + WARN_ON_ONCE(head != head->wb_head);
> >>>> +
> >>>> + while (err)
> >>>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
> >>>> + nfs_wait_bit_killable, TASK_KILLABLE);
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * nfs_page_group_unlock - unlock the head of the page group
> >>>> + * @req - request in group that is to be unlocked
> >>>> + */
> >>>> +void
> >>>> +nfs_page_group_unlock(struct nfs_page *req)
> >>>> +{
> >>>> + struct nfs_page *head = req->wb_head;
> >>>> +
> >>>> + WARN_ON_ONCE(head != head->wb_head);
> >>>> +
> >>>> + smp_mb__before_clear_bit();
> >>>> + clear_bit(PG_HEADLOCK, &head->wb_flags);
> >>>> + smp_mb__after_clear_bit();
> >>>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * nfs_page_group_sync_on_bit_locked
> >>>> + *
> >>>> + * must be called with page group lock held
> >>>> + */
> >>>> +static bool
> >>>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> >>>> +{
> >>>> + struct nfs_page *head = req->wb_head;
> >>>> + struct nfs_page *tmp;
> >>>> +
> >>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
> >>>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
> >>>> +
> >>>> + tmp = req->wb_this_page;
> >>>> + while (tmp != req) {
> >>>> + if (!test_bit(bit, &tmp->wb_flags))
> >>>> + return false;
> >>>> + tmp = tmp->wb_this_page;
> >>>> + }
> >>>> +
> >>>> + /* true! reset all bits */
> >>>> + tmp = req;
> >>>> + do {
> >>>> + clear_bit(bit, &tmp->wb_flags);
> >>>> + tmp = tmp->wb_this_page;
> >>>> + } while (tmp != req);
> >>>> +
> >>>> + return true;
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
> >>>> + * return true if the bit is set for all requests in page group
> >>>> + * @req - request in page group
> >>>> + * @bit - PG_* bit that is used to sync page group
> >>>> + */
> >>>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
> >>>> +{
> >>>> + bool ret;
> >>>> +
> >>>> + nfs_page_group_lock(req);
> >>>> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
> >>>> + nfs_page_group_unlock(req);
> >>>> +
> >>>> + return ret;
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * nfs_page_group_init - Initialize the page group linkage for @req
> >>>> + * @req - a new nfs request
> >>>> + * @prev - the previous request in page group, or NULL if @req is the first
> >>>> + * or only request in the group (the head).
> >>>> + */
> >>>> +static inline void
> >>>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
> >>>> +{
> >>>> + WARN_ON_ONCE(prev == req);
> >>>> +
> >>>> + if (!prev) {
> >>>> + req->wb_head = req;
> >>>> + req->wb_this_page = req;
> >>>> + } else {
> >>>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
> >>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
> >>>> + req->wb_head = prev->wb_head;
> >>>> + req->wb_this_page = prev->wb_this_page;
> >>>> + prev->wb_this_page = req;
> >>>> + }
> >>>> +}
> >>>> +
> >>>> +/*
> >>>> + * nfs_page_group_destroy - sync the destruction of page groups
> >>>> + * @req - request that no longer needs the page group
> >>>> + *
> >>>> + * releases the page group reference from each member once all
> >>>> + * members have called this function.
> >>>> + */
> >>>> +static void
> >>>> +nfs_page_group_destroy(struct kref *kref)
> >>>> +{
> >>>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> >>>> + struct nfs_page *tmp, *next;
> >>>> +
> >>>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
> >>>> + return;
> >>>> +
> >>>> + tmp = req;
> >>>> + do {
> >>>> + next = tmp->wb_this_page;
> >>>> + /* unlink and free */
> >>>> + tmp->wb_this_page = tmp;
> >>>> + tmp->wb_head = tmp;
> >>>> + nfs_free_request(tmp);
> >>>> + tmp = next;
> >>>> + } while (tmp != req);
> >>>> +}
> >>>> +
> >>>> /**
> >>>> * nfs_create_request - Create an NFS read/write request.
> >>>> * @ctx: open context to use
> >>>> * @page: page to write
> >>>> + * @last: last nfs request created for this page group or NULL if head
> >>>> * @offset: starting offset within the page for the write
> >>>> * @count: number of bytes to read/write
> >>>> *
> >>>> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> >>>> */
> >>>> struct nfs_page *
> >>>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> >>>> - unsigned int offset, unsigned int count)
> >>>> + struct nfs_page *last, unsigned int offset,
> >>>> + unsigned int count)
> >>>> {
> >>>> struct nfs_page *req;
> >>>> struct nfs_lock_context *l_ctx;
> >>>> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> >>>> req->wb_bytes = count;
> >>>> req->wb_context = get_nfs_open_context(ctx);
> >>>> kref_init(&req->wb_kref);
> >>>> + nfs_page_group_init(req, last);
> >>>> return req;
> >>>> }
> >>>>
> >>>> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
> >>>> }
> >>>> }
> >>>>
> >>>> -
> >>>> /**
> >>>> * nfs_release_request - Release the count on an NFS read/write request
> >>>> * @req: request to release
> >>>> *
> >>>> * Note: Should never be called with the spinlock held!
> >>>> */
> >>>> -static void nfs_free_request(struct kref *kref)
> >>>> +static void nfs_free_request(struct nfs_page *req)
> >>>> {
> >>>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> >>>> + WARN_ON_ONCE(req->wb_this_page != req);
> >>>> +
> >>>> + /* extra debug: make sure no sync bits are still set */
> >>>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
> >>>> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
> >>>> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
> >>>> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
> >>>> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
> >>>>
> >>>> /* Release struct file and open context */
> >>>> nfs_clear_request(req);
> >>>> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
> >>>>
> >>>> void nfs_release_request(struct nfs_page *req)
> >>>> {
> >>>> - kref_put(&req->wb_kref, nfs_free_request);
> >>>> + kref_put(&req->wb_kref, nfs_page_group_destroy);
> >>>> }
> >>>>
> >>>> static int nfs_wait_bit_uninterruptible(void *word)
> >>>> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
> >>>> * @desc: destination io descriptor
> >>>> * @req: request
> >>>> *
> >>>> + * This may split a request into subrequests which are all part of the
> >>>> + * same page group.
> >>>> + *
> >>>> * Returns true if the request 'req' was successfully coalesced into the
> >>>> * existing list of pages 'desc'.
> >>>> */
> >>>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
> >>>> struct nfs_page *req)
> >>>> {
> >>>> - while (!nfs_pageio_do_add_request(desc, req)) {
> >>>> - desc->pg_moreio = 1;
> >>>> - nfs_pageio_doio(desc);
> >>>> - if (desc->pg_error < 0)
> >>>> - return 0;
> >>>> - desc->pg_moreio = 0;
> >>>> - if (desc->pg_recoalesce)
> >>>> - return 0;
> >>>> - }
> >>>> + struct nfs_page *subreq;
> >>>> + unsigned int bytes_left = 0;
> >>>> + unsigned int offset, pgbase;
> >>>> +
> >>>> + nfs_page_group_lock(req);
> >>>> +
> >>>> + subreq = req;
> >>>> + bytes_left = subreq->wb_bytes;
> >>>> + offset = subreq->wb_offset;
> >>>> + pgbase = subreq->wb_pgbase;
> >>>> +
> >>>> + do {
> >>>> + if (!nfs_pageio_do_add_request(desc, subreq)) {
> >>>> + /* make sure pg_test call(s) did nothing */
> >>>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
> >>>> + WARN_ON_ONCE(subreq->wb_offset != offset);
> >>>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
> >>>> +
> >>>> + nfs_page_group_unlock(req);
> >>>> + desc->pg_moreio = 1;
> >>>> + nfs_pageio_doio(desc);
> >>>> + if (desc->pg_error < 0)
> >>>> + return 0;
> >>>> + desc->pg_moreio = 0;
> >>>> + if (desc->pg_recoalesce)
> >>>> + return 0;
> >>>> + /* retry add_request for this subreq */
> >>>> + nfs_page_group_lock(req);
> >>>> + continue;
> >>>> + }
> >>>> +
> >>>> + /* check for buggy pg_test call(s) */
> >>>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
> >>>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
> >>>> + WARN_ON_ONCE(subreq->wb_bytes == 0);
> >>>> +
> >>>> + bytes_left -= subreq->wb_bytes;
> >>>> + offset += subreq->wb_bytes;
> >>>> + pgbase += subreq->wb_bytes;
> >>>> +
> >>>> + if (bytes_left) {
> >>>> + subreq = nfs_create_request(req->wb_context,
> >>>> + req->wb_page,
> >>>> + subreq, pgbase, bytes_left);
> >>>> + nfs_lock_request(subreq);
> >>>> + subreq->wb_offset = offset;
> >>>> + subreq->wb_index = req->wb_index;
> >>>> + }
> >>>> + } while (bytes_left > 0);
> >>>> +
> >>>> + nfs_page_group_unlock(req);
> >>>> return 1;
> >>>> }
> >>>>
> >>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> >>>> index 95a0855..ee0a3cd 100644
> >>>> --- a/fs/nfs/read.c
> >>>> +++ b/fs/nfs/read.c
> >>>> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> >>>> len = nfs_page_length(page);
> >>>> if (len == 0)
> >>>> return nfs_return_empty_page(page);
> >>>> - new = nfs_create_request(ctx, page, 0, len);
> >>>> + new = nfs_create_request(ctx, page, NULL, 0, len);
> >>>> if (IS_ERR(new)) {
> >>>> unlock_page(page);
> >>>> return PTR_ERR(new);
> >>>> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
> >>>> if (len == 0)
> >>>> return nfs_return_empty_page(page);
> >>>>
> >>>> - new = nfs_create_request(desc->ctx, page, 0, len);
> >>>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
> >>>> if (IS_ERR(new))
> >>>> goto out_error;
> >>>>
> >>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> >>>> index ca20ec7..d1453f2 100644
> >>>> --- a/fs/nfs/write.c
> >>>> +++ b/fs/nfs/write.c
> >>>> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> >>>> }
> >>>> nfsi->npages--;
> >>>> spin_unlock(&inode->i_lock);
> >>>> - nfs_release_request(req);
> >>>> + nfs_release_request(head);
> >>>> }
> >>>>
> >>>> static void
> >>>> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
> >>>> {
> >>>> struct nfs_commit_info cinfo;
> >>>> unsigned long bytes = 0;
> >>>> + bool do_destroy;
> >>>>
> >>>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
> >>>> goto out;
> >>>> @@ -654,6 +655,7 @@ remove_req:
> >>>> next:
> >>>> nfs_unlock_request(req);
> >>>> nfs_end_page_writeback(req->wb_page);
> >>>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
> >>>> nfs_release_request(req);
> >>>> }
> >>>> out:
> >>>> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
> >>>> if (req == NULL)
> >>>> goto out_unlock;
> >>>>
> >>>> + /* should be handled by nfs_flush_incompatible */
> >>>> + WARN_ON_ONCE(req->wb_head != req);
> >>>> + WARN_ON_ONCE(req->wb_this_page != req);
> >>>> +
> >>>> rqend = req->wb_offset + req->wb_bytes;
> >>>> /*
> >>>> * Tell the caller to flush out the request if
> >>>> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
> >>>> req = nfs_try_to_update_request(inode, page, offset, bytes);
> >>>> if (req != NULL)
> >>>> goto out;
> >>>> - req = nfs_create_request(ctx, page, offset, bytes);
> >>>> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
> >>>> if (IS_ERR(req))
> >>>> goto out;
> >>>> nfs_inode_add_request(inode, req);
> >>>> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
> >>>> return 0;
> >>>> l_ctx = req->wb_lock_context;
> >>>> do_flush = req->wb_page != page || req->wb_context != ctx;
> >>>> + /* for now, flush if more than 1 request in page_group */
> >>>> + do_flush |= req->wb_this_page != req;
> >>>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
> >>>> do_flush |= l_ctx->lockowner.l_owner != current->files
> >>>> || l_ctx->lockowner.l_pid != current->tgid;
> >>>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> >>>> index 214e098..1fb161b 100644
> >>>> --- a/include/linux/nfs_page.h
> >>>> +++ b/include/linux/nfs_page.h
> >>>> @@ -26,6 +26,8 @@ enum {
> >>>> PG_MAPPED, /* page private set for buffered io */
> >>>> PG_CLEAN, /* write succeeded */
> >>>> PG_COMMIT_TO_DS, /* used by pnfs layouts */
> >>>> + PG_HEADLOCK, /* page group lock of wb_head */
> >>>> + PG_TEARDOWN, /* page group sync for destroy */
> >>>> };
> >>>>
> >>>> struct nfs_inode;
> >>>> @@ -41,6 +43,8 @@ struct nfs_page {
> >>>> struct kref wb_kref; /* reference count */
> >>>> unsigned long wb_flags;
> >>>> struct nfs_write_verifier wb_verf; /* Commit cookie */
> >>>> + struct nfs_page *wb_this_page; /* list of reqs for this page */
> >>>> + struct nfs_page *wb_head; /* head pointer for req list */
> >>>
> >>> Hmm ok, so to make sure I understand...
> >>>
> >>> So page->private will point to the "head" req (struct page_private).
> >>
> >> Only in the buffered write case. Page->private is not set for read path / direct i/o path.
> >>
> >>> Then we'll have a singly-linked list of reqs hanging off of
> >>> wb_this_page. Is that right?
> >>>
> >>> If so, then it seems like it would be clearer to use a standard
> >>> list_head here. If you need to get to the wb_head, you could always do
> >>> something like this:
> >>>
> >>> list_first_entry(&req->wb_page->wb_this_page);
> >>
> >> Well, wb_page is a struct page and doesn't have wb_this_page (which is in struct
> >> nfs_page), but I see where you're going with this.
> >>
> >
> > Doh, right! Sorry, I threw that together in haste, but you get the
> > idea. I was thinking you could go back to the page and dereference
> > ->private.
> >
> >> A strategy like this only works if we always have page->private pointing to the head
> >> request. We chose not to go that way because it messes with the buffered
> >> write path's setting / clearing of page private which interacts with the swappable
> >> nfs pages code that everyone seems to be afraid to touch ;)
> >>
> >> So we decided to go this route (not messing with page_private) as a first step - we
> >> certainly could add it later, but the current approach makes things less complex.
> >>
> >
> > Ok, that makes sense. Thanks...
> >
> >>>
> >>> ...and could even turn that into a macro or static inline for some
> >>> syntactic sugar. It's a little more pointer chasing to find the head,
> >>> but it seems like that would be clearer than using yet another
> >>> linked-list implementation.
> >>
> >> So, I'm not against using list_head.. I didn't go that route initially because I was:
> >>
> >> 1) following the buffer_head example, which rolls its own list
> >>
> >
> > I wouldn't be surprised if the buffer_head code predates the standard
> > linked-list macros, so that probably explains why they did it that way.
> > The file locking code has a similar construct in inode->i_flock list.
>
> AFAIK the sub-page functionality was added somewhat recently.
>
> >
> >> 2) trying to grow nfs_page as little as possible - but we might have room within
> >> the allocator bucket it currently lives in?
> >>
> >
> > nfs_page comes out of a dedicated slabcache, so that probably won't be the case.
>
> Ah, right!
>
> >
> >> 3) not sure list_head is suitable for a circular list (I haven't ever looked into it).
> >>
> >> and until we have a way to find the head request (via page private, etc) without
> >> walking the circular list (chicken / egg problem needing to grab head lock before walking
> >> list to find the head to lock it), we'll still need the head pointer.
> >>
> >> Thoughts?
> >>
> >> -dros
> >>
> >
> > If you can't rely on page->private pointing to the request, then that
> > does make it tough to do what I was suggesting. struct list_head lists
> > are doubly-linked and circular by nature, so that does seem to be a
> > natural fit for what you're trying to do.
>
> Oh I see -- you're totally right about list_head being circular, one just has
> to call for_each on whatever head they wish to start from.
>
> >
> > The only problem is that struct list_head is two pointers instead of
> > one, so it's not going to be as space-efficient as what you're doing
> > here. If that's a large concern then you may have no choice but to do
> > this after all.
>
> Right. How much do we care about an extra pointer here? It seems to me
> that we should try to keep it as small as possible - I know Trond has been unwilling
> to add members to rpc_task (for example) unless absolutely necessary and there will
> be at least one (if not more) nfs_page structures per rpc_task.
>

Well there are potentially a lot of these structs, so an extra pointer
in each adds up.

In fact, if only the head req is ever on the per-inode list, then I
guess the wb_list is unused for sub requests, right? That might be an
opportunity for space savings too -- you could union wb_head and
wb_list, and use a wb_flag to indicate which is valid...

> One immediate takeaway: I need to add much better comments about this.
>
> As far as eventually removing the wb_head pointer, it gets really ugly to do without
> changing the buffered write path (and swappable page semantics) because page_group
> operations happen *after* nfs_inode_remove_request() clears page_private (syncing the
> destruction of the page group). This means that nfs_release_request and
> nfs_unlock_and_release_request will both have to be passed a previously cached head
> pointer. yuck.
>

Ahh right -- that is tricky then. I'd have to ponder that a bit more...
--
Jeff Layton <[email protected]>

2014-04-22 21:29:05

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

Add "page groups" - a circular list of nfs requests (struct nfs_page)
that all reference the same page. This gives nfs read and write paths
the ability to account for sub-page regions independently. This
somewhat follows the design of struct buffer_head's sub-page
accounting.

Only "head" requests are ever added/removed from the inode list in
the buffered write path. "head" and "sub" requests are treated the
same through the read path and the rest of the write/commit path.
Requests are given an extra reference across the life of the list.

Page groups are never rejoined after being split. If the read/write
request fails and the client falls back to another path (ie revert
to MDS in PNFS case), the already split requests are pushed through
the recoalescing code again, which may split them further and then
coalesce them into properly sized requests on the wire. Fragmentation
shouldn't be a problem with the current design, because we flush all
requests in page group when a non-contiguous request is added, so
the only time resplitting should occur is on a resend of a read or
write.

This patch lays the groundwork for sub-page splitting, but does not
actually do any splitting. For now all page groups have one request
as pg_test functions don't yet split pages. There are several related
patches that are needed to support multiple requests per page group.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/direct.c | 7 +-
fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
fs/nfs/read.c | 4 +-
fs/nfs/write.c | 12 ++-
include/linux/nfs_page.h | 12 ++-
5 files changed, 231 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a0c30c5..9d968ca 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
/* XXX do we need to do the eof zeroing found in async_filler? */
- req = nfs_create_request(dreq->ctx, pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
@@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

- req = nfs_create_request(dreq->ctx, pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
@@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
spin_unlock(&dreq->lock);

while (!list_empty(&hdr->pages)) {
+ bool do_destroy = true;
+
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
switch (bit) {
@@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
case NFS_IOHDR_NEED_COMMIT:
kref_get(&req->wb_kref);
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+ do_destroy = false;
}
nfs_unlock_and_release_request(req);
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ac4fb64..8cb8e14 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,8 @@

static struct kmem_cache *nfs_page_cachep;

+static void nfs_free_request(struct nfs_page *);
+
bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
{
p->npages = pagecount;
@@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
return __nfs_iocounter_wait(c);
}

+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ *
+ * this lock must be held if modifying the page group list
+ */
+void
+nfs_page_group_lock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+ int err = -EAGAIN;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ while (err)
+ err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ smp_mb__before_clear_bit();
+ clear_bit(PG_HEADLOCK, &head->wb_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+ struct nfs_page *head = req->wb_head;
+ struct nfs_page *tmp;
+
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+ WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+ tmp = req->wb_this_page;
+ while (tmp != req) {
+ if (!test_bit(bit, &tmp->wb_flags))
+ return false;
+ tmp = tmp->wb_this_page;
+ }
+
+ /* true! reset all bits */
+ tmp = req;
+ do {
+ clear_bit(bit, &tmp->wb_flags);
+ tmp = tmp->wb_this_page;
+ } while (tmp != req);
+
+ return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ * return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+ bool ret;
+
+ nfs_page_group_lock(req);
+ ret = nfs_page_group_sync_on_bit_locked(req, bit);
+ nfs_page_group_unlock(req);
+
+ return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ * or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+ WARN_ON_ONCE(prev == req);
+
+ if (!prev) {
+ req->wb_head = req;
+ req->wb_this_page = req;
+ } else {
+ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+ req->wb_head = prev->wb_head;
+ req->wb_this_page = prev->wb_this_page;
+ prev->wb_this_page = req;
+ }
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+ struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *tmp, *next;
+
+ if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+ return;
+
+ tmp = req;
+ do {
+ next = tmp->wb_this_page;
+ /* unlink and free */
+ tmp->wb_this_page = tmp;
+ tmp->wb_head = tmp;
+ nfs_free_request(tmp);
+ tmp = next;
+ } while (tmp != req);
+}
+
/**
* nfs_create_request - Create an NFS read/write request.
* @ctx: open context to use
* @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
* @offset: starting offset within the page for the write
* @count: number of bytes to read/write
*
@@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
*/
struct nfs_page *
nfs_create_request(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
+ struct nfs_page *last, unsigned int offset,
+ unsigned int count)
{
struct nfs_page *req;
struct nfs_lock_context *l_ctx;
@@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
req->wb_bytes = count;
req->wb_context = get_nfs_open_context(ctx);
kref_init(&req->wb_kref);
+ nfs_page_group_init(req, last);
return req;
}

@@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
}
}

-
/**
* nfs_release_request - Release the count on an NFS read/write request
* @req: request to release
*
* Note: Should never be called with the spinlock held!
*/
-static void nfs_free_request(struct kref *kref)
+static void nfs_free_request(struct nfs_page *req)
{
- struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* extra debug: make sure no sync bits are still set */
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));

/* Release struct file and open context */
nfs_clear_request(req);
@@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)

void nfs_release_request(struct nfs_page *req)
{
- kref_put(&req->wb_kref, nfs_free_request);
+ kref_put(&req->wb_kref, nfs_page_group_destroy);
}

static int nfs_wait_bit_uninterruptible(void *word)
@@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
* @desc: destination io descriptor
* @req: request
*
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
*/
static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
- while (!nfs_pageio_do_add_request(desc, req)) {
- desc->pg_moreio = 1;
- nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- desc->pg_moreio = 0;
- if (desc->pg_recoalesce)
- return 0;
- }
+ struct nfs_page *subreq;
+ unsigned int bytes_left = 0;
+ unsigned int offset, pgbase;
+
+ nfs_page_group_lock(req);
+
+ subreq = req;
+ bytes_left = subreq->wb_bytes;
+ offset = subreq->wb_offset;
+ pgbase = subreq->wb_pgbase;
+
+ do {
+ if (!nfs_pageio_do_add_request(desc, subreq)) {
+ /* make sure pg_test call(s) did nothing */
+ WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+ WARN_ON_ONCE(subreq->wb_offset != offset);
+ WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+ nfs_page_group_unlock(req);
+ desc->pg_moreio = 1;
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0)
+ return 0;
+ desc->pg_moreio = 0;
+ if (desc->pg_recoalesce)
+ return 0;
+ /* retry add_request for this subreq */
+ nfs_page_group_lock(req);
+ continue;
+ }
+
+ /* check for buggy pg_test call(s) */
+ WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+ WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+ WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+ bytes_left -= subreq->wb_bytes;
+ offset += subreq->wb_bytes;
+ pgbase += subreq->wb_bytes;
+
+ if (bytes_left) {
+ subreq = nfs_create_request(req->wb_context,
+ req->wb_page,
+ subreq, pgbase, bytes_left);
+ nfs_lock_request(subreq);
+ subreq->wb_offset = offset;
+ subreq->wb_index = req->wb_index;
+ }
+ } while (bytes_left > 0);
+
+ nfs_page_group_unlock(req);
return 1;
}

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 95a0855..ee0a3cd 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(ctx, page, 0, len);
+ new = nfs_create_request(ctx, page, NULL, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
@@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
if (len == 0)
return nfs_return_empty_page(page);

- new = nfs_create_request(desc->ctx, page, 0, len);
+ new = nfs_create_request(desc->ctx, page, NULL, 0, len);
if (IS_ERR(new))
goto out_error;

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ca20ec7..d1453f2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
nfsi->npages--;
spin_unlock(&inode->i_lock);
- nfs_release_request(req);
+ nfs_release_request(head);
}

static void
@@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
unsigned long bytes = 0;
+ bool do_destroy;

if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
goto out;
@@ -654,6 +655,7 @@ remove_req:
next:
nfs_unlock_request(req);
nfs_end_page_writeback(req->wb_page);
+ do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
nfs_release_request(req);
}
out:
@@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
if (req == NULL)
goto out_unlock;

+ /* should be handled by nfs_flush_incompatible */
+ WARN_ON_ONCE(req->wb_head != req);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
rqend = req->wb_offset + req->wb_bytes;
/*
* Tell the caller to flush out the request if
@@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, page, offset, bytes);
+ req = nfs_create_request(ctx, page, NULL, offset, bytes);
if (IS_ERR(req))
goto out;
nfs_inode_add_request(inode, req);
@@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
return 0;
l_ctx = req->wb_lock_context;
do_flush = req->wb_page != page || req->wb_context != ctx;
+ /* for now, flush if more than 1 request in page_group */
+ do_flush |= req->wb_this_page != req;
if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 214e098..1fb161b 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -26,6 +26,8 @@ enum {
PG_MAPPED, /* page private set for buffered io */
PG_CLEAN, /* write succeeded */
PG_COMMIT_TO_DS, /* used by pnfs layouts */
+ PG_HEADLOCK, /* page group lock of wb_head */
+ PG_TEARDOWN, /* page group sync for destroy */
};

struct nfs_inode;
@@ -41,6 +43,8 @@ struct nfs_page {
struct kref wb_kref; /* reference count */
unsigned long wb_flags;
struct nfs_write_verifier wb_verf; /* Commit cookie */
+ struct nfs_page *wb_this_page; /* list of reqs for this page */
+ struct nfs_page *wb_head; /* head pointer for req list */
};

struct nfs_pageio_descriptor;
@@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {

extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
struct page *page,
+ struct nfs_page *last,
unsigned int offset,
unsigned int count);
-extern void nfs_release_request(struct nfs_page *req);
+extern void nfs_release_request(struct nfs_page *);


extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
@@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
struct nfs_page *req);
extern int nfs_wait_on_request(struct nfs_page *);
extern void nfs_unlock_request(struct nfs_page *req);
-extern void nfs_unlock_and_release_request(struct nfs_page *req);
+extern void nfs_unlock_and_release_request(struct nfs_page *);
+extern void nfs_page_group_lock(struct nfs_page *);
+extern void nfs_page_group_unlock(struct nfs_page *);
+extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);

/*
* Lock the page of an asynchronous request
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:22

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 16/17] pnfs: filelayout: support non page aligned layouts

Use the new pg_test interface to adjust requests to fit in the current
stripe / segment.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 50 ++++++++++++++++++-------------------------------
1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 9a91f4f..145944f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -638,7 +638,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
- struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);

dprintk("--> %s\n", __func__);

@@ -656,7 +655,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out;
}

- if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+ if (!fl->stripe_unit) {
dprintk("%s Invalid stripe unit (%u)\n",
__func__, fl->stripe_unit);
goto out;
@@ -693,12 +692,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out_put;
}

- if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
- dprintk("%s Stripe unit (%u) not aligned with rsize %u "
- "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
- nfss->wsize);
- }
-
status = 0;
out:
dprintk("--> %s returns %d\n", __func__, status);
@@ -935,44 +928,40 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
{
unsigned int size;
u64 p_stripe, r_stripe;
- u32 stripe_unit;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;

/* calls nfs_generic_pg_test */
size = pnfs_generic_pg_test(pgio, prev, req);
if (!size)
return 0;

+ /* see if req and prev are in the same stripe */
if (prev) {
- p_stripe = (u64)req_offset(prev);
- r_stripe = (u64)req_offset(req);
- stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
-
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
do_div(p_stripe, stripe_unit);
do_div(r_stripe, stripe_unit);

if (p_stripe != r_stripe)
return 0;
}
- return min(size, req->wb_bytes);
+
+ /* calculate remaining bytes in the current stripe */
+ stripe_offset = ((u64)req_offset(req) - segment_offset) % stripe_unit;
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
}

static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase) {
- /*
- * Handling unaligned pages is difficult, because have to
- * somehow split a req in two in certain cases in the
- * pg.test code. Avoid this by just not using pnfs
- * in this case.
- */
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
@@ -990,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;

- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase)
- goto out_mds;
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
--
1.8.5.2 (Apple Git-48)


2014-04-23 17:44:15

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On Apr 23, 2014, at 10:36 AM, Anna Schumaker <[email protected]> wrote:

> On 04/23/2014 10:31 AM, Weston Andros Adamson wrote:
>> On Apr 23, 2014, at 10:16 AM, Anna Schumaker <[email protected]> wrote:
>>
>>> On 04/22/2014 05:29 PM, Weston Andros Adamson wrote:
>>>> Since the ability to split pages into subpage requests has been added,
>>>> nfs_pgio_header->rpc_list only ever has one wdata/rdata.
>>>>
>>>> Signed-off-by: Weston Andros Adamson <[email protected]>
>>>> ---
>>>> fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
>>>> fs/nfs/read.c | 35 +++++------------------------------
>>>> fs/nfs/write.c | 38 +++++++-------------------------------
>>>> include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
>>>> 4 files changed, 45 insertions(+), 104 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>> index 7c89385..3b3ec46 100644
>>>> --- a/fs/nfs/pnfs.c
>>>> +++ b/fs/nfs/pnfs.c
>>>> @@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>>>> }
>>>>
>>>> static void
>>>> -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
>>>> +pnfs_do_write(struct nfs_pageio_descriptor *desc,
>>>> + struct nfs_pgio_header *hdr, int how)
>>>> {
>>>> - struct nfs_write_data *data;
>>>> + struct nfs_write_data *data = hdr->data.write;
>>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>>> + enum pnfs_try_status trypnfs;
>>>>
>>>> desc->pg_lseg = NULL;
>>>> - while (!list_empty(head)) {
>>>> - enum pnfs_try_status trypnfs;
>>>> -
>>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>>> - list_del_init(&data->list);
>>>> -
>>>> - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>> - pnfs_write_through_mds(desc, data);
>>>> - }
>>>> + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>> + pnfs_write_through_mds(desc, data);
>>>> pnfs_put_lseg(lseg);
>>>> }
>>>>
>>>> @@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>>> pnfs_put_lseg(desc->pg_lseg);
>>>> desc->pg_lseg = NULL;
>>>> } else
>>>> - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
>>>> + pnfs_do_write(desc, hdr, desc->pg_ioflags);
>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>> hdr->completion_ops->completion(hdr);
>>>> return ret;
>>>> @@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
>>>> }
>>>>
>>>> static void
>>>> -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
>>>> +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
>>>> {
>>>> - struct nfs_read_data *data;
>>>> + struct nfs_read_data *data = hdr->data.read;
>>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>>> + enum pnfs_try_status trypnfs;
>>>>
>>>> desc->pg_lseg = NULL;
>>>> - while (!list_empty(head)) {
>>>> - enum pnfs_try_status trypnfs;
>>>> -
>>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>>> - list_del_init(&data->list);
>>>> -
>>>> - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>> - pnfs_read_through_mds(desc, data);
>>>> - }
>>>> + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>>> + pnfs_read_through_mds(desc, data);
>>>> pnfs_put_lseg(lseg);
>>>> }
>>>>
>>>> @@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>>> pnfs_put_lseg(desc->pg_lseg);
>>>> desc->pg_lseg = NULL;
>>>> } else
>>>> - pnfs_do_multiple_reads(desc, &hdr->rpc_list);
>>>> + pnfs_do_read(desc, hdr);
>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>> hdr->completion_ops->completion(hdr);
>>>> return ret;
>>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>>> index daeff0c..c6b7dd0 100644
>>>> --- a/fs/nfs/read.c
>>>> +++ b/fs/nfs/read.c
>>>> @@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
>>>> struct nfs_pgio_header *hdr = &rhdr->header;
>>>>
>>>> INIT_LIST_HEAD(&hdr->pages);
>>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>>> spin_lock_init(&hdr->lock);
>>>> atomic_set(&hdr->refcnt, 0);
>>>> }
>>>> @@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
>>>> return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
>>>> }
>>>>
>>>> -static int
>>>> -nfs_do_multiple_reads(struct list_head *head,
>>>> - const struct rpc_call_ops *call_ops)
>>>> -{
>>>> - struct nfs_read_data *data;
>>>> - int ret = 0;
>>>> -
>>>> - while (!list_empty(head)) {
>>>> - int ret2;
>>>> -
>>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>>> - list_del_init(&data->list);
>>>> -
>>>> - ret2 = nfs_do_read(data, call_ops);
>>>> - if (ret == 0)
>>>> - ret = ret2;
>>>> - }
>>>> - return ret;
>>>> -}
>>>> -
>>>> static void
>>>> nfs_async_read_error(struct list_head *head)
>>>> {
>>>> @@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
>>>> struct nfs_pgio_header *hdr)
>>>> {
>>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>>> - while (!list_empty(&hdr->rpc_list)) {
>>>> - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
>>>> - struct nfs_read_data, list);
>>>> - list_del(&data->list);
>>>> - nfs_readdata_release(data);
>>>> - }
>>>> + nfs_readdata_release(hdr->data.read);
>>>> + hdr->data.read = NULL;
>>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>>> }
>>>>
>>>> @@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
>>>> }
>>>>
>>>> nfs_read_rpcsetup(data, desc->pg_count, 0);
>>>> - list_add(&data->list, &hdr->rpc_list);
>>>> + WARN_ON_ONCE(hdr->data.read);
>>>> + hdr->data.read = data;
>>>> desc->pg_rpc_callops = &nfs_read_common_ops;
>>>> return 0;
>>>> }
>>>> @@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>>> atomic_inc(&hdr->refcnt);
>>>> ret = nfs_generic_pagein(desc, hdr);
>>>> if (ret == 0)
>>>> - ret = nfs_do_multiple_reads(&hdr->rpc_list,
>>>> - desc->pg_rpc_callops);
>>>> + ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>> hdr->completion_ops->completion(hdr);
>>>> return ret;
>>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>>> index f40db93..cd24a14 100644
>>>> --- a/fs/nfs/write.c
>>>> +++ b/fs/nfs/write.c
>>>> @@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
>>>>
>>>> memset(p, 0, sizeof(*p));
>>>> INIT_LIST_HEAD(&hdr->pages);
>>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>>> spin_lock_init(&hdr->lock);
>>>> atomic_set(&hdr->refcnt, 0);
>>>> hdr->verf = &p->verf;
>>>> @@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
>>>> return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
>>>> }
>>>>
>>>> -static int nfs_do_multiple_writes(struct list_head *head,
>>>> - const struct rpc_call_ops *call_ops,
>>>> - int how)
>>>> -{
>>>> - struct nfs_write_data *data;
>>>> - int ret = 0;
>>>> -
>>>> - while (!list_empty(head)) {
>>>> - int ret2;
>>>> -
>>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>>> - list_del_init(&data->list);
>>>> -
>>>> - ret2 = nfs_do_write(data, call_ops, how);
>>>> - if (ret == 0)
>>>> - ret = ret2;
>>>> - }
>>>> - return ret;
>>>> -}
>>>> -
>>>> /* If a nfs_flush_* function fails, it should remove reqs from @head and
>>>> * call this on each, which will prepare them to be retried on next
>>>> * writeback using standard nfs.
>>>> @@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
>>>> struct nfs_pgio_header *hdr)
>>>> {
>>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>>> - while (!list_empty(&hdr->rpc_list)) {
>>>> - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
>>>> - struct nfs_write_data, list);
>>>> - list_del(&data->list);
>>>> - nfs_writedata_release(data);
>>>> - }
>>>> + nfs_writedata_release(hdr->data.write);
>>>> + hdr->data.write = NULL;
>>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>>> }
>>>>
>>>> @@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>>>
>>>> /* Set up the argument struct */
>>>> nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
>>>> - list_add(&data->list, &hdr->rpc_list);
>>>> + WARN_ON_ONCE(hdr->data.write);
>>>> + hdr->data.write = data;
>>>> desc->pg_rpc_callops = &nfs_write_common_ops;
>>>> return 0;
>>>> }
>>>> @@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>>> atomic_inc(&hdr->refcnt);
>>>> ret = nfs_generic_flush(desc, hdr);
>>>> if (ret == 0)
>>>> - ret = nfs_do_multiple_writes(&hdr->rpc_list,
>>>> - desc->pg_rpc_callops,
>>>> - desc->pg_ioflags);
>>>> + ret = nfs_do_write(hdr->data.write,
>>>> + desc->pg_rpc_callops,
>>>> + desc->pg_ioflags);
>>>> if (atomic_dec_and_test(&hdr->refcnt))
>>>> hdr->completion_ops->completion(hdr);
>>>> return ret;
>>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>>>> index 6fb5b23..239274d 100644
>>>> --- a/include/linux/nfs_xdr.h
>>>> +++ b/include/linux/nfs_xdr.h
>>>> @@ -1266,7 +1266,6 @@ struct nfs_page_array {
>>>>
>>>> struct nfs_read_data {
>>>> struct nfs_pgio_header *header;
>>>> - struct list_head list;
>>>> struct rpc_task task;
>>>> struct nfs_fattr fattr; /* fattr storage */
>>>> struct nfs_readargs args;
>>>> @@ -1278,6 +1277,20 @@ struct nfs_read_data {
>>>> struct nfs_client *ds_clp; /* pNFS data server */
>>>> };
>>>>
>>>> +struct nfs_write_data {
>>>> + struct nfs_pgio_header *header;
>>>> + struct rpc_task task;
>>>> + struct nfs_fattr fattr;
>>>> + struct nfs_writeverf verf;
>>>> + struct nfs_writeargs args; /* argument struct */
>>>> + struct nfs_writeres res; /* result struct */
>>>> + unsigned long timestamp; /* For lease renewal */
>>>> + int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
>>>> + __u64 mds_offset; /* Filelayout dense stripe */
>>>> + struct nfs_page_array pages;
>>>> + struct nfs_client *ds_clp; /* pNFS data server */
>>>> +};
>>>> +
>>>> /* used as flag bits in nfs_pgio_header */
>>>> enum {
>>>> NFS_IOHDR_ERROR = 0,
>>>> @@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
>>>> struct inode *inode;
>>>> struct rpc_cred *cred;
>>>> struct list_head pages;
>>>> - struct list_head rpc_list;
>>>> + union {
>>>> + struct nfs_read_data *read;
>>>> + struct nfs_write_data *write;
>>>> + } data;
>>> The first 5 patches in my series makes it so we can share all of these structs. Would it be useful to put those in first?
>>>
>>> Anna
>>>
>> Yes, I think it makes sense to stage most (if not all) of your patches first then merge my patches in.
>>
>> I think I'll just give it a shot and see how bad it is. I need to post a rebased version of my patchset anyway,
>> so I'll see if I can also rebase on top of your changes.
>>
>> Any objections?
>
> No objections! As a reminder, I'm based off of Trond's [testing] branch with two extra pageio cleanups from Christoph. Shoot me an email if you need help!
>
> Anna
>>

Great news - the merge was pretty easy!

I ended up merging by hand - doing “git am --3way” on each patch so I could ensure
that they each build cleanly. When there were conflicts, I was able to compare the
old patch to the newly rebased patch to make sure I didn’t miss anything.

This exercise also helped me find a few problems with my patchset ;)

Now it’s time to test! I’ll share my branch on a public repo and repost my patchset
soon.

-dros

>>
>>>> atomic_t refcnt;
>>>> struct nfs_page *req;
>>>> struct nfs_writeverf *verf;
>>>> @@ -1315,21 +1331,6 @@ struct nfs_read_header {
>>>> struct nfs_read_data rpc_data;
>>>> };
>>>>
>>>> -struct nfs_write_data {
>>>> - struct nfs_pgio_header *header;
>>>> - struct list_head list;
>>>> - struct rpc_task task;
>>>> - struct nfs_fattr fattr;
>>>> - struct nfs_writeverf verf;
>>>> - struct nfs_writeargs args; /* argument struct */
>>>> - struct nfs_writeres res; /* result struct */
>>>> - unsigned long timestamp; /* For lease renewal */
>>>> - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
>>>> - __u64 mds_offset; /* Filelayout dense stripe */
>>>> - struct nfs_page_array pages;
>>>> - struct nfs_client *ds_clp; /* pNFS data server */
>>>> -};
>>>> -
>>>> struct nfs_write_header {
>>>> struct nfs_pgio_header header;
>>>> struct nfs_write_data rpc_data;


2014-04-24 11:55:23

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On 04/23/2014 08:51 PM, Anna Schumaker wrote:
> On 04/23/2014 01:44 PM, Weston Andros Adamson wrote:
>> Great news - the merge was pretty easy!
>>
>> I ended up merging by hand - doing “git am --3way” on each patch so I could ensure
>> that they each build cleanly. When there were conflicts, I was able to compare the
>> old patch to the newly rebased patch to make sure I didn’t miss anything.
>>
>> This exercise also helped me find a few problems with my patchset ;)
>>
>> Now it’s time to test! I’ll share my branch on a public repo and repost my patchset
>> soon.
>
> Great! I'm glad it went smoothly!
>
>
>>
>> -dros

Cool so I'll wait with the testing for your combined branch
Thanks
Boaz


2014-04-22 21:29:19

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 14/17] pnfs: support multiple verfs per direct req

Support direct requests that span multiple pnfs data servers by
comparing nfs_pgio_header->verf to a cached verf in pnfs_commit_bucket.
Continue to use dreq->verf if the MDS is used / non-pNFS.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/direct.c | 84 ++++++++++++++++++++++++++++++++++++++++++-------
fs/nfs/nfs4filelayout.c | 3 ++
include/linux/nfs.h | 5 ++-
include/linux/nfs_xdr.h | 2 ++
4 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9d968ca..53b86e6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -90,7 +90,7 @@ struct nfs_direct_req {
int flags;
#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
- struct nfs_writeverf verf; /* unstable write verifier */
+ struct nfs_writeverf verf; /* unstable write verifier */
};

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
@@ -108,6 +108,70 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}

+/*
+ * nfs_direct_set_or_cmp_hdr_verf - ensure that the same verifier is used
+ * for all requests (to each sever) in a
+ * direct io request.
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verfs
+ *
+ * If a verf has yet to be seen for the server associated with @hdr, set
+ * the verf for that server and return 0.
+ * If a verf has been seen for this server, compare @hdr->verf to it and
+ * return the result.
+ */
+static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_writeverf *ds_verf = &dreq->verf;
+ struct nfs_client *ds_clp = hdr->data.write->ds_clp;
+ int ds_idx = hdr->data.write->ds_idx;
+
+ if (ds_clp) {
+ /* pNFS is in use, use the DS verf */
+ if (ds_idx > dreq->ds_cinfo.nbuckets)
+ WARN_ON_ONCE(1);
+ else
+ ds_verf = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
+ }
+ if (ds_verf->committed < 0) {
+ memcpy(ds_verf, hdr->verf, sizeof(struct nfs_writeverf));
+ WARN_ON_ONCE(ds_verf->committed < 0);
+ return 0;
+ }
+ return memcmp(ds_verf, hdr->verf, sizeof(struct nfs_writeverf));
+}
+
+/*
+ * nfs_direct_cmp_commit_data_verf - check that a write verifier from a commit
+ * matches the verifier seen on write from
+ * the same server.
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verfs
+ *
+ * This function relies on nfs_direct_set_or_cmp_hdr_verf being called at least
+ * once to set the verf for this server - this is OK because both called (and
+ * only called) in the unstable write path.
+ *
+ * Compare @hdr->verf to it and return the result.
+ */
+static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
+ struct nfs_commit_data *data)
+{
+ struct nfs_writeverf *ds_verf = &dreq->verf;
+ int ds_idx = data->ds_commit_index;
+
+ if (data->ds_clp) {
+ /* pNFS is in use, use the DS verf */
+ if (ds_idx > dreq->ds_cinfo.nbuckets)
+ WARN_ON_ONCE(1);
+ else
+ ds_verf = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
+ }
+ WARN_ON_ONCE(ds_verf->committed < 0);
+ return memcmp(ds_verf, &data->verf, sizeof(struct nfs_writeverf));
+}
+
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @rw: direction (read or write)
@@ -168,6 +232,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);

@@ -602,7 +667,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
dprintk("NFS: %5u commit failed with error %d.\n",
data->task.tk_pid, status);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+ } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
}
@@ -810,16 +875,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
bit = NFS_IOHDR_NEED_RESCHED;
- else if (dreq->flags == 0) {
- memcpy(&dreq->verf, hdr->verf,
- sizeof(dreq->verf));
- bit = NFS_IOHDR_NEED_COMMIT;
- dreq->flags = NFS_ODIRECT_DO_COMMIT;
- } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ else {
+ if (dreq->flags == 0)
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+
+ if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
bit = NFS_IOHDR_NEED_RESCHED;
- } else
+ else
bit = NFS_IOHDR_NEED_COMMIT;
}
}
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index cfd76bd..9a91f4f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -603,6 +603,7 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
data->write_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
+ data->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
data->args.fh = fh;
@@ -875,6 +876,8 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
for (i = 0; i < size; i++) {
INIT_LIST_HEAD(&buckets[i].written);
INIT_LIST_HEAD(&buckets[i].committing);
+ /* mark direct verifier as unset */
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
}

spin_lock(cinfo->lock);
diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index 3e794c1..610af51 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -46,6 +46,9 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc
enum nfs3_stable_how {
NFS_UNSTABLE = 0,
NFS_DATA_SYNC = 1,
- NFS_FILE_SYNC = 2
+ NFS_FILE_SYNC = 2,
+
+ /* used by direct.c to mark verf as invalid */
+ NFS_INVALID_STABLE_HOW = -1
};
#endif /* _LINUX_NFS_H */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 239274d..f94d804 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1129,6 +1129,7 @@ struct pnfs_commit_bucket {
struct list_head committing;
struct pnfs_layout_segment *wlseg;
struct pnfs_layout_segment *clseg;
+ struct nfs_writeverf direct_verf;
};

struct pnfs_ds_commit_info {
@@ -1289,6 +1290,7 @@ struct nfs_write_data {
__u64 mds_offset; /* Filelayout dense stripe */
struct nfs_page_array pages;
struct nfs_client *ds_clp; /* pNFS data server */
+ int ds_idx; /* ds index if ds_clp is set */
};

/* used as flag bits in nfs_pgio_header */
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:04

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 04/17] nfs: call nfs_can_coalesce_requests for every req

Call nfs_can_coalesce_requests for every request, even the first one.
This is needed for future patches to give pg_test a way to inform
add_request to reduce the size of the request.

Now @prev can be null in nfs_can_coalesce_requests and pg_test functions.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 3 +++
fs/nfs/pagelist.c | 34 +++++++++++++++++++---------------
2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index dfc7282..8f27847 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -929,6 +929,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
!nfs_generic_pg_test(pgio, prev, req))
return 0;

+ if (!prev)
+ return req->wb_bytes;
+
p_stripe = (u64)req_offset(prev);
r_stripe = (u64)req_offset(req);
stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 3c35b9e..ac4fb64 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -289,6 +289,8 @@ nfs_wait_on_request(struct nfs_page *req)
size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev, struct nfs_page *req)
{
+ if (!prev)
+ return req->wb_bytes;
/*
* FIXME: ideally we should be able to coalesce all requests
* that are not block boundary aligned, but currently this
@@ -368,17 +370,20 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
{
size_t size;

- if (!nfs_match_open_context(req->wb_context, prev->wb_context))
- return false;
- if (req->wb_context->dentry->d_inode->i_flock != NULL &&
- !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context))
- return false;
- if (req->wb_pgbase != 0)
- return false;
- if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return false;
- if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
+ if (prev) {
+ if (!nfs_match_open_context(req->wb_context, prev->wb_context))
+ return false;
+ if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+ !nfs_match_lock_context(req->wb_lock_context,
+ prev->wb_lock_context))
+ return false;
+ if (req->wb_pgbase != 0)
+ return false;
+ if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ return false;
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ return false;
+ }
size = pgio->pg_ops->pg_test(pgio, prev, req);
WARN_ON_ONCE(size && size != req->wb_bytes);
return size > 0;
@@ -395,17 +400,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
+ struct nfs_page *prev = NULL;
if (desc->pg_count != 0) {
- struct nfs_page *prev;
-
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
desc->pg_base = req->wb_pgbase;
}
+ if (!nfs_can_coalesce_requests(prev, req, desc))
+ return 0;
nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list);
desc->pg_count += req->wb_bytes;
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:17

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

Since the ability to split pages into subpage requests has been added,
nfs_pgio_header->rpc_list only ever has one wdata/rdata.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
fs/nfs/read.c | 35 +++++------------------------------
fs/nfs/write.c | 38 +++++++-------------------------------
include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
4 files changed, 45 insertions(+), 104 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7c89385..3b3ec46 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}

static void
-pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr, int how)
{
- struct nfs_write_data *data;
+ struct nfs_write_data *data = hdr->data.write;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;

desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_write_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_write_through_mds(desc, data);
pnfs_put_lseg(lseg);
}

@@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
+ pnfs_do_write(desc, hdr, desc->pg_ioflags);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
@@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
}

static void
-pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
- struct nfs_read_data *data;
+ struct nfs_read_data *data = hdr->data.read;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;

desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_read_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_read_through_mds(desc, data);
pnfs_put_lseg(lseg);
}

@@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_reads(desc, &hdr->rpc_list);
+ pnfs_do_read(desc, hdr);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index daeff0c..c6b7dd0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
struct nfs_pgio_header *hdr = &rhdr->header;

INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
spin_lock_init(&hdr->lock);
atomic_set(&hdr->refcnt, 0);
}
@@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
}

-static int
-nfs_do_multiple_reads(struct list_head *head,
- const struct rpc_call_ops *call_ops)
-{
- struct nfs_read_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_read(data, call_ops);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
static void
nfs_async_read_error(struct list_head *head)
{
@@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_read_data, list);
- list_del(&data->list);
- nfs_readdata_release(data);
- }
+ nfs_readdata_release(hdr->data.read);
+ hdr->data.read = NULL;
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
}

@@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
}

nfs_read_rpcsetup(data, desc->pg_count, 0);
- list_add(&data->list, &hdr->rpc_list);
+ WARN_ON_ONCE(hdr->data.read);
+ hdr->data.read = data;
desc->pg_rpc_callops = &nfs_read_common_ops;
return 0;
}
@@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
atomic_inc(&hdr->refcnt);
ret = nfs_generic_pagein(desc, hdr);
if (ret == 0)
- ret = nfs_do_multiple_reads(&hdr->rpc_list,
- desc->pg_rpc_callops);
+ ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f40db93..cd24a14 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)

memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
spin_lock_init(&hdr->lock);
atomic_set(&hdr->refcnt, 0);
hdr->verf = &p->verf;
@@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
}

-static int nfs_do_multiple_writes(struct list_head *head,
- const struct rpc_call_ops *call_ops,
- int how)
-{
- struct nfs_write_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_write(data, call_ops, how);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
/* If a nfs_flush_* function fails, it should remove reqs from @head and
* call this on each, which will prepare them to be retried on next
* writeback using standard nfs.
@@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_write_data, list);
- list_del(&data->list);
- nfs_writedata_release(data);
- }
+ nfs_writedata_release(hdr->data.write);
+ hdr->data.write = NULL;
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
}

@@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,

/* Set up the argument struct */
nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
+ WARN_ON_ONCE(hdr->data.write);
+ hdr->data.write = data;
desc->pg_rpc_callops = &nfs_write_common_ops;
return 0;
}
@@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
atomic_inc(&hdr->refcnt);
ret = nfs_generic_flush(desc, hdr);
if (ret == 0)
- ret = nfs_do_multiple_writes(&hdr->rpc_list,
- desc->pg_rpc_callops,
- desc->pg_ioflags);
+ ret = nfs_do_write(hdr->data.write,
+ desc->pg_rpc_callops,
+ desc->pg_ioflags);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6fb5b23..239274d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1266,7 +1266,6 @@ struct nfs_page_array {

struct nfs_read_data {
struct nfs_pgio_header *header;
- struct list_head list;
struct rpc_task task;
struct nfs_fattr fattr; /* fattr storage */
struct nfs_readargs args;
@@ -1278,6 +1277,20 @@ struct nfs_read_data {
struct nfs_client *ds_clp; /* pNFS data server */
};

+struct nfs_write_data {
+ struct nfs_pgio_header *header;
+ struct rpc_task task;
+ struct nfs_fattr fattr;
+ struct nfs_writeverf verf;
+ struct nfs_writeargs args; /* argument struct */
+ struct nfs_writeres res; /* result struct */
+ unsigned long timestamp; /* For lease renewal */
+ int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
+ __u64 mds_offset; /* Filelayout dense stripe */
+ struct nfs_page_array pages;
+ struct nfs_client *ds_clp; /* pNFS data server */
+};
+
/* used as flag bits in nfs_pgio_header */
enum {
NFS_IOHDR_ERROR = 0,
@@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
struct inode *inode;
struct rpc_cred *cred;
struct list_head pages;
- struct list_head rpc_list;
+ union {
+ struct nfs_read_data *read;
+ struct nfs_write_data *write;
+ } data;
atomic_t refcnt;
struct nfs_page *req;
struct nfs_writeverf *verf;
@@ -1315,21 +1331,6 @@ struct nfs_read_header {
struct nfs_read_data rpc_data;
};

-struct nfs_write_data {
- struct nfs_pgio_header *header;
- struct list_head list;
- struct rpc_task task;
- struct nfs_fattr fattr;
- struct nfs_writeverf verf;
- struct nfs_writeargs args; /* argument struct */
- struct nfs_writeres res; /* result struct */
- unsigned long timestamp; /* For lease renewal */
- int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
- __u64 mds_offset; /* Filelayout dense stripe */
- struct nfs_page_array pages;
- struct nfs_client *ds_clp; /* pNFS data server */
-};
-
struct nfs_write_header {
struct nfs_pgio_header header;
struct nfs_write_data rpc_data;
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:29:12

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 09/17] pnfs: clean up filelayout_alloc_commit_info

Remove unneeded else statement and clean up how commit info
dataserver buckets are replaced.

Suggested-by: Trond Myklebust <[email protected]>
Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/nfs4filelayout.c | 48 ++++++++++++++++++++++++++++--------------------
1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 8f27847..3b32c95 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -850,11 +850,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
struct pnfs_commit_bucket *buckets;
- int size;
+ int size, i;

if (fl->commit_through_mds)
return 0;
- if (cinfo->ds->nbuckets != 0) {
+
+ size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ if (cinfo->ds->nbuckets >= size) {
/* This assumes there is only one IOMODE_RW lseg. What
* we really want to do is have a layout_hdr level
* dictionary of <multipath_list4, fh> keys, each
@@ -864,30 +868,34 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
return 0;
}

- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
gfp_flags);
if (!buckets)
return -ENOMEM;
- else {
- int i;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&buckets[i].written);
+ INIT_LIST_HEAD(&buckets[i].committing);
+ }

- spin_lock(cinfo->lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- }
- }
- spin_unlock(cinfo->lock);
- return 0;
+ spin_lock(cinfo->lock);
+ if (cinfo->ds->nbuckets >= size)
+ goto out;
+ for (i = 0; i < cinfo->ds->nbuckets; i++) {
+ list_splice(&cinfo->ds->buckets[i].written,
+ &buckets[i].written);
+ list_splice(&cinfo->ds->buckets[i].committing,
+ &buckets[i].committing);
+ buckets[i].direct_verf.committed =
+ cinfo->ds->buckets[i].direct_verf.committed;
+ buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
+ buckets[i].clseg = cinfo->ds->buckets[i].clseg;
}
+ swap(cinfo->ds->buckets, buckets);
+ cinfo->ds->nbuckets = size;
+out:
+ spin_unlock(cinfo->lock);
+ kfree(buckets);
+ return 0;
}

static struct pnfs_layout_segment *
--
1.8.5.2 (Apple Git-48)


2014-04-24 16:15:11

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Apr 24, 2014, at 11:45 AM, Jeff Layton <[email protected]> wrote:

> On Thu, 24 Apr 2014 11:23:19 -0400
> Weston Andros Adamson <[email protected]> wrote:
>
>> On Apr 24, 2014, at 10:50 AM, Jeff Layton <[email protected]> wrote:
>>
>>> On Tue, 22 Apr 2014 17:29:13 -0400
>>> Weston Andros Adamson <[email protected]> wrote:
>>>
>>>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
>>>> that all reference the same page. This gives nfs read and write paths
>>>> the ability to account for sub-page regions independently. This
>>>> somewhat follows the design of struct buffer_head's sub-page
>>>> accounting.
>>>>
>>>> Only "head" requests are ever added/removed from the inode list in
>>>> the buffered write path. "head" and "sub" requests are treated the
>>>> same through the read path and the rest of the write/commit path.
>>>> Requests are given an extra reference across the life of the list.
>>>>
>>>> Page groups are never rejoined after being split. If the read/write
>>>> request fails and the client falls back to another path (ie revert
>>>> to MDS in PNFS case), the already split requests are pushed through
>>>> the recoalescing code again, which may split them further and then
>>>> coalesce them into properly sized requests on the wire. Fragmentation
>>>> shouldn't be a problem with the current design, because we flush all
>>>> requests in page group when a non-contiguous request is added, so
>>>> the only time resplitting should occur is on a resend of a read or
>>>> write.
>>>>
>>>> This patch lays the groundwork for sub-page splitting, but does not
>>>> actually do any splitting. For now all page groups have one request
>>>> as pg_test functions don't yet split pages. There are several related
>>>> patches that are needed to support multiple requests per page group.
>>>>
>>>> Signed-off-by: Weston Andros Adamson <[email protected]>
>>>> ---
>>>> fs/nfs/direct.c | 7 +-
>>>> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
>>>> fs/nfs/read.c | 4 +-
>>>> fs/nfs/write.c | 12 ++-
>>>> include/linux/nfs_page.h | 12 ++-
>>>> 5 files changed, 231 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>>>> index a0c30c5..9d968ca 100644
>>>> --- a/fs/nfs/direct.c
>>>> +++ b/fs/nfs/direct.c
>>>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
>>>> struct nfs_page *req;
>>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>>> /* XXX do we need to do the eof zeroing found in async_filler? */
>>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>>> pgbase, req_len);
>>>> if (IS_ERR(req)) {
>>>> result = PTR_ERR(req);
>>>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
>>>> struct nfs_page *req;
>>>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>>>
>>>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>>>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>>> pgbase, req_len);
>>>> if (IS_ERR(req)) {
>>>> result = PTR_ERR(req);
>>>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>>> spin_unlock(&dreq->lock);
>>>>
>>>> while (!list_empty(&hdr->pages)) {
>>>> + bool do_destroy = true;
>>>> +
>>>> req = nfs_list_entry(hdr->pages.next);
>>>> nfs_list_remove_request(req);
>>>> switch (bit) {
>>>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>>> case NFS_IOHDR_NEED_COMMIT:
>>>> kref_get(&req->wb_kref);
>>>> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
>>>> + do_destroy = false;
>>>> }
>>>> nfs_unlock_and_release_request(req);
>>>> }
>>>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>>>> index ac4fb64..8cb8e14 100644
>>>> --- a/fs/nfs/pagelist.c
>>>> +++ b/fs/nfs/pagelist.c
>>>> @@ -26,6 +26,8 @@
>>>>
>>>> static struct kmem_cache *nfs_page_cachep;
>>>>
>>>> +static void nfs_free_request(struct nfs_page *);
>>>> +
>>>> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
>>>> {
>>>> p->npages = pagecount;
>>>> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>>> return __nfs_iocounter_wait(c);
>>>> }
>>>>
>>>> +/*
>>>> + * nfs_page_group_lock - lock the head of the page group
>>>> + * @req - request in group that is to be locked
>>>> + *
>>>> + * this lock must be held if modifying the page group list
>>>> + */
>>>> +void
>>>> +nfs_page_group_lock(struct nfs_page *req)
>>>> +{
>>>> + struct nfs_page *head = req->wb_head;
>>>> + int err = -EAGAIN;
>>>> +
>>>> + WARN_ON_ONCE(head != head->wb_head);
>>>> +
>>>> + while (err)
>>>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
>>>> + nfs_wait_bit_killable, TASK_KILLABLE);
>>>> +}
>>>> +
>>>> +/*
>>>> + * nfs_page_group_unlock - unlock the head of the page group
>>>> + * @req - request in group that is to be unlocked
>>>> + */
>>>> +void
>>>> +nfs_page_group_unlock(struct nfs_page *req)
>>>> +{
>>>> + struct nfs_page *head = req->wb_head;
>>>> +
>>>> + WARN_ON_ONCE(head != head->wb_head);
>>>> +
>>>> + smp_mb__before_clear_bit();
>>>> + clear_bit(PG_HEADLOCK, &head->wb_flags);
>>>> + smp_mb__after_clear_bit();
>>>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
>>>> +}
>>>> +
>>>> +/*
>>>> + * nfs_page_group_sync_on_bit_locked
>>>> + *
>>>> + * must be called with page group lock held
>>>> + */
>>>> +static bool
>>>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
>>>> +{
>>>> + struct nfs_page *head = req->wb_head;
>>>> + struct nfs_page *tmp;
>>>> +
>>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
>>>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
>>>> +
>>>> + tmp = req->wb_this_page;
>>>> + while (tmp != req) {
>>>> + if (!test_bit(bit, &tmp->wb_flags))
>>>> + return false;
>>>> + tmp = tmp->wb_this_page;
>>>> + }
>>>> +
>>>> + /* true! reset all bits */
>>>> + tmp = req;
>>>> + do {
>>>> + clear_bit(bit, &tmp->wb_flags);
>>>> + tmp = tmp->wb_this_page;
>>>> + } while (tmp != req);
>>>> +
>>>> + return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
>>>> + * return true if the bit is set for all requests in page group
>>>> + * @req - request in page group
>>>> + * @bit - PG_* bit that is used to sync page group
>>>> + */
>>>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
>>>> +{
>>>> + bool ret;
>>>> +
>>>> + nfs_page_group_lock(req);
>>>> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
>>>> + nfs_page_group_unlock(req);
>>>> +
>>>> + return ret;
>>>> +}
>>>> +
>>>> +/*
>>>> + * nfs_page_group_init - Initialize the page group linkage for @req
>>>> + * @req - a new nfs request
>>>> + * @prev - the previous request in page group, or NULL if @req is the first
>>>> + * or only request in the group (the head).
>>>> + */
>>>> +static inline void
>>>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
>>>> +{
>>>> + WARN_ON_ONCE(prev == req);
>>>> +
>>>> + if (!prev) {
>>>> + req->wb_head = req;
>>>> + req->wb_this_page = req;
>>>> + } else {
>>>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
>>>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
>>>> + req->wb_head = prev->wb_head;
>>>> + req->wb_this_page = prev->wb_this_page;
>>>> + prev->wb_this_page = req;
>>>> + }
>>>> +}
>>>> +
>>>> +/*
>>>> + * nfs_page_group_destroy - sync the destruction of page groups
>>>> + * @req - request that no longer needs the page group
>>>> + *
>>>> + * releases the page group reference from each member once all
>>>> + * members have called this function.
>>>> + */
>>>> +static void
>>>> +nfs_page_group_destroy(struct kref *kref)
>>>> +{
>>>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>>> + struct nfs_page *tmp, *next;
>>>> +
>>>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
>>>> + return;
>>>> +
>>>> + tmp = req;
>>>> + do {
>>>> + next = tmp->wb_this_page;
>>>> + /* unlink and free */
>>>> + tmp->wb_this_page = tmp;
>>>> + tmp->wb_head = tmp;
>>>> + nfs_free_request(tmp);
>>>> + tmp = next;
>>>> + } while (tmp != req);
>>>> +}
>>>> +
>>>> /**
>>>> * nfs_create_request - Create an NFS read/write request.
>>>> * @ctx: open context to use
>>>> * @page: page to write
>>>> + * @last: last nfs request created for this page group or NULL if head
>>>> * @offset: starting offset within the page for the write
>>>> * @count: number of bytes to read/write
>>>> *
>>>> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>>> */
>>>> struct nfs_page *
>>>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>>> - unsigned int offset, unsigned int count)
>>>> + struct nfs_page *last, unsigned int offset,
>>>> + unsigned int count)
>>>> {
>>>> struct nfs_page *req;
>>>> struct nfs_lock_context *l_ctx;
>>>> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>>> req->wb_bytes = count;
>>>> req->wb_context = get_nfs_open_context(ctx);
>>>> kref_init(&req->wb_kref);
>>>> + nfs_page_group_init(req, last);
>>>> return req;
>>>> }
>>>>
>>>> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
>>>> }
>>>> }
>>>>
>>>> -
>>>> /**
>>>> * nfs_release_request - Release the count on an NFS read/write request
>>>> * @req: request to release
>>>> *
>>>> * Note: Should never be called with the spinlock held!
>>>> */
>>>> -static void nfs_free_request(struct kref *kref)
>>>> +static void nfs_free_request(struct nfs_page *req)
>>>> {
>>>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>>> + WARN_ON_ONCE(req->wb_this_page != req);
>>>> +
>>>> + /* extra debug: make sure no sync bits are still set */
>>>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
>>>> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
>>>> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
>>>> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
>>>> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>>>>
>>>> /* Release struct file and open context */
>>>> nfs_clear_request(req);
>>>> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>>>>
>>>> void nfs_release_request(struct nfs_page *req)
>>>> {
>>>> - kref_put(&req->wb_kref, nfs_free_request);
>>>> + kref_put(&req->wb_kref, nfs_page_group_destroy);
>>>> }
>>>>
>>>> static int nfs_wait_bit_uninterruptible(void *word)
>>>> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>>>> * @desc: destination io descriptor
>>>> * @req: request
>>>> *
>>>> + * This may split a request into subrequests which are all part of the
>>>> + * same page group.
>>>> + *
>>>> * Returns true if the request 'req' was successfully coalesced into the
>>>> * existing list of pages 'desc'.
>>>> */
>>>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
>>>> struct nfs_page *req)
>>>> {
>>>> - while (!nfs_pageio_do_add_request(desc, req)) {
>>>> - desc->pg_moreio = 1;
>>>> - nfs_pageio_doio(desc);
>>>> - if (desc->pg_error < 0)
>>>> - return 0;
>>>> - desc->pg_moreio = 0;
>>>> - if (desc->pg_recoalesce)
>>>> - return 0;
>>>> - }
>>>> + struct nfs_page *subreq;
>>>> + unsigned int bytes_left = 0;
>>>> + unsigned int offset, pgbase;
>>>> +
>>>> + nfs_page_group_lock(req);
>>>> +
>>>> + subreq = req;
>>>> + bytes_left = subreq->wb_bytes;
>>>> + offset = subreq->wb_offset;
>>>> + pgbase = subreq->wb_pgbase;
>>>> +
>>>> + do {
>>>> + if (!nfs_pageio_do_add_request(desc, subreq)) {
>>>> + /* make sure pg_test call(s) did nothing */
>>>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
>>>> + WARN_ON_ONCE(subreq->wb_offset != offset);
>>>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
>>>> +
>>>> + nfs_page_group_unlock(req);
>>>> + desc->pg_moreio = 1;
>>>> + nfs_pageio_doio(desc);
>>>> + if (desc->pg_error < 0)
>>>> + return 0;
>>>> + desc->pg_moreio = 0;
>>>> + if (desc->pg_recoalesce)
>>>> + return 0;
>>>> + /* retry add_request for this subreq */
>>>> + nfs_page_group_lock(req);
>>>> + continue;
>>>> + }
>>>> +
>>>> + /* check for buggy pg_test call(s) */
>>>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
>>>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
>>>> + WARN_ON_ONCE(subreq->wb_bytes == 0);
>>>> +
>>>> + bytes_left -= subreq->wb_bytes;
>>>> + offset += subreq->wb_bytes;
>>>> + pgbase += subreq->wb_bytes;
>>>> +
>>>> + if (bytes_left) {
>>>> + subreq = nfs_create_request(req->wb_context,
>>>> + req->wb_page,
>>>> + subreq, pgbase, bytes_left);
>>>> + nfs_lock_request(subreq);
>>>> + subreq->wb_offset = offset;
>>>> + subreq->wb_index = req->wb_index;
>>>> + }
>>>> + } while (bytes_left > 0);
>>>> +
>>>> + nfs_page_group_unlock(req);
>>>> return 1;
>>>> }
>>>>
>>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>>> index 95a0855..ee0a3cd 100644
>>>> --- a/fs/nfs/read.c
>>>> +++ b/fs/nfs/read.c
>>>> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>>> len = nfs_page_length(page);
>>>> if (len == 0)
>>>> return nfs_return_empty_page(page);
>>>> - new = nfs_create_request(ctx, page, 0, len);
>>>> + new = nfs_create_request(ctx, page, NULL, 0, len);
>>>> if (IS_ERR(new)) {
>>>> unlock_page(page);
>>>> return PTR_ERR(new);
>>>> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
>>>> if (len == 0)
>>>> return nfs_return_empty_page(page);
>>>>
>>>> - new = nfs_create_request(desc->ctx, page, 0, len);
>>>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
>>>> if (IS_ERR(new))
>>>> goto out_error;
>>>>
>>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>>> index ca20ec7..d1453f2 100644
>>>> --- a/fs/nfs/write.c
>>>> +++ b/fs/nfs/write.c
>>>> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
>>>> }
>>>> nfsi->npages--;
>>>> spin_unlock(&inode->i_lock);
>>>> - nfs_release_request(req);
>>>> + nfs_release_request(head);
>>>> }
>>>>
>>>> static void
>>>> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
>>>> {
>>>> struct nfs_commit_info cinfo;
>>>> unsigned long bytes = 0;
>>>> + bool do_destroy;
>>>>
>>>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
>>>> goto out;
>>>> @@ -654,6 +655,7 @@ remove_req:
>>>> next:
>>>> nfs_unlock_request(req);
>>>> nfs_end_page_writeback(req->wb_page);
>>>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
>>>> nfs_release_request(req);
>>>> }
>>>> out:
>>>> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
>>>> if (req == NULL)
>>>> goto out_unlock;
>>>>
>>>> + /* should be handled by nfs_flush_incompatible */
>>>> + WARN_ON_ONCE(req->wb_head != req);
>>>> + WARN_ON_ONCE(req->wb_this_page != req);
>>>> +
>>>> rqend = req->wb_offset + req->wb_bytes;
>>>> /*
>>>> * Tell the caller to flush out the request if
>>>> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
>>>> req = nfs_try_to_update_request(inode, page, offset, bytes);
>>>> if (req != NULL)
>>>> goto out;
>>>> - req = nfs_create_request(ctx, page, offset, bytes);
>>>> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
>>>> if (IS_ERR(req))
>>>> goto out;
>>>> nfs_inode_add_request(inode, req);
>>>> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>>>> return 0;
>>>> l_ctx = req->wb_lock_context;
>>>> do_flush = req->wb_page != page || req->wb_context != ctx;
>>>> + /* for now, flush if more than 1 request in page_group */
>>>> + do_flush |= req->wb_this_page != req;
>>>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
>>>> do_flush |= l_ctx->lockowner.l_owner != current->files
>>>> || l_ctx->lockowner.l_pid != current->tgid;
>>>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>>>> index 214e098..1fb161b 100644
>>>> --- a/include/linux/nfs_page.h
>>>> +++ b/include/linux/nfs_page.h
>>>> @@ -26,6 +26,8 @@ enum {
>>>> PG_MAPPED, /* page private set for buffered io */
>>>> PG_CLEAN, /* write succeeded */
>>>> PG_COMMIT_TO_DS, /* used by pnfs layouts */
>>>> + PG_HEADLOCK, /* page group lock of wb_head */
>>>> + PG_TEARDOWN, /* page group sync for destroy */
>>>> };
>>>>
>>>> struct nfs_inode;
>>>> @@ -41,6 +43,8 @@ struct nfs_page {
>>>> struct kref wb_kref; /* reference count */
>>>> unsigned long wb_flags;
>>>> struct nfs_write_verifier wb_verf; /* Commit cookie */
>>>> + struct nfs_page *wb_this_page; /* list of reqs for this page */
>>>> + struct nfs_page *wb_head; /* head pointer for req list */
>>>
>>> Hmm ok, so to make sure I understand...
>>>
>>> So page->private will point to the "head" req (struct page_private).
>>
>> Only in the buffered write case. Page->private is not set for read path / direct i/o path.
>>
>>> Then we'll have a singly-linked list of reqs hanging off of
>>> wb_this_page. Is that right?
>>>
>>> If so, then it seems like it would be clearer to use a standard
>>> list_head here. If you need to get to the wb_head, you could always do
>>> something like this:
>>>
>>> list_first_entry(&req->wb_page->wb_this_page);
>>
>> Well, wb_page is a struct page and doesn't have wb_this_page (which is in struct
>> nfs_page), but I see where you're going with this.
>>
>
> Doh, right! Sorry, I threw that together in haste, but you get the
> idea. I was thinking you could go back to the page and dereference
> ->private.
>
>> A strategy like this only works if we always have page->private pointing to the head
>> request. We chose not to go that way because it messes with the buffered
>> write path's setting / clearing of page private which interacts with the swappable
>> nfs pages code that everyone seems to be afraid to touch ;)
>>
>> So we decided to go this route (not messing with page_private) as a first step - we
>> certainly could add it later, but the current approach makes things less complex.
>>
>
> Ok, that makes sense. Thanks...
>
>>>
>>> ...and could even turn that into a macro or static inline for some
>>> syntactic sugar. It's a little more pointer chasing to find the head,
>>> but it seems like that would be clearer than using yet another
>>> linked-list implementation.
>>
>> So, I'm not against using list_head... I didn't go that route initially because I was:
>>
>> 1) following the buffer_head example, which rolls its own list
>>
>
> I wouldn't be surprised if the buffer_head code predates the standard
> linked-list macros, so that probably explains why they did it that way.
> The file locking code has a similar construct in inode->i_flock list.

AFAIK the sub-page functionality was added somewhat recently.

>
>> 2) trying to grow nfs_page as little as possible - but we might have room within
>> the allocator bucket it currently lives in?
>>
>
> nfs_page comes out of a dedicated slabcache, so that probably won't be the case.

Ah, right!

>
>> 3) not sure list_head is suitable for a circular list (I haven't ever looked into it).
>>
>> and until we have a way to find the head request (via page private, etc) without
>> walking the circular list (chicken / egg problem needing to grab head lock before walking
>> list to find the head to lock it), we'll still need the head pointer.
>>
>> Thoughts?
>>
>> -dros
>>
>
> If you can't rely on page->private pointing to the request, then that
> does make it tough to do what I was suggesting. struct list_head lists
> are doubly-linked and circular by nature, so that does seem to be a
> natural fit for what you're trying to do.

Oh I see -- you're totally right about list_head being circular, one just has
to call for_each on whatever head they wish to start from.

>
> The only problem is that struct list_head is two pointers instead of
> one, so it's not going to be as space-efficient as what you're doing
> here. If that's a large concern then you may have no choice but to do
> this after all.

Right. How much do we care about an extra pointer here? It seems to me
that we should try to keep it as small as possible - I know Trond has been unwilling
to add members to rpc_task (for example) unless absolutely necessary and there will
be at least one (if not more) nfs_page structures per rpc_task.

One immediate takeaway: I need to add much better comments about this.

As far as eventually removing the wb_head pointer, it gets really ugly to do without
changing the buffered write path (and swappable page semantics) because page_group
operations happen *after* nfs_inode_remove_request() clears page_private (syncing the
destruction of the page group). This means that nfs_release_request and
nfs_unlock_and_release_request will both have to be passed a previously cached head
pointer. yuck.

-dros

>
>>>
>>>> };
>>>>
>>>> struct nfs_pageio_descriptor;
>>>> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
>>>>
>>>> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
>>>> struct page *page,
>>>> + struct nfs_page *last,
>>>> unsigned int offset,
>>>> unsigned int count);
>>>> -extern void nfs_release_request(struct nfs_page *req);
>>>> +extern void nfs_release_request(struct nfs_page *);
>>>>
>>>>
>>>> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>>>> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
>>>> struct nfs_page *req);
>>>> extern int nfs_wait_on_request(struct nfs_page *);
>>>> extern void nfs_unlock_request(struct nfs_page *req);
>>>> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
>>>> +extern void nfs_unlock_and_release_request(struct nfs_page *);
>>>> +extern void nfs_page_group_lock(struct nfs_page *);
>>>> +extern void nfs_page_group_unlock(struct nfs_page *);
>>>> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>>>>
>>>> /*
>>>> * Lock the page of an asynchronous request
>>>
>>>
>>> --
>>> Jeff Layton <[email protected]>
>>
>
>
> --
> Jeff Layton <[email protected]>


2014-04-24 14:50:04

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Tue, 22 Apr 2014 17:29:13 -0400
Weston Andros Adamson <[email protected]> wrote:

> Add "page groups" - a circular list of nfs requests (struct nfs_page)
> that all reference the same page. This gives nfs read and write paths
> the ability to account for sub-page regions independently. This
> somewhat follows the design of struct buffer_head's sub-page
> accounting.
>
> Only "head" requests are ever added/removed from the inode list in
> the buffered write path. "head" and "sub" requests are treated the
> same through the read path and the rest of the write/commit path.
> Requests are given an extra reference across the life of the list.
>
> Page groups are never rejoined after being split. If the read/write
> request fails and the client falls back to another path (ie revert
> to MDS in PNFS case), the already split requests are pushed through
> the recoalescing code again, which may split them further and then
> coalesce them into properly sized requests on the wire. Fragmentation
> shouldn't be a problem with the current design, because we flush all
> requests in page group when a non-contiguous request is added, so
> the only time resplitting should occur is on a resend of a read or
> write.
>
> This patch lays the groundwork for sub-page splitting, but does not
> actually do any splitting. For now all page groups have one request
> as pg_test functions don't yet split pages. There are several related
> patches that are needed to support multiple requests per page group.
>
> Signed-off-by: Weston Andros Adamson <[email protected]>
> ---
> fs/nfs/direct.c | 7 +-
> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
> fs/nfs/read.c | 4 +-
> fs/nfs/write.c | 12 ++-
> include/linux/nfs_page.h | 12 ++-
> 5 files changed, 231 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index a0c30c5..9d968ca 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
> struct nfs_page *req;
> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> /* XXX do we need to do the eof zeroing found in async_filler? */
> - req = nfs_create_request(dreq->ctx, pagevec[i],
> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> pgbase, req_len);
> if (IS_ERR(req)) {
> result = PTR_ERR(req);
> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
> struct nfs_page *req;
> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>
> - req = nfs_create_request(dreq->ctx, pagevec[i],
> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> pgbase, req_len);
> if (IS_ERR(req)) {
> result = PTR_ERR(req);
> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> spin_unlock(&dreq->lock);
>
> while (!list_empty(&hdr->pages)) {
> + bool do_destroy = true;
> +
> req = nfs_list_entry(hdr->pages.next);
> nfs_list_remove_request(req);
> switch (bit) {
> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> case NFS_IOHDR_NEED_COMMIT:
> kref_get(&req->wb_kref);
> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
> + do_destroy = false;
> }
> nfs_unlock_and_release_request(req);
> }
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index ac4fb64..8cb8e14 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -26,6 +26,8 @@
>
> static struct kmem_cache *nfs_page_cachep;
>
> +static void nfs_free_request(struct nfs_page *);
> +
> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
> {
> p->npages = pagecount;
> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> return __nfs_iocounter_wait(c);
> }
>
> +/*
> + * nfs_page_group_lock - lock the head of the page group
> + * @req - request in group that is to be locked
> + *
> + * this lock must be held if modifying the page group list
> + */
> +void
> +nfs_page_group_lock(struct nfs_page *req)
> +{
> + struct nfs_page *head = req->wb_head;
> + int err = -EAGAIN;
> +
> + WARN_ON_ONCE(head != head->wb_head);
> +
> + while (err)
> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
> + nfs_wait_bit_killable, TASK_KILLABLE);
> +}
> +
> +/*
> + * nfs_page_group_unlock - unlock the head of the page group
> + * @req - request in group that is to be unlocked
> + */
> +void
> +nfs_page_group_unlock(struct nfs_page *req)
> +{
> + struct nfs_page *head = req->wb_head;
> +
> + WARN_ON_ONCE(head != head->wb_head);
> +
> + smp_mb__before_clear_bit();
> + clear_bit(PG_HEADLOCK, &head->wb_flags);
> + smp_mb__after_clear_bit();
> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
> +}
> +
> +/*
> + * nfs_page_group_sync_on_bit_locked
> + *
> + * must be called with page group lock held
> + */
> +static bool
> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> +{
> + struct nfs_page *head = req->wb_head;
> + struct nfs_page *tmp;
> +
> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
> +
> + tmp = req->wb_this_page;
> + while (tmp != req) {
> + if (!test_bit(bit, &tmp->wb_flags))
> + return false;
> + tmp = tmp->wb_this_page;
> + }
> +
> + /* true! reset all bits */
> + tmp = req;
> + do {
> + clear_bit(bit, &tmp->wb_flags);
> + tmp = tmp->wb_this_page;
> + } while (tmp != req);
> +
> + return true;
> +}
> +
> +/*
> + * nfs_page_group_sync_on_bit - set bit on current request, but only
> + * return true if the bit is set for all requests in page group
> + * @req - request in page group
> + * @bit - PG_* bit that is used to sync page group
> + */
> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
> +{
> + bool ret;
> +
> + nfs_page_group_lock(req);
> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
> + nfs_page_group_unlock(req);
> +
> + return ret;
> +}
> +
> +/*
> + * nfs_page_group_init - Initialize the page group linkage for @req
> + * @req - a new nfs request
> + * @prev - the previous request in page group, or NULL if @req is the first
> + * or only request in the group (the head).
> + */
> +static inline void
> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
> +{
> + WARN_ON_ONCE(prev == req);
> +
> + if (!prev) {
> + req->wb_head = req;
> + req->wb_this_page = req;
> + } else {
> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
> + req->wb_head = prev->wb_head;
> + req->wb_this_page = prev->wb_this_page;
> + prev->wb_this_page = req;
> + }
> +}
> +
> +/*
> + * nfs_page_group_destroy - sync the destruction of page groups
> + * @req - request that no longer needs the page group
> + *
> + * releases the page group reference from each member once all
> + * members have called this function.
> + */
> +static void
> +nfs_page_group_destroy(struct kref *kref)
> +{
> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> + struct nfs_page *tmp, *next;
> +
> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
> + return;
> +
> + tmp = req;
> + do {
> + next = tmp->wb_this_page;
> + /* unlink and free */
> + tmp->wb_this_page = tmp;
> + tmp->wb_head = tmp;
> + nfs_free_request(tmp);
> + tmp = next;
> + } while (tmp != req);
> +}
> +
> /**
> * nfs_create_request - Create an NFS read/write request.
> * @ctx: open context to use
> * @page: page to write
> + * @last: last nfs request created for this page group or NULL if head
> * @offset: starting offset within the page for the write
> * @count: number of bytes to read/write
> *
> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> */
> struct nfs_page *
> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> - unsigned int offset, unsigned int count)
> + struct nfs_page *last, unsigned int offset,
> + unsigned int count)
> {
> struct nfs_page *req;
> struct nfs_lock_context *l_ctx;
> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> req->wb_bytes = count;
> req->wb_context = get_nfs_open_context(ctx);
> kref_init(&req->wb_kref);
> + nfs_page_group_init(req, last);
> return req;
> }
>
> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
> }
> }
>
> -
> /**
> * nfs_release_request - Release the count on an NFS read/write request
> * @req: request to release
> *
> * Note: Should never be called with the spinlock held!
> */
> -static void nfs_free_request(struct kref *kref)
> +static void nfs_free_request(struct nfs_page *req)
> {
> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> + WARN_ON_ONCE(req->wb_this_page != req);
> +
> + /* extra debug: make sure no sync bits are still set */
> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>
> /* Release struct file and open context */
> nfs_clear_request(req);
> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>
> void nfs_release_request(struct nfs_page *req)
> {
> - kref_put(&req->wb_kref, nfs_free_request);
> + kref_put(&req->wb_kref, nfs_page_group_destroy);
> }
>
> static int nfs_wait_bit_uninterruptible(void *word)
> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
> * @desc: destination io descriptor
> * @req: request
> *
> + * This may split a request into subrequests which are all part of the
> + * same page group.
> + *
> * Returns true if the request 'req' was successfully coalesced into the
> * existing list of pages 'desc'.
> */
> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
> struct nfs_page *req)
> {
> - while (!nfs_pageio_do_add_request(desc, req)) {
> - desc->pg_moreio = 1;
> - nfs_pageio_doio(desc);
> - if (desc->pg_error < 0)
> - return 0;
> - desc->pg_moreio = 0;
> - if (desc->pg_recoalesce)
> - return 0;
> - }
> + struct nfs_page *subreq;
> + unsigned int bytes_left = 0;
> + unsigned int offset, pgbase;
> +
> + nfs_page_group_lock(req);
> +
> + subreq = req;
> + bytes_left = subreq->wb_bytes;
> + offset = subreq->wb_offset;
> + pgbase = subreq->wb_pgbase;
> +
> + do {
> + if (!nfs_pageio_do_add_request(desc, subreq)) {
> + /* make sure pg_test call(s) did nothing */
> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
> + WARN_ON_ONCE(subreq->wb_offset != offset);
> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
> +
> + nfs_page_group_unlock(req);
> + desc->pg_moreio = 1;
> + nfs_pageio_doio(desc);
> + if (desc->pg_error < 0)
> + return 0;
> + desc->pg_moreio = 0;
> + if (desc->pg_recoalesce)
> + return 0;
> + /* retry add_request for this subreq */
> + nfs_page_group_lock(req);
> + continue;
> + }
> +
> + /* check for buggy pg_test call(s) */
> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
> + WARN_ON_ONCE(subreq->wb_bytes == 0);
> +
> + bytes_left -= subreq->wb_bytes;
> + offset += subreq->wb_bytes;
> + pgbase += subreq->wb_bytes;
> +
> + if (bytes_left) {
> + subreq = nfs_create_request(req->wb_context,
> + req->wb_page,
> + subreq, pgbase, bytes_left);
> + nfs_lock_request(subreq);
> + subreq->wb_offset = offset;
> + subreq->wb_index = req->wb_index;
> + }
> + } while (bytes_left > 0);
> +
> + nfs_page_group_unlock(req);
> return 1;
> }
>
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 95a0855..ee0a3cd 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> len = nfs_page_length(page);
> if (len == 0)
> return nfs_return_empty_page(page);
> - new = nfs_create_request(ctx, page, 0, len);
> + new = nfs_create_request(ctx, page, NULL, 0, len);
> if (IS_ERR(new)) {
> unlock_page(page);
> return PTR_ERR(new);
> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
> if (len == 0)
> return nfs_return_empty_page(page);
>
> - new = nfs_create_request(desc->ctx, page, 0, len);
> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
> if (IS_ERR(new))
> goto out_error;
>
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index ca20ec7..d1453f2 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> }
> nfsi->npages--;
> spin_unlock(&inode->i_lock);
> - nfs_release_request(req);
> + nfs_release_request(head);
> }
>
> static void
> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
> {
> struct nfs_commit_info cinfo;
> unsigned long bytes = 0;
> + bool do_destroy;
>
> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
> goto out;
> @@ -654,6 +655,7 @@ remove_req:
> next:
> nfs_unlock_request(req);
> nfs_end_page_writeback(req->wb_page);
> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
> nfs_release_request(req);
> }
> out:
> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
> if (req == NULL)
> goto out_unlock;
>
> + /* should be handled by nfs_flush_incompatible */
> + WARN_ON_ONCE(req->wb_head != req);
> + WARN_ON_ONCE(req->wb_this_page != req);
> +
> rqend = req->wb_offset + req->wb_bytes;
> /*
> * Tell the caller to flush out the request if
> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
> req = nfs_try_to_update_request(inode, page, offset, bytes);
> if (req != NULL)
> goto out;
> - req = nfs_create_request(ctx, page, offset, bytes);
> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
> if (IS_ERR(req))
> goto out;
> nfs_inode_add_request(inode, req);
> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
> return 0;
> l_ctx = req->wb_lock_context;
> do_flush = req->wb_page != page || req->wb_context != ctx;
> + /* for now, flush if more than 1 request in page_group */
> + do_flush |= req->wb_this_page != req;
> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
> do_flush |= l_ctx->lockowner.l_owner != current->files
> || l_ctx->lockowner.l_pid != current->tgid;
> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> index 214e098..1fb161b 100644
> --- a/include/linux/nfs_page.h
> +++ b/include/linux/nfs_page.h
> @@ -26,6 +26,8 @@ enum {
> PG_MAPPED, /* page private set for buffered io */
> PG_CLEAN, /* write succeeded */
> PG_COMMIT_TO_DS, /* used by pnfs layouts */
> + PG_HEADLOCK, /* page group lock of wb_head */
> + PG_TEARDOWN, /* page group sync for destroy */
> };
>
> struct nfs_inode;
> @@ -41,6 +43,8 @@ struct nfs_page {
> struct kref wb_kref; /* reference count */
> unsigned long wb_flags;
> struct nfs_write_verifier wb_verf; /* Commit cookie */
> + struct nfs_page *wb_this_page; /* list of reqs for this page */
> + struct nfs_page *wb_head; /* head pointer for req list */

Hmm ok, so to make sure I understand...

So page->private will point to the "head" req (struct page_private).
Then we'll have a singly-linked list of reqs hanging off of
wb_this_page. Is that right?

If so, then it seems like it would be clearer to use a standard
list_head here. If you need to get to the wb_head, you could always do
something like this:

list_first_entry(&req->wb_page->wb_this_page);

...and could even turn that into a macro or static inline for some
syntactic sugar. It's a little more pointer chasing to find the head,
but it seems like that would be clearer than using yet another
linked-list implementation.

> };
>
> struct nfs_pageio_descriptor;
> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
>
> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
> struct page *page,
> + struct nfs_page *last,
> unsigned int offset,
> unsigned int count);
> -extern void nfs_release_request(struct nfs_page *req);
> +extern void nfs_release_request(struct nfs_page *);
>
>
> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> struct nfs_page *req);
> extern int nfs_wait_on_request(struct nfs_page *);
> extern void nfs_unlock_request(struct nfs_page *req);
> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
> +extern void nfs_unlock_and_release_request(struct nfs_page *);
> +extern void nfs_page_group_lock(struct nfs_page *);
> +extern void nfs_page_group_unlock(struct nfs_page *);
> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>
> /*
> * Lock the page of an asynchronous request


--
Jeff Layton <[email protected]>

2014-04-23 14:36:12

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On 04/23/2014 10:31 AM, Weston Andros Adamson wrote:
> On Apr 23, 2014, at 10:16 AM, Anna Schumaker <[email protected]> wrote:
>
>> On 04/22/2014 05:29 PM, Weston Andros Adamson wrote:
>>> Since the ability to split pages into subpage requests has been added,
>>> nfs_pgio_header->rpc_list only ever has one wdata/rdata.
>>>
>>> Signed-off-by: Weston Andros Adamson <[email protected]>
>>> ---
>>> fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
>>> fs/nfs/read.c | 35 +++++------------------------------
>>> fs/nfs/write.c | 38 +++++++-------------------------------
>>> include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
>>> 4 files changed, 45 insertions(+), 104 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index 7c89385..3b3ec46 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>>> }
>>>
>>> static void
>>> -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
>>> +pnfs_do_write(struct nfs_pageio_descriptor *desc,
>>> + struct nfs_pgio_header *hdr, int how)
>>> {
>>> - struct nfs_write_data *data;
>>> + struct nfs_write_data *data = hdr->data.write;
>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>> + enum pnfs_try_status trypnfs;
>>>
>>> desc->pg_lseg = NULL;
>>> - while (!list_empty(head)) {
>>> - enum pnfs_try_status trypnfs;
>>> -
>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>> - list_del_init(&data->list);
>>> -
>>> - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>> - pnfs_write_through_mds(desc, data);
>>> - }
>>> + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>> + pnfs_write_through_mds(desc, data);
>>> pnfs_put_lseg(lseg);
>>> }
>>>
>>> @@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>> pnfs_put_lseg(desc->pg_lseg);
>>> desc->pg_lseg = NULL;
>>> } else
>>> - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
>>> + pnfs_do_write(desc, hdr, desc->pg_ioflags);
>>> if (atomic_dec_and_test(&hdr->refcnt))
>>> hdr->completion_ops->completion(hdr);
>>> return ret;
>>> @@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
>>> }
>>>
>>> static void
>>> -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
>>> +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
>>> {
>>> - struct nfs_read_data *data;
>>> + struct nfs_read_data *data = hdr->data.read;
>>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>>> + enum pnfs_try_status trypnfs;
>>>
>>> desc->pg_lseg = NULL;
>>> - while (!list_empty(head)) {
>>> - enum pnfs_try_status trypnfs;
>>> -
>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>> - list_del_init(&data->list);
>>> -
>>> - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>>> - pnfs_read_through_mds(desc, data);
>>> - }
>>> + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>>> + pnfs_read_through_mds(desc, data);
>>> pnfs_put_lseg(lseg);
>>> }
>>>
>>> @@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>> pnfs_put_lseg(desc->pg_lseg);
>>> desc->pg_lseg = NULL;
>>> } else
>>> - pnfs_do_multiple_reads(desc, &hdr->rpc_list);
>>> + pnfs_do_read(desc, hdr);
>>> if (atomic_dec_and_test(&hdr->refcnt))
>>> hdr->completion_ops->completion(hdr);
>>> return ret;
>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>> index daeff0c..c6b7dd0 100644
>>> --- a/fs/nfs/read.c
>>> +++ b/fs/nfs/read.c
>>> @@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
>>> struct nfs_pgio_header *hdr = &rhdr->header;
>>>
>>> INIT_LIST_HEAD(&hdr->pages);
>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>> spin_lock_init(&hdr->lock);
>>> atomic_set(&hdr->refcnt, 0);
>>> }
>>> @@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
>>> return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
>>> }
>>>
>>> -static int
>>> -nfs_do_multiple_reads(struct list_head *head,
>>> - const struct rpc_call_ops *call_ops)
>>> -{
>>> - struct nfs_read_data *data;
>>> - int ret = 0;
>>> -
>>> - while (!list_empty(head)) {
>>> - int ret2;
>>> -
>>> - data = list_first_entry(head, struct nfs_read_data, list);
>>> - list_del_init(&data->list);
>>> -
>>> - ret2 = nfs_do_read(data, call_ops);
>>> - if (ret == 0)
>>> - ret = ret2;
>>> - }
>>> - return ret;
>>> -}
>>> -
>>> static void
>>> nfs_async_read_error(struct list_head *head)
>>> {
>>> @@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
>>> struct nfs_pgio_header *hdr)
>>> {
>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>> - while (!list_empty(&hdr->rpc_list)) {
>>> - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
>>> - struct nfs_read_data, list);
>>> - list_del(&data->list);
>>> - nfs_readdata_release(data);
>>> - }
>>> + nfs_readdata_release(hdr->data.read);
>>> + hdr->data.read = NULL;
>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>> }
>>>
>>> @@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
>>> }
>>>
>>> nfs_read_rpcsetup(data, desc->pg_count, 0);
>>> - list_add(&data->list, &hdr->rpc_list);
>>> + WARN_ON_ONCE(hdr->data.read);
>>> + hdr->data.read = data;
>>> desc->pg_rpc_callops = &nfs_read_common_ops;
>>> return 0;
>>> }
>>> @@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>>> atomic_inc(&hdr->refcnt);
>>> ret = nfs_generic_pagein(desc, hdr);
>>> if (ret == 0)
>>> - ret = nfs_do_multiple_reads(&hdr->rpc_list,
>>> - desc->pg_rpc_callops);
>>> + ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
>>> if (atomic_dec_and_test(&hdr->refcnt))
>>> hdr->completion_ops->completion(hdr);
>>> return ret;
>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>> index f40db93..cd24a14 100644
>>> --- a/fs/nfs/write.c
>>> +++ b/fs/nfs/write.c
>>> @@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
>>>
>>> memset(p, 0, sizeof(*p));
>>> INIT_LIST_HEAD(&hdr->pages);
>>> - INIT_LIST_HEAD(&hdr->rpc_list);
>>> spin_lock_init(&hdr->lock);
>>> atomic_set(&hdr->refcnt, 0);
>>> hdr->verf = &p->verf;
>>> @@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
>>> return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
>>> }
>>>
>>> -static int nfs_do_multiple_writes(struct list_head *head,
>>> - const struct rpc_call_ops *call_ops,
>>> - int how)
>>> -{
>>> - struct nfs_write_data *data;
>>> - int ret = 0;
>>> -
>>> - while (!list_empty(head)) {
>>> - int ret2;
>>> -
>>> - data = list_first_entry(head, struct nfs_write_data, list);
>>> - list_del_init(&data->list);
>>> -
>>> - ret2 = nfs_do_write(data, call_ops, how);
>>> - if (ret == 0)
>>> - ret = ret2;
>>> - }
>>> - return ret;
>>> -}
>>> -
>>> /* If a nfs_flush_* function fails, it should remove reqs from @head and
>>> * call this on each, which will prepare them to be retried on next
>>> * writeback using standard nfs.
>>> @@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
>>> struct nfs_pgio_header *hdr)
>>> {
>>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>>> - while (!list_empty(&hdr->rpc_list)) {
>>> - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
>>> - struct nfs_write_data, list);
>>> - list_del(&data->list);
>>> - nfs_writedata_release(data);
>>> - }
>>> + nfs_writedata_release(hdr->data.write);
>>> + hdr->data.write = NULL;
>>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>>> }
>>>
>>> @@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>>
>>> /* Set up the argument struct */
>>> nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
>>> - list_add(&data->list, &hdr->rpc_list);
>>> + WARN_ON_ONCE(hdr->data.write);
>>> + hdr->data.write = data;
>>> desc->pg_rpc_callops = &nfs_write_common_ops;
>>> return 0;
>>> }
>>> @@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>>> atomic_inc(&hdr->refcnt);
>>> ret = nfs_generic_flush(desc, hdr);
>>> if (ret == 0)
>>> - ret = nfs_do_multiple_writes(&hdr->rpc_list,
>>> - desc->pg_rpc_callops,
>>> - desc->pg_ioflags);
>>> + ret = nfs_do_write(hdr->data.write,
>>> + desc->pg_rpc_callops,
>>> + desc->pg_ioflags);
>>> if (atomic_dec_and_test(&hdr->refcnt))
>>> hdr->completion_ops->completion(hdr);
>>> return ret;
>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>>> index 6fb5b23..239274d 100644
>>> --- a/include/linux/nfs_xdr.h
>>> +++ b/include/linux/nfs_xdr.h
>>> @@ -1266,7 +1266,6 @@ struct nfs_page_array {
>>>
>>> struct nfs_read_data {
>>> struct nfs_pgio_header *header;
>>> - struct list_head list;
>>> struct rpc_task task;
>>> struct nfs_fattr fattr; /* fattr storage */
>>> struct nfs_readargs args;
>>> @@ -1278,6 +1277,20 @@ struct nfs_read_data {
>>> struct nfs_client *ds_clp; /* pNFS data server */
>>> };
>>>
>>> +struct nfs_write_data {
>>> + struct nfs_pgio_header *header;
>>> + struct rpc_task task;
>>> + struct nfs_fattr fattr;
>>> + struct nfs_writeverf verf;
>>> + struct nfs_writeargs args; /* argument struct */
>>> + struct nfs_writeres res; /* result struct */
>>> + unsigned long timestamp; /* For lease renewal */
>>> + int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
>>> + __u64 mds_offset; /* Filelayout dense stripe */
>>> + struct nfs_page_array pages;
>>> + struct nfs_client *ds_clp; /* pNFS data server */
>>> +};
>>> +
>>> /* used as flag bits in nfs_pgio_header */
>>> enum {
>>> NFS_IOHDR_ERROR = 0,
>>> @@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
>>> struct inode *inode;
>>> struct rpc_cred *cred;
>>> struct list_head pages;
>>> - struct list_head rpc_list;
>>> + union {
>>> + struct nfs_read_data *read;
>>> + struct nfs_write_data *write;
>>> + } data;
>> The first 5 patches in my series makes it so we can share all of these structs. Would it be useful to put those in first?
>>
>> Anna
>>
> Yes, I think it makes sense to stage most (if not all) of your patches first then merge my patches in.
>
> I think I'll just give it a shot and see how bad it is. I need to post a rebased version of my patchset anyway,
> so I'll see if I can also rebase on top of your changes.
>
> Any objections?

No objections! As a reminder, I'm based off of Trond's [testing] branch with two extra pageio cleanups from Christoph. Shoot me an email if you need help!

Anna
>
> -dros
>
>>> atomic_t refcnt;
>>> struct nfs_page *req;
>>> struct nfs_writeverf *verf;
>>> @@ -1315,21 +1331,6 @@ struct nfs_read_header {
>>> struct nfs_read_data rpc_data;
>>> };
>>>
>>> -struct nfs_write_data {
>>> - struct nfs_pgio_header *header;
>>> - struct list_head list;
>>> - struct rpc_task task;
>>> - struct nfs_fattr fattr;
>>> - struct nfs_writeverf verf;
>>> - struct nfs_writeargs args; /* argument struct */
>>> - struct nfs_writeres res; /* result struct */
>>> - unsigned long timestamp; /* For lease renewal */
>>> - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
>>> - __u64 mds_offset; /* Filelayout dense stripe */
>>> - struct nfs_page_array pages;
>>> - struct nfs_client *ds_clp; /* pNFS data server */
>>> -};
>>> -
>>> struct nfs_write_header {
>>> struct nfs_pgio_header header;
>>> struct nfs_write_data rpc_data;

2014-04-24 15:45:10

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page

On Thu, 24 Apr 2014 11:23:19 -0400
Weston Andros Adamson <[email protected]> wrote:

> On Apr 24, 2014, at 10:50 AM, Jeff Layton <[email protected]> wrote:
>
> > On Tue, 22 Apr 2014 17:29:13 -0400
> > Weston Andros Adamson <[email protected]> wrote:
> >
> >> Add "page groups" - a circular list of nfs requests (struct nfs_page)
> >> that all reference the same page. This gives nfs read and write paths
> >> the ability to account for sub-page regions independently. This
> >> somewhat follows the design of struct buffer_head's sub-page
> >> accounting.
> >>
> >> Only "head" requests are ever added/removed from the inode list in
> >> the buffered write path. "head" and "sub" requests are treated the
> >> same through the read path and the rest of the write/commit path.
> >> Requests are given an extra reference across the life of the list.
> >>
> >> Page groups are never rejoined after being split. If the read/write
> >> request fails and the client falls back to another path (ie revert
> >> to MDS in PNFS case), the already split requests are pushed through
> >> the recoalescing code again, which may split them further and then
> >> coalesce them into properly sized requests on the wire. Fragmentation
> >> shouldn't be a problem with the current design, because we flush all
> >> requests in page group when a non-contiguous request is added, so
> >> the only time resplitting should occur is on a resend of a read or
> >> write.
> >>
> >> This patch lays the groundwork for sub-page splitting, but does not
> >> actually do any splitting. For now all page groups have one request
> >> as pg_test functions don't yet split pages. There are several related
> >> patches that are needed to support multiple requests per page group.
> >>
> >> Signed-off-by: Weston Andros Adamson <[email protected]>
> >> ---
> >> fs/nfs/direct.c | 7 +-
> >> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
> >> fs/nfs/read.c | 4 +-
> >> fs/nfs/write.c | 12 ++-
> >> include/linux/nfs_page.h | 12 ++-
> >> 5 files changed, 231 insertions(+), 22 deletions(-)
> >>
> >> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> >> index a0c30c5..9d968ca 100644
> >> --- a/fs/nfs/direct.c
> >> +++ b/fs/nfs/direct.c
> >> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
> >> struct nfs_page *req;
> >> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> >> /* XXX do we need to do the eof zeroing found in async_filler? */
> >> - req = nfs_create_request(dreq->ctx, pagevec[i],
> >> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> >> pgbase, req_len);
> >> if (IS_ERR(req)) {
> >> result = PTR_ERR(req);
> >> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
> >> struct nfs_page *req;
> >> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
> >>
> >> - req = nfs_create_request(dreq->ctx, pagevec[i],
> >> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
> >> pgbase, req_len);
> >> if (IS_ERR(req)) {
> >> result = PTR_ERR(req);
> >> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> >> spin_unlock(&dreq->lock);
> >>
> >> while (!list_empty(&hdr->pages)) {
> >> + bool do_destroy = true;
> >> +
> >> req = nfs_list_entry(hdr->pages.next);
> >> nfs_list_remove_request(req);
> >> switch (bit) {
> >> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
> >> case NFS_IOHDR_NEED_COMMIT:
> >> kref_get(&req->wb_kref);
> >> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
> >> + do_destroy = false;
> >> }
> >> nfs_unlock_and_release_request(req);
> >> }
> >> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> >> index ac4fb64..8cb8e14 100644
> >> --- a/fs/nfs/pagelist.c
> >> +++ b/fs/nfs/pagelist.c
> >> @@ -26,6 +26,8 @@
> >>
> >> static struct kmem_cache *nfs_page_cachep;
> >>
> >> +static void nfs_free_request(struct nfs_page *);
> >> +
> >> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
> >> {
> >> p->npages = pagecount;
> >> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> >> return __nfs_iocounter_wait(c);
> >> }
> >>
> >> +/*
> >> + * nfs_page_group_lock - lock the head of the page group
> >> + * @req - request in group that is to be locked
> >> + *
> >> + * this lock must be held if modifying the page group list
> >> + */
> >> +void
> >> +nfs_page_group_lock(struct nfs_page *req)
> >> +{
> >> + struct nfs_page *head = req->wb_head;
> >> + int err = -EAGAIN;
> >> +
> >> + WARN_ON_ONCE(head != head->wb_head);
> >> +
> >> + while (err)
> >> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
> >> + nfs_wait_bit_killable, TASK_KILLABLE);
> >> +}
> >> +
> >> +/*
> >> + * nfs_page_group_unlock - unlock the head of the page group
> >> + * @req - request in group that is to be unlocked
> >> + */
> >> +void
> >> +nfs_page_group_unlock(struct nfs_page *req)
> >> +{
> >> + struct nfs_page *head = req->wb_head;
> >> +
> >> + WARN_ON_ONCE(head != head->wb_head);
> >> +
> >> + smp_mb__before_clear_bit();
> >> + clear_bit(PG_HEADLOCK, &head->wb_flags);
> >> + smp_mb__after_clear_bit();
> >> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
> >> +}
> >> +
> >> +/*
> >> + * nfs_page_group_sync_on_bit_locked
> >> + *
> >> + * must be called with page group lock held
> >> + */
> >> +static bool
> >> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
> >> +{
> >> + struct nfs_page *head = req->wb_head;
> >> + struct nfs_page *tmp;
> >> +
> >> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
> >> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
> >> +
> >> + tmp = req->wb_this_page;
> >> + while (tmp != req) {
> >> + if (!test_bit(bit, &tmp->wb_flags))
> >> + return false;
> >> + tmp = tmp->wb_this_page;
> >> + }
> >> +
> >> + /* true! reset all bits */
> >> + tmp = req;
> >> + do {
> >> + clear_bit(bit, &tmp->wb_flags);
> >> + tmp = tmp->wb_this_page;
> >> + } while (tmp != req);
> >> +
> >> + return true;
> >> +}
> >> +
> >> +/*
> >> + * nfs_page_group_sync_on_bit - set bit on current request, but only
> >> + * return true if the bit is set for all requests in page group
> >> + * @req - request in page group
> >> + * @bit - PG_* bit that is used to sync page group
> >> + */
> >> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
> >> +{
> >> + bool ret;
> >> +
> >> + nfs_page_group_lock(req);
> >> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
> >> + nfs_page_group_unlock(req);
> >> +
> >> + return ret;
> >> +}
> >> +
> >> +/*
> >> + * nfs_page_group_init - Initialize the page group linkage for @req
> >> + * @req - a new nfs request
> >> + * @prev - the previous request in page group, or NULL if @req is the first
> >> + * or only request in the group (the head).
> >> + */
> >> +static inline void
> >> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
> >> +{
> >> + WARN_ON_ONCE(prev == req);
> >> +
> >> + if (!prev) {
> >> + req->wb_head = req;
> >> + req->wb_this_page = req;
> >> + } else {
> >> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
> >> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
> >> + req->wb_head = prev->wb_head;
> >> + req->wb_this_page = prev->wb_this_page;
> >> + prev->wb_this_page = req;
> >> + }
> >> +}
> >> +
> >> +/*
> >> + * nfs_page_group_destroy - sync the destruction of page groups
> >> + * @req - request that no longer needs the page group
> >> + *
> >> + * releases the page group reference from each member once all
> >> + * members have called this function.
> >> + */
> >> +static void
> >> +nfs_page_group_destroy(struct kref *kref)
> >> +{
> >> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> >> + struct nfs_page *tmp, *next;
> >> +
> >> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
> >> + return;
> >> +
> >> + tmp = req;
> >> + do {
> >> + next = tmp->wb_this_page;
> >> + /* unlink and free */
> >> + tmp->wb_this_page = tmp;
> >> + tmp->wb_head = tmp;
> >> + nfs_free_request(tmp);
> >> + tmp = next;
> >> + } while (tmp != req);
> >> +}
> >> +
> >> /**
> >> * nfs_create_request - Create an NFS read/write request.
> >> * @ctx: open context to use
> >> * @page: page to write
> >> + * @last: last nfs request created for this page group or NULL if head
> >> * @offset: starting offset within the page for the write
> >> * @count: number of bytes to read/write
> >> *
> >> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
> >> */
> >> struct nfs_page *
> >> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> >> - unsigned int offset, unsigned int count)
> >> + struct nfs_page *last, unsigned int offset,
> >> + unsigned int count)
> >> {
> >> struct nfs_page *req;
> >> struct nfs_lock_context *l_ctx;
> >> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
> >> req->wb_bytes = count;
> >> req->wb_context = get_nfs_open_context(ctx);
> >> kref_init(&req->wb_kref);
> >> + nfs_page_group_init(req, last);
> >> return req;
> >> }
> >>
> >> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
> >> }
> >> }
> >>
> >> -
> >> /**
> >> * nfs_release_request - Release the count on an NFS read/write request
> >> * @req: request to release
> >> *
> >> * Note: Should never be called with the spinlock held!
> >> */
> >> -static void nfs_free_request(struct kref *kref)
> >> +static void nfs_free_request(struct nfs_page *req)
> >> {
> >> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
> >> + WARN_ON_ONCE(req->wb_this_page != req);
> >> +
> >> + /* extra debug: make sure no sync bits are still set */
> >> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
> >> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
> >> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
> >> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
> >> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
> >>
> >> /* Release struct file and open context */
> >> nfs_clear_request(req);
> >> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
> >>
> >> void nfs_release_request(struct nfs_page *req)
> >> {
> >> - kref_put(&req->wb_kref, nfs_free_request);
> >> + kref_put(&req->wb_kref, nfs_page_group_destroy);
> >> }
> >>
> >> static int nfs_wait_bit_uninterruptible(void *word)
> >> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
> >> * @desc: destination io descriptor
> >> * @req: request
> >> *
> >> + * This may split a request into subrequests which are all part of the
> >> + * same page group.
> >> + *
> >> * Returns true if the request 'req' was successfully coalesced into the
> >> * existing list of pages 'desc'.
> >> */
> >> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
> >> struct nfs_page *req)
> >> {
> >> - while (!nfs_pageio_do_add_request(desc, req)) {
> >> - desc->pg_moreio = 1;
> >> - nfs_pageio_doio(desc);
> >> - if (desc->pg_error < 0)
> >> - return 0;
> >> - desc->pg_moreio = 0;
> >> - if (desc->pg_recoalesce)
> >> - return 0;
> >> - }
> >> + struct nfs_page *subreq;
> >> + unsigned int bytes_left = 0;
> >> + unsigned int offset, pgbase;
> >> +
> >> + nfs_page_group_lock(req);
> >> +
> >> + subreq = req;
> >> + bytes_left = subreq->wb_bytes;
> >> + offset = subreq->wb_offset;
> >> + pgbase = subreq->wb_pgbase;
> >> +
> >> + do {
> >> + if (!nfs_pageio_do_add_request(desc, subreq)) {
> >> + /* make sure pg_test call(s) did nothing */
> >> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
> >> + WARN_ON_ONCE(subreq->wb_offset != offset);
> >> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
> >> +
> >> + nfs_page_group_unlock(req);
> >> + desc->pg_moreio = 1;
> >> + nfs_pageio_doio(desc);
> >> + if (desc->pg_error < 0)
> >> + return 0;
> >> + desc->pg_moreio = 0;
> >> + if (desc->pg_recoalesce)
> >> + return 0;
> >> + /* retry add_request for this subreq */
> >> + nfs_page_group_lock(req);
> >> + continue;
> >> + }
> >> +
> >> + /* check for buggy pg_test call(s) */
> >> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
> >> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
> >> + WARN_ON_ONCE(subreq->wb_bytes == 0);
> >> +
> >> + bytes_left -= subreq->wb_bytes;
> >> + offset += subreq->wb_bytes;
> >> + pgbase += subreq->wb_bytes;
> >> +
> >> + if (bytes_left) {
> >> + subreq = nfs_create_request(req->wb_context,
> >> + req->wb_page,
> >> + subreq, pgbase, bytes_left);
> >> + nfs_lock_request(subreq);
> >> + subreq->wb_offset = offset;
> >> + subreq->wb_index = req->wb_index;
> >> + }
> >> + } while (bytes_left > 0);
> >> +
> >> + nfs_page_group_unlock(req);
> >> return 1;
> >> }
> >>
> >> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> >> index 95a0855..ee0a3cd 100644
> >> --- a/fs/nfs/read.c
> >> +++ b/fs/nfs/read.c
> >> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
> >> len = nfs_page_length(page);
> >> if (len == 0)
> >> return nfs_return_empty_page(page);
> >> - new = nfs_create_request(ctx, page, 0, len);
> >> + new = nfs_create_request(ctx, page, NULL, 0, len);
> >> if (IS_ERR(new)) {
> >> unlock_page(page);
> >> return PTR_ERR(new);
> >> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
> >> if (len == 0)
> >> return nfs_return_empty_page(page);
> >>
> >> - new = nfs_create_request(desc->ctx, page, 0, len);
> >> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
> >> if (IS_ERR(new))
> >> goto out_error;
> >>
> >> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> >> index ca20ec7..d1453f2 100644
> >> --- a/fs/nfs/write.c
> >> +++ b/fs/nfs/write.c
> >> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
> >> }
> >> nfsi->npages--;
> >> spin_unlock(&inode->i_lock);
> >> - nfs_release_request(req);
> >> + nfs_release_request(head);
> >> }
> >>
> >> static void
> >> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
> >> {
> >> struct nfs_commit_info cinfo;
> >> unsigned long bytes = 0;
> >> + bool do_destroy;
> >>
> >> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
> >> goto out;
> >> @@ -654,6 +655,7 @@ remove_req:
> >> next:
> >> nfs_unlock_request(req);
> >> nfs_end_page_writeback(req->wb_page);
> >> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
> >> nfs_release_request(req);
> >> }
> >> out:
> >> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
> >> if (req == NULL)
> >> goto out_unlock;
> >>
> >> + /* should be handled by nfs_flush_incompatible */
> >> + WARN_ON_ONCE(req->wb_head != req);
> >> + WARN_ON_ONCE(req->wb_this_page != req);
> >> +
> >> rqend = req->wb_offset + req->wb_bytes;
> >> /*
> >> * Tell the caller to flush out the request if
> >> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
> >> req = nfs_try_to_update_request(inode, page, offset, bytes);
> >> if (req != NULL)
> >> goto out;
> >> - req = nfs_create_request(ctx, page, offset, bytes);
> >> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
> >> if (IS_ERR(req))
> >> goto out;
> >> nfs_inode_add_request(inode, req);
> >> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
> >> return 0;
> >> l_ctx = req->wb_lock_context;
> >> do_flush = req->wb_page != page || req->wb_context != ctx;
> >> + /* for now, flush if more than 1 request in page_group */
> >> + do_flush |= req->wb_this_page != req;
> >> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
> >> do_flush |= l_ctx->lockowner.l_owner != current->files
> >> || l_ctx->lockowner.l_pid != current->tgid;
> >> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> >> index 214e098..1fb161b 100644
> >> --- a/include/linux/nfs_page.h
> >> +++ b/include/linux/nfs_page.h
> >> @@ -26,6 +26,8 @@ enum {
> >> PG_MAPPED, /* page private set for buffered io */
> >> PG_CLEAN, /* write succeeded */
> >> PG_COMMIT_TO_DS, /* used by pnfs layouts */
> >> + PG_HEADLOCK, /* page group lock of wb_head */
> >> + PG_TEARDOWN, /* page group sync for destroy */
> >> };
> >>
> >> struct nfs_inode;
> >> @@ -41,6 +43,8 @@ struct nfs_page {
> >> struct kref wb_kref; /* reference count */
> >> unsigned long wb_flags;
> >> struct nfs_write_verifier wb_verf; /* Commit cookie */
> >> + struct nfs_page *wb_this_page; /* list of reqs for this page */
> >> + struct nfs_page *wb_head; /* head pointer for req list */
> >
> > Hmm ok, so to make sure I understand...
> >
> > So page->private will point to the "head" req (struct page_private).
>
> Only in the buffered write case. Page->private is not set for read path / direct i/o path.
>
> > Then we'll have a singly-linked list of reqs hanging off of
> > wb_this_page. Is that right?
> >
> > If so, then it seems like it would be clearer to use a standard
> > list_head here. If you need to get to the wb_head, you could always do
> > something like this:
> >
> > list_first_entry(&req->wb_page->wb_this_page);
>
> Well, wb_page is a struct page and doesn't have wb_this_page (which is in struct
> nfs_page), but I see where you're going with this.
>

Doh, right! Sorry, I threw that together in haste, but you get the
idea. I was thinking you could go back to the page and dereference
->private.

> A strategy like this only works if we always have page->private pointing to the head
> request. We chose not to go that way because it messes with the buffered
> write path's setting / clearing of page private which interacts with the swappable
> nfs pages code that everyone seems to be afraid to touch ;)
>
> So we decided to go this route (not messing with page_private) as a first step - we
> certainly could add it later, but the current approach makes things less complex.
>

Ok, that makes sense. Thanks...

> >
> > ...and could even turn that into a macro or static inline for some
> > syntactic sugar. It's a little more pointer chasing to find the head,
> > but it seems like that would be clearer than using yet another
> > linked-list implementation.
>
> So, I'm not against using list_head. I didn't go that route initially because I was:
>
> 1) following the buffer_head example, which rolls its own list
>

I wouldn't be surprised if the buffer_head code predates the standard
linked-list macros, so that probably explains why they did it that way.
The file locking code has a similar construct in inode->i_flock list.

> 2) trying to grow nfs_page as little as possible - but we might have room within
> the allocator bucket it currently lives in?
>

nfs_page comes out of a dedicated slabcache, so that probably won't be the case.

> 3) not sure list_head is suitable for a circular list (I haven't ever looked into it).
>
> and until we have a way to find the head request (via page private, etc) without
> walking the circular list (chicken / egg problem needing to grab head lock before walking
> list to find the head to lock it), we'll still need the head pointer.
>
> Thoughts?
>
> -dros
>

If you can't rely on page->private pointing to the request, then that
does make it tough to do what I was suggesting. struct list_head lists
are doubly-linked and circular by nature, so that does seem to be a
natural fit for what you're trying to do.

The only problem is that struct list_head is two pointers instead of
one, so it's not going to be as space-efficient as what you're doing
here. If that's a large concern then you may have no choice but to do
this after all.

> >
> >> };
> >>
> >> struct nfs_pageio_descriptor;
> >> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
> >>
> >> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
> >> struct page *page,
> >> + struct nfs_page *last,
> >> unsigned int offset,
> >> unsigned int count);
> >> -extern void nfs_release_request(struct nfs_page *req);
> >> +extern void nfs_release_request(struct nfs_page *);
> >>
> >>
> >> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
> >> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> >> struct nfs_page *req);
> >> extern int nfs_wait_on_request(struct nfs_page *);
> >> extern void nfs_unlock_request(struct nfs_page *req);
> >> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
> >> +extern void nfs_unlock_and_release_request(struct nfs_page *);
> >> +extern void nfs_page_group_lock(struct nfs_page *);
> >> +extern void nfs_page_group_unlock(struct nfs_page *);
> >> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
> >>
> >> /*
> >> * Lock the page of an asynchronous request
> >
> >
> > --
> > Jeff Layton <[email protected]>
>


--
Jeff Layton <[email protected]>

2014-04-22 21:29:16

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 12/17] nfs: use > 1 request to handle bsize < PAGE_SIZE

Use the newly added support for multiple requests per page for
rsize/wsize < PAGE_SIZE, instead of having multiple read / write
data structures per pageio header.

This allows us to get rid of nfs_flush_multi and nfs_pagein_multi.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/pagelist.c | 22 +++++++------------
fs/nfs/read.c | 63 ++++++------------------------------------------------
fs/nfs/write.c | 64 ++++++-------------------------------------------------
3 files changed, 22 insertions(+), 127 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b5c4c13..e819b1b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -434,21 +434,13 @@ nfs_wait_on_request(struct nfs_page *req)
size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev, struct nfs_page *req)
{
- if (!prev)
- return req->wb_bytes;
- /*
- * FIXME: ideally we should be able to coalesce all requests
- * that are not block boundary aligned, but currently this
- * is problematic for the case of bsize < PAGE_CACHE_SIZE,
- * since nfs_flush_multi and nfs_pagein_multi assume you
- * can have only one struct nfs_page.
- */
- if (desc->pg_bsize < PAGE_SIZE)
+ if (desc->pg_count > desc->pg_bsize) {
+ /* should never happen */
+ WARN_ON_ONCE(1);
return 0;
+ }

- if (desc->pg_count + req->wb_bytes <= desc->pg_bsize)
- return req->wb_bytes;
- return 0;
+ return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);

@@ -526,7 +518,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
}
size = pgio->pg_ops->pg_test(pgio, prev, req);
- WARN_ON_ONCE(size && size != req->wb_bytes);
+ WARN_ON_ONCE(size > req->wb_bytes);
+ if (size && size < req->wb_bytes)
+ req->wb_bytes = size;
return size > 0;
}

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c774810..daeff0c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -336,56 +336,12 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
}

-/*
- * Generate multiple requests to fill a single page.
- *
- * We optimize to reduce the number of read operations on the wire. If we
- * detect that we're reading a page, or an area of a page, that is past the
- * end of file, we do not generate NFS read operations but just clear the
- * parts of the page that would have come back zero from the server anyway.
- *
- * We rely on the cached value of i_size to make this determination; another
- * client can fill pages on the server past our cached end-of-file, but we
- * won't see the new data until our attribute cache is updated. This is more
- * or less conventional NFS client behavior.
- */
-static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_read_data *data;
- size_t rsize = desc->pg_bsize, nbytes;
- unsigned int offset;
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes,rsize);
-
- data = nfs_readdata_alloc(hdr, 1);
- if (!data) {
- nfs_pagein_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_read_rpcsetup(data, len, offset);
- list_add(&data->list, &hdr->rpc_list);
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
-
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_read_common_ops;
- return 0;
-}
-
-static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
{
struct nfs_page *req;
struct page **pages;
+ struct page *last_page;
struct nfs_read_data *data;
struct list_head *head = &desc->pg_list;

@@ -397,11 +353,14 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
}

pages = data->pages.pagevec;
+ last_page = NULL;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
+ if (last_page != req->wb_page)
+ *pages++ = req->wb_page;
+ last_page = req->wb_page;
}

nfs_read_rpcsetup(data, desc->pg_count, 0);
@@ -409,14 +368,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
desc->pg_rpc_callops = &nfs_read_common_ops;
return 0;
}
-
-int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_pagein_multi(desc, hdr);
- return nfs_pagein_one(desc, hdr);
-}
EXPORT_SYMBOL_GPL(nfs_generic_pagein);

static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 43892e0..f40db93 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1233,52 +1233,6 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
}

/*
- * Generate multiple small requests to write out a single
- * contiguous dirty area on one page.
- */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_write_data *data;
- size_t wsize = desc->pg_bsize, nbytes;
- unsigned int offset;
- int requests = 0;
- struct nfs_commit_info cinfo;
-
- nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-
- if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
- (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
- desc->pg_count > wsize))
- desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes, wsize);
-
- data = nfs_writedata_alloc(hdr, 1);
- if (!data) {
- nfs_flush_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
- requests++;
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_write_common_ops;
- return 0;
-}
-
-/*
* Create an RPC task for the given write request and kick it.
* The page must have been locked by the caller.
*
@@ -1286,11 +1240,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
* This is the case if nfs_updatepage detects a conflicting request
* that has been written but not committed.
*/
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
{
struct nfs_page *req;
struct page **pages;
+ struct page *last_page;
struct nfs_write_data *data;
struct list_head *head = &desc->pg_list;
struct nfs_commit_info cinfo;
@@ -1304,11 +1259,14 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,

nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = data->pages.pagevec;
+ last_page = NULL;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
+ if (last_page != req->wb_page)
+ *pages++ = req->wb_page;
+ last_page = req->wb_page;
}

if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
@@ -1321,14 +1279,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
desc->pg_rpc_callops = &nfs_write_common_ops;
return 0;
}
-
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_flush_multi(desc, hdr);
- return nfs_flush_one(desc, hdr);
-}
EXPORT_SYMBOL_GPL(nfs_generic_flush);

static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
--
1.8.5.2 (Apple Git-48)


2014-04-22 21:28:59

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 01/17] nfs: clean up PG_* flags

Remove unused flags PG_NEED_COMMIT and PG_NEED_RESCHED.
Add comments describing how each flag is used.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
include/linux/nfs_page.h | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 92ce578..93c7293 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -22,12 +22,10 @@
* Valid flags for a dirty buffer
*/
enum {
- PG_BUSY = 0,
- PG_MAPPED,
- PG_CLEAN,
- PG_NEED_COMMIT,
- PG_NEED_RESCHED,
- PG_COMMIT_TO_DS,
+ PG_BUSY = 0, /* nfs_{un}lock_request */
+ PG_MAPPED, /* page private set for buffered io */
+ PG_CLEAN, /* write succeeded */
+ PG_COMMIT_TO_DS, /* used by pnfs layouts */
};

struct nfs_inode;
--
1.8.5.2 (Apple Git-48)


2014-04-23 14:40:32

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 05/17] nfs: add support for multiple nfs reqs per page





On Apr 22, 2014, at 5:40 PM, Weston Andros Adamson <[email protected]> wrote:

> Oh boy, I posted this with a "cleanup" of page group reference counting,
> but this doesn't work with certain file layout stripe sizes :-/
>
> I'll post the older, clunky version (that works) tomorrow if I can't figure this out quickly.
>
> -dros
>
>
>
> On Apr 22, 2014, at 5:29 PM, Weston Andros Adamson <[email protected]> wrote:
>
>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
>> that all reference the same page. This gives nfs read and write paths
>> the ability to account for sub-page regions independently. This
>> somewhat follows the design of struct buffer_head's sub-page
>> accounting.
>>
>> Only "head" requests are ever added/removed from the inode list in
>> the buffered write path. "head" and "sub" requests are treated the
>> same through the read path and the rest of the write/commit path.
>> Requests are given an extra reference across the life of the list.
>>
>> Page groups are never rejoined after being split. If the read/write
>> request fails and the client falls back to another path (ie revert
>> to MDS in PNFS case), the already split requests are pushed through
>> the recoalescing code again, which may split them further and then
>> coalesce them into properly sized requests on the wire. Fragmentation
>> shouldn't be a problem with the current design, because we flush all
>> requests in page group when a non-contiguous request is added, so
>> the only time resplitting should occur is on a resend of a read or
>> write.
>>
>> This patch lays the groundwork for sub-page splitting, but does not
>> actually do any splitting. For now all page groups have one request
>> as pg_test functions don't yet split pages. There are several related
>> patches that are needed to support multiple requests per page group.
>>
>> Signed-off-by: Weston Andros Adamson <[email protected]>
>> ---
>> fs/nfs/direct.c | 7 +-
>> fs/nfs/pagelist.c | 218 ++++++++++++++++++++++++++++++++++++++++++++---
>> fs/nfs/read.c | 4 +-
>> fs/nfs/write.c | 12 ++-
>> include/linux/nfs_page.h | 12 ++-
>> 5 files changed, 231 insertions(+), 22 deletions(-)
>>
>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>> index a0c30c5..9d968ca 100644
>> --- a/fs/nfs/direct.c
>> +++ b/fs/nfs/direct.c
>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
>> struct nfs_page *req;
>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>> /* XXX do we need to do the eof zeroing found in async_filler? */
>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>> pgbase, req_len);
>> if (IS_ERR(req)) {
>> result = PTR_ERR(req);
>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
>> struct nfs_page *req;
>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>
>> - req = nfs_create_request(dreq->ctx, pagevec[i],
>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>> pgbase, req_len);
>> if (IS_ERR(req)) {
>> result = PTR_ERR(req);
>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>> spin_unlock(&dreq->lock);
>>
>> while (!list_empty(&hdr->pages)) {
>> + bool do_destroy = true;
>> +
>> req = nfs_list_entry(hdr->pages.next);
>> nfs_list_remove_request(req);
>> switch (bit) {
>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>> case NFS_IOHDR_NEED_COMMIT:
>> kref_get(&req->wb_kref);
>> nfs_mark_request_commit(req, hdr->lseg, &cinfo);
>> + do_destroy = false;
>> }
>> nfs_unlock_and_release_request(req);
>> }
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index ac4fb64..8cb8e14 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -26,6 +26,8 @@
>>
>> static struct kmem_cache *nfs_page_cachep;
>>
>> +static void nfs_free_request(struct nfs_page *);
>> +
>> bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
>> {
>> p->npages = pagecount;
>> @@ -133,10 +135,145 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>> return __nfs_iocounter_wait(c);
>> }
>>
>> +/*
>> + * nfs_page_group_lock - lock the head of the page group
>> + * @req - request in group that is to be locked
>> + *
>> + * this lock must be held if modifying the page group list
>> + */
>> +void
>> +nfs_page_group_lock(struct nfs_page *req)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> + int err = -EAGAIN;
>> +
>> + WARN_ON_ONCE(head != head->wb_head);
>> +
>> + while (err)
>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
>> + nfs_wait_bit_killable, TASK_KILLABLE);
>> +}
>> +
>> +/*
>> + * nfs_page_group_unlock - unlock the head of the page group
>> + * @req - request in group that is to be unlocked
>> + */
>> +void
>> +nfs_page_group_unlock(struct nfs_page *req)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> +
>> + WARN_ON_ONCE(head != head->wb_head);
>> +
>> + smp_mb__before_clear_bit();
>> + clear_bit(PG_HEADLOCK, &head->wb_flags);
>> + smp_mb__after_clear_bit();
>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK);
>> +}
>> +
>> +/*
>> + * nfs_page_group_sync_on_bit_locked
>> + *
>> + * must be called with page group lock held
>> + */
>> +static bool
>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
>> +{
>> + struct nfs_page *head = req->wb_head;
>> + struct nfs_page *tmp;
>> +
>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
>> +
>> + tmp = req->wb_this_page;
>> + while (tmp != req) {
>> + if (!test_bit(bit, &tmp->wb_flags))
>> + return false;
>> + tmp = tmp->wb_this_page;
>> + }
>> +
>> + /* true! reset all bits */
>> + tmp = req;
>> + do {
>> + clear_bit(bit, &tmp->wb_flags);
>> + tmp = tmp->wb_this_page;
>> + } while (tmp != req);
>> +
>> + return true;
>> +}
>> +
>> +/*
>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
>> + * return true if the bit is set for all requests in page group
>> + * @req - request in page group
>> + * @bit - PG_* bit that is used to sync page group
>> + */
>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
>> +{
>> + bool ret;
>> +
>> + nfs_page_group_lock(req);
>> + ret = nfs_page_group_sync_on_bit_locked(req, bit);
>> + nfs_page_group_unlock(req);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * nfs_page_group_init - Initialize the page group linkage for @req
>> + * @req - a new nfs request
>> + * @prev - the previous request in page group, or NULL if @req is the first
>> + * or only request in the group (the head).
>> + */
>> +static inline void
>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
>> +{
>> + WARN_ON_ONCE(prev == req);
>> +
>> + if (!prev) {
>> + req->wb_head = req;
>> + req->wb_this_page = req;
>> + } else {
>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
>> + req->wb_head = prev->wb_head;
>> + req->wb_this_page = prev->wb_this_page;
>> + prev->wb_this_page = req;
>> + }
>> +}
>> +
>> +/*
>> + * nfs_page_group_destroy - sync the destruction of page groups
>> + * @req - request that no longer needs the page group
>> + *
>> + * releases the page group reference from each member once all
>> + * members have called this function.
>> + */
>> +static void
>> +nfs_page_group_destroy(struct kref *kref)
>> +{
>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>> + struct nfs_page *tmp, *next;
>> +
>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
>> + return;
>> +
>> + tmp = req;
>> + do {
>> + next = tmp->wb_this_page;
>> + /* unlink and free */
>> + tmp->wb_this_page = tmp;
>> + tmp->wb_head = tmp;
>> + nfs_free_request(tmp);
>> + tmp = next;
>> + } while (tmp != req);
>> +}
>> +
>> /**
>> * nfs_create_request - Create an NFS read/write request.
>> * @ctx: open context to use
>> * @page: page to write
>> + * @last: last nfs request created for this page group or NULL if head
>> * @offset: starting offset within the page for the write
>> * @count: number of bytes to read/write
>> *
>> @@ -146,7 +283,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>> */
>> struct nfs_page *
>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>> - unsigned int offset, unsigned int count)
>> + struct nfs_page *last, unsigned int offset,
>> + unsigned int count)
>> {
>> struct nfs_page *req;
>> struct nfs_lock_context *l_ctx;
>> @@ -178,6 +316,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>> req->wb_bytes = count;
>> req->wb_context = get_nfs_open_context(ctx);
>> kref_init(&req->wb_kref);
>> + nfs_page_group_init(req, last);
>> return req;
>> }
>>
>> @@ -235,16 +374,22 @@ static void nfs_clear_request(struct nfs_page *req)
>> }
>> }
>>
>> -
>> /**
>> * nfs_release_request - Release the count on an NFS read/write request
>> * @req: request to release
>> *
>> * Note: Should never be called with the spinlock held!
>> */
>> -static void nfs_free_request(struct kref *kref)
>> +static void nfs_free_request(struct nfs_page *req)
>> {
>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>> + WARN_ON_ONCE(req->wb_this_page != req);
>> +
>> + /* extra debug: make sure no sync bits are still set */
>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
>> + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
>>
>> /* Release struct file and open context */
>> nfs_clear_request(req);
>> @@ -253,7 +398,7 @@ static void nfs_free_request(struct kref *kref)
>>
>> void nfs_release_request(struct nfs_page *req)
>> {
>> - kref_put(&req->wb_kref, nfs_free_request);
>> + kref_put(&req->wb_kref, nfs_page_group_destroy);
>> }
>>
>> static int nfs_wait_bit_uninterruptible(void *word)
>> @@ -439,21 +584,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>> * @desc: destination io descriptor
>> * @req: request
>> *
>> + * This may split a request into subrequests which are all part of the
>> + * same page group.
>> + *
>> * Returns true if the request 'req' was successfully coalesced into the
>> * existing list of pages 'desc'.
>> */
>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
>> struct nfs_page *req)
>> {
>> - while (!nfs_pageio_do_add_request(desc, req)) {
>> - desc->pg_moreio = 1;
>> - nfs_pageio_doio(desc);
>> - if (desc->pg_error < 0)
>> - return 0;
>> - desc->pg_moreio = 0;
>> - if (desc->pg_recoalesce)
>> - return 0;
>> - }
>> + struct nfs_page *subreq;
>> + unsigned int bytes_left = 0;
>> + unsigned int offset, pgbase;
>> +
>> + nfs_page_group_lock(req);
>> +
>> + subreq = req;
>> + bytes_left = subreq->wb_bytes;
>> + offset = subreq->wb_offset;
>> + pgbase = subreq->wb_pgbase;
>> +
>> + do {
>> + if (!nfs_pageio_do_add_request(desc, subreq)) {
>> + /* make sure pg_test call(s) did nothing */
>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
>> + WARN_ON_ONCE(subreq->wb_offset != offset);
>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
>> +
>> + nfs_page_group_unlock(req);
>> + desc->pg_moreio = 1;
>> + nfs_pageio_doio(desc);
>> + if (desc->pg_error < 0)
>> + return 0;
>> + desc->pg_moreio = 0;
>> + if (desc->pg_recoalesce)
>> + return 0;
>> + /* retry add_request for this subreq */
>> + nfs_page_group_lock(req);
>> + continue;
>> + }
>> +
>> + /* check for buggy pg_test call(s) */
>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
>> + WARN_ON_ONCE(subreq->wb_bytes == 0);
>> +
>> + bytes_left -= subreq->wb_bytes;
>> + offset += subreq->wb_bytes;
>> + pgbase += subreq->wb_bytes;
>> +
>> + if (bytes_left) {
>> + subreq = nfs_create_request(req->wb_context,
>> + req->wb_page,
>> + subreq, pgbase, bytes_left);
>> + nfs_lock_request(subreq);
>> + subreq->wb_offset = offset;
>> + subreq->wb_index = req->wb_index;
>> + }
>> + } while (bytes_left > 0);
>> +
>> + nfs_page_group_unlock(req);
>> return 1;
>> }
>>
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index 95a0855..ee0a3cd 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -139,7 +139,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>> len = nfs_page_length(page);
>> if (len == 0)
>> return nfs_return_empty_page(page);
>> - new = nfs_create_request(ctx, page, 0, len);
>> + new = nfs_create_request(ctx, page, NULL, 0, len);
>> if (IS_ERR(new)) {
>> unlock_page(page);
>> return PTR_ERR(new);
>> @@ -600,7 +600,7 @@ readpage_async_filler(void *data, struct page *page)
>> if (len == 0)
>> return nfs_return_empty_page(page);
>>
>> - new = nfs_create_request(desc->ctx, page, 0, len);
>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len);
>> if (IS_ERR(new))
>> goto out_error;
>>
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index ca20ec7..d1453f2 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -461,7 +461,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
>> }
>> nfsi->npages--;
>> spin_unlock(&inode->i_lock);
>> - nfs_release_request(req);
>> + nfs_release_request(head);
>> }
>>
>> static void
>> @@ -625,6 +625,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
>> {
>> struct nfs_commit_info cinfo;
>> unsigned long bytes = 0;
>> + bool do_destroy;
>>
>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
>> goto out;
>> @@ -654,6 +655,7 @@ remove_req:
>> next:
>> nfs_unlock_request(req);
>> nfs_end_page_writeback(req->wb_page);
>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
>> nfs_release_request(req);
>> }
>> out:
>> @@ -758,6 +760,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
>> if (req == NULL)
>> goto out_unlock;
>>
>> + /* should be handled by nfs_flush_incompatible */
>> + WARN_ON_ONCE(req->wb_head != req);
>> + WARN_ON_ONCE(req->wb_this_page != req);
>> +
>> rqend = req->wb_offset + req->wb_bytes;
>> /*
>> * Tell the caller to flush out the request if
>> @@ -819,7 +825,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
>> req = nfs_try_to_update_request(inode, page, offset, bytes);
>> if (req != NULL)
>> goto out;
>> - req = nfs_create_request(ctx, page, offset, bytes);
>> + req = nfs_create_request(ctx, page, NULL, offset, bytes);
>> if (IS_ERR(req))
>> goto out;
>> nfs_inode_add_request(inode, req);
>> @@ -863,6 +869,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>> return 0;
>> l_ctx = req->wb_lock_context;
>> do_flush = req->wb_page != page || req->wb_context != ctx;
>> + /* for now, flush if more than 1 request in page_group */
>> + do_flush |= req->wb_this_page != req;
>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
>> do_flush |= l_ctx->lockowner.l_owner != current->files
>> || l_ctx->lockowner.l_pid != current->tgid;
>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>> index 214e098..1fb161b 100644
>> --- a/include/linux/nfs_page.h
>> +++ b/include/linux/nfs_page.h
>> @@ -26,6 +26,8 @@ enum {
>> PG_MAPPED, /* page private set for buffered io */
>> PG_CLEAN, /* write succeeded */
>> PG_COMMIT_TO_DS, /* used by pnfs layouts */
>> + PG_HEADLOCK, /* page group lock of wb_head */
>> + PG_TEARDOWN, /* page group sync for destroy */
>> };
>>
>> struct nfs_inode;
>> @@ -41,6 +43,8 @@ struct nfs_page {
>> struct kref wb_kref; /* reference count */
>> unsigned long wb_flags;
>> struct nfs_write_verifier wb_verf; /* Commit cookie */
>> + struct nfs_page *wb_this_page; /* list of reqs for this page */
>> + struct nfs_page *wb_head; /* head pointer for req list */
>> };
>>
>> struct nfs_pageio_descriptor;
>> @@ -75,9 +79,10 @@ struct nfs_pageio_descriptor {
>>
>> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
>> struct page *page,
>> + struct nfs_page *last,
>> unsigned int offset,
>> unsigned int count);
>> -extern void nfs_release_request(struct nfs_page *req);
>> +extern void nfs_release_request(struct nfs_page *);
>>
>>
>> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>> @@ -95,7 +100,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
>> struct nfs_page *req);
>> extern int nfs_wait_on_request(struct nfs_page *);
>> extern void nfs_unlock_request(struct nfs_page *req);
>> -extern void nfs_unlock_and_release_request(struct nfs_page *req);
>> +extern void nfs_unlock_and_release_request(struct nfs_page *);
>> +extern void nfs_page_group_lock(struct nfs_page *);
>> +extern void nfs_page_group_unlock(struct nfs_page *);
>> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>>
>> /*
>> * Lock the page of an asynchronous request
>> --
>> 1.8.5.2 (Apple Git-48)
>>
>


Attachments:
0001-fixup-handle-sub-request-handoff-between-write-and-c.patch (3.67 kB)

2014-04-23 12:30:17

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 03/17] nfs: modify pg_test interface to return size_t

On 04/23/2014 12:29 AM, Weston Andros Adamson wrote:
> This is a step toward allowing pg_test to inform the
> coalescing code to reduce the size of requests so they may fit in
> whatever scheme the pg_test callback wants to define.
>
> For now, just return the size of the request if there is space, or 0
> if there is not. This shouldn't change any behavior as it acts
> the same as when the pg_test functions returned bool.
>
> Signed-off-by: Weston Andros Adamson <[email protected]>
> ---
> fs/nfs/blocklayout/blocklayout.c | 16 ++++++++++++----
> fs/nfs/nfs4filelayout.c | 12 +++++++-----
> fs/nfs/objlayout/objio_osd.c | 14 ++++++++++----
> fs/nfs/pagelist.c | 22 +++++++++++++++++++---
> fs/nfs/pnfs.c | 12 +++++++++---
> fs/nfs/pnfs.h | 3 ++-
> include/linux/nfs_page.h | 5 +++--
> 7 files changed, 62 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 65d849b..3867976 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
> pnfs_generic_pg_init_read(pgio, req);
> }
>
> -static bool
> +/*
> + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
> + * of bytes (maximum @req->wb_bytes) that can be coalesced.
> + */
> +static size_t
> bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> if (pgio->pg_dreq != NULL &&
> !is_aligned_req(req, SECTOR_SIZE))
> - return false;
> + return 0;
>
> return pnfs_generic_pg_test(pgio, prev, req);
> }
> @@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
> }
> }
>
> -static bool
> +/*
> + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
> + * of bytes (maximum @req->wb_bytes) that can be coalesced.
> + */
> +static size_t
> bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> if (pgio->pg_dreq != NULL &&
> !is_aligned_req(req, PAGE_CACHE_SIZE))
> - return false;
> + return 0;
>
> return pnfs_generic_pg_test(pgio, prev, req);
> }
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index b6fc8c1..dfc7282 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -915,10 +915,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
> /*
> * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
> *
> - * return true : coalesce page
> - * return false : don't coalesce page
> + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
> + * of bytes (maximum @req->wb_bytes) that can be coalesced.
> */
> -static bool
> +static size_t
> filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> @@ -927,7 +927,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
>
> if (!pnfs_generic_pg_test(pgio, prev, req) ||
> !nfs_generic_pg_test(pgio, prev, req))
> - return false;
> + return 0;
>
> p_stripe = (u64)req_offset(prev);
> r_stripe = (u64)req_offset(req);
> @@ -936,7 +936,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> do_div(p_stripe, stripe_unit);
> do_div(r_stripe, stripe_unit);
>
> - return (p_stripe == r_stripe);
> + if (p_stripe == r_stripe)
> + return req->wb_bytes;
> + return 0;
> }
>
> static void
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index 5457745..c20352a 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -564,14 +564,20 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
> return 0;
> }
>
> -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
> +/*
> + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
> + * of bytes (maximum @req->wb_bytes) that can be coalesced.
> + */
> +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
> struct nfs_page *prev, struct nfs_page *req)
> {
> if (!pnfs_generic_pg_test(pgio, prev, req))
> - return false;
> + return 0;
>
> - return pgio->pg_count + req->wb_bytes <=
> - (unsigned long)pgio->pg_layout_private;
> + if (pgio->pg_count + req->wb_bytes <=
> + (unsigned long)pgio->pg_layout_private)
> + return req->wb_bytes;

nit

I hate this form please do an "else". This form I use when there
is no symmetry between the options, its like saying "I cut processing
short in this condition"

Thanks
Boaz

> + return 0;
> }
>
> static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index ecd34b7..3c35b9e 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -277,7 +277,17 @@ nfs_wait_on_request(struct nfs_page *req)
> TASK_UNINTERRUPTIBLE);
> }
>
> -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
> +/*
> + * nfs_generic_pg_test - determine if requests can be coalesced
> + * @desc: pointer to descriptor
> + * @prev: previous request in desc, or NULL
> + * @req: this request
> + *
> + * Returns the size of the request if @req can be coalesced into @desc,
> + * otherwise it returns zero.
> + */
> +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> + struct nfs_page *prev, struct nfs_page *req)
> {
> /*
> * FIXME: ideally we should be able to coalesce all requests
> @@ -289,7 +299,9 @@ bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *pr
> if (desc->pg_bsize < PAGE_SIZE)
> return 0;
>
> - return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
> + if (desc->pg_count + req->wb_bytes <= desc->pg_bsize)
> + return req->wb_bytes;
> + return 0;
> }
> EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
>
> @@ -354,6 +366,8 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
> struct nfs_page *req,
> struct nfs_pageio_descriptor *pgio)
> {
> + size_t size;
> +
> if (!nfs_match_open_context(req->wb_context, prev->wb_context))
> return false;
> if (req->wb_context->dentry->d_inode->i_flock != NULL &&
> @@ -365,7 +379,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
> return false;
> if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
> return false;
> - return pgio->pg_ops->pg_test(pgio, prev, req);
> + size = pgio->pg_ops->pg_test(pgio, prev, req);
> + WARN_ON_ONCE(size && size != req->wb_bytes);
> + return size > 0;
> }
>
> /**
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index cb53d45..6201bf6 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1461,7 +1461,11 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
> nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
> }
>
> -bool
> +/*
> + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
> + * of bytes (maximum @req->wb_bytes) that can be coalesced.
> + */
> +size_t
> pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> @@ -1482,8 +1486,10 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> * first byte that lies outside the pnfs_layout_range. FIXME?
> *
> */
> - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
> - pgio->pg_lseg->pls_range.length);
> + if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
> + pgio->pg_lseg->pls_range.length))
> + return req->wb_bytes;
> + return 0;
> }
> EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
>
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 0237939..0386d7c 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -192,7 +192,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
> void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
> struct nfs_page *req, u64 wb_size);
> int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
> -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
> +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
> + struct nfs_page *prev, struct nfs_page *req);
> void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
> struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
> void pnfs_free_lseg_list(struct list_head *tmp_list);
> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> index 905809d..214e098 100644
> --- a/include/linux/nfs_page.h
> +++ b/include/linux/nfs_page.h
> @@ -46,7 +46,8 @@ struct nfs_page {
> struct nfs_pageio_descriptor;
> struct nfs_pageio_ops {
> void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *);
> - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
> + size_t (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *,
> + struct nfs_page *);
> int (*pg_doio)(struct nfs_pageio_descriptor *);
> };
>
> @@ -89,7 +90,7 @@ extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
> struct nfs_page *);
> extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc);
> extern void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *, pgoff_t);
> -extern bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> +extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
> struct nfs_page *prev,
> struct nfs_page *req);
> extern int nfs_wait_on_request(struct nfs_page *);
>


2014-04-22 21:29:13

by Weston Andros Adamson

[permalink] [raw]
Subject: [PATCH 10/17] nfs: allow coalescing of subpage requests

Remove check that the request covers a whole page.

Signed-off-by: Weston Andros Adamson <[email protected]>
---
fs/nfs/pagelist.c | 4 ----
1 file changed, 4 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 8cb8e14..b5c4c13 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -522,10 +522,6 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
return false;
- if (req->wb_pgbase != 0)
- return false;
- if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return false;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
return false;
}
--
1.8.5.2 (Apple Git-48)


2014-04-23 14:31:58

by Weston Andros Adamson

[permalink] [raw]
Subject: Re: [PATCH 13/17] nfs: remove list of [rw]data from pgio header

On Apr 23, 2014, at 10:16 AM, Anna Schumaker <[email protected]> wrote:

> On 04/22/2014 05:29 PM, Weston Andros Adamson wrote:
>> Since the ability to split pages into subpage requests has been added,
>> nfs_pgio_header->rpc_list only ever has one wdata/rdata.
>>
>> Signed-off-by: Weston Andros Adamson <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 41 +++++++++++++++--------------------------
>> fs/nfs/read.c | 35 +++++------------------------------
>> fs/nfs/write.c | 38 +++++++-------------------------------
>> include/linux/nfs_xdr.h | 35 ++++++++++++++++++-----------------
>> 4 files changed, 45 insertions(+), 104 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 7c89385..3b3ec46 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1600,23 +1600,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>> }
>>
>> static void
>> -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
>> +pnfs_do_write(struct nfs_pageio_descriptor *desc,
>> + struct nfs_pgio_header *hdr, int how)
>> {
>> - struct nfs_write_data *data;
>> + struct nfs_write_data *data = hdr->data.write;
>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>> + enum pnfs_try_status trypnfs;
>>
>> desc->pg_lseg = NULL;
>> - while (!list_empty(head)) {
>> - enum pnfs_try_status trypnfs;
>> -
>> - data = list_first_entry(head, struct nfs_write_data, list);
>> - list_del_init(&data->list);
>> -
>> - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>> - pnfs_write_through_mds(desc, data);
>> - }
>> + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>> + pnfs_write_through_mds(desc, data);
>> pnfs_put_lseg(lseg);
>> }
>>
>> @@ -1650,7 +1645,7 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>> pnfs_put_lseg(desc->pg_lseg);
>> desc->pg_lseg = NULL;
>> } else
>> - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
>> + pnfs_do_write(desc, hdr, desc->pg_ioflags);
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> return ret;
>> @@ -1758,23 +1753,17 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
>> }
>>
>> static void
>> -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
>> +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
>> {
>> - struct nfs_read_data *data;
>> + struct nfs_read_data *data = hdr->data.read;
>> const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
>> struct pnfs_layout_segment *lseg = desc->pg_lseg;
>> + enum pnfs_try_status trypnfs;
>>
>> desc->pg_lseg = NULL;
>> - while (!list_empty(head)) {
>> - enum pnfs_try_status trypnfs;
>> -
>> - data = list_first_entry(head, struct nfs_read_data, list);
>> - list_del_init(&data->list);
>> -
>> - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>> - if (trypnfs == PNFS_NOT_ATTEMPTED)
>> - pnfs_read_through_mds(desc, data);
>> - }
>> + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
>> + if (trypnfs == PNFS_NOT_ATTEMPTED)
>> + pnfs_read_through_mds(desc, data);
>> pnfs_put_lseg(lseg);
>> }
>>
>> @@ -1809,7 +1798,7 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>> pnfs_put_lseg(desc->pg_lseg);
>> desc->pg_lseg = NULL;
>> } else
>> - pnfs_do_multiple_reads(desc, &hdr->rpc_list);
>> + pnfs_do_read(desc, hdr);
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> return ret;
>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>> index daeff0c..c6b7dd0 100644
>> --- a/fs/nfs/read.c
>> +++ b/fs/nfs/read.c
>> @@ -42,7 +42,6 @@ struct nfs_read_header *nfs_readhdr_alloc(void)
>> struct nfs_pgio_header *hdr = &rhdr->header;
>>
>> INIT_LIST_HEAD(&hdr->pages);
>> - INIT_LIST_HEAD(&hdr->rpc_list);
>> spin_lock_init(&hdr->lock);
>> atomic_set(&hdr->refcnt, 0);
>> }
>> @@ -286,26 +285,6 @@ static int nfs_do_read(struct nfs_read_data *data,
>> return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
>> }
>>
>> -static int
>> -nfs_do_multiple_reads(struct list_head *head,
>> - const struct rpc_call_ops *call_ops)
>> -{
>> - struct nfs_read_data *data;
>> - int ret = 0;
>> -
>> - while (!list_empty(head)) {
>> - int ret2;
>> -
>> - data = list_first_entry(head, struct nfs_read_data, list);
>> - list_del_init(&data->list);
>> -
>> - ret2 = nfs_do_read(data, call_ops);
>> - if (ret == 0)
>> - ret = ret2;
>> - }
>> - return ret;
>> -}
>> -
>> static void
>> nfs_async_read_error(struct list_head *head)
>> {
>> @@ -327,12 +306,8 @@ static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
>> struct nfs_pgio_header *hdr)
>> {
>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>> - while (!list_empty(&hdr->rpc_list)) {
>> - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
>> - struct nfs_read_data, list);
>> - list_del(&data->list);
>> - nfs_readdata_release(data);
>> - }
>> + nfs_readdata_release(hdr->data.read);
>> + hdr->data.read = NULL;
>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>> }
>>
>> @@ -364,7 +339,8 @@ int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
>> }
>>
>> nfs_read_rpcsetup(data, desc->pg_count, 0);
>> - list_add(&data->list, &hdr->rpc_list);
>> + WARN_ON_ONCE(hdr->data.read);
>> + hdr->data.read = data;
>> desc->pg_rpc_callops = &nfs_read_common_ops;
>> return 0;
>> }
>> @@ -386,8 +362,7 @@ static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
>> atomic_inc(&hdr->refcnt);
>> ret = nfs_generic_pagein(desc, hdr);
>> if (ret == 0)
>> - ret = nfs_do_multiple_reads(&hdr->rpc_list,
>> - desc->pg_rpc_callops);
>> + ret = nfs_do_read(hdr->data.read, desc->pg_rpc_callops);
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> return ret;
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index f40db93..cd24a14 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -79,7 +79,6 @@ struct nfs_write_header *nfs_writehdr_alloc(void)
>>
>> memset(p, 0, sizeof(*p));
>> INIT_LIST_HEAD(&hdr->pages);
>> - INIT_LIST_HEAD(&hdr->rpc_list);
>> spin_lock_init(&hdr->lock);
>> atomic_set(&hdr->refcnt, 0);
>> hdr->verf = &p->verf;
>> @@ -1171,26 +1170,6 @@ static int nfs_do_write(struct nfs_write_data *data,
>> return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
>> }
>>
>> -static int nfs_do_multiple_writes(struct list_head *head,
>> - const struct rpc_call_ops *call_ops,
>> - int how)
>> -{
>> - struct nfs_write_data *data;
>> - int ret = 0;
>> -
>> - while (!list_empty(head)) {
>> - int ret2;
>> -
>> - data = list_first_entry(head, struct nfs_write_data, list);
>> - list_del_init(&data->list);
>> -
>> - ret2 = nfs_do_write(data, call_ops, how);
>> - if (ret == 0)
>> - ret = ret2;
>> - }
>> - return ret;
>> -}
>> -
>> /* If a nfs_flush_* function fails, it should remove reqs from @head and
>> * call this on each, which will prepare them to be retried on next
>> * writeback using standard nfs.
>> @@ -1223,12 +1202,8 @@ static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
>> struct nfs_pgio_header *hdr)
>> {
>> set_bit(NFS_IOHDR_REDO, &hdr->flags);
>> - while (!list_empty(&hdr->rpc_list)) {
>> - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
>> - struct nfs_write_data, list);
>> - list_del(&data->list);
>> - nfs_writedata_release(data);
>> - }
>> + nfs_writedata_release(hdr->data.write);
>> + hdr->data.write = NULL;
>> desc->pg_completion_ops->error_cleanup(&desc->pg_list);
>> }
>>
>> @@ -1275,7 +1250,8 @@ int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>
>> /* Set up the argument struct */
>> nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
>> - list_add(&data->list, &hdr->rpc_list);
>> + WARN_ON_ONCE(hdr->data.write);
>> + hdr->data.write = data;
>> desc->pg_rpc_callops = &nfs_write_common_ops;
>> return 0;
>> }
>> @@ -1297,9 +1273,9 @@ static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
>> atomic_inc(&hdr->refcnt);
>> ret = nfs_generic_flush(desc, hdr);
>> if (ret == 0)
>> - ret = nfs_do_multiple_writes(&hdr->rpc_list,
>> - desc->pg_rpc_callops,
>> - desc->pg_ioflags);
>> + ret = nfs_do_write(hdr->data.write,
>> + desc->pg_rpc_callops,
>> + desc->pg_ioflags);
>> if (atomic_dec_and_test(&hdr->refcnt))
>> hdr->completion_ops->completion(hdr);
>> return ret;
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index 6fb5b23..239274d 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -1266,7 +1266,6 @@ struct nfs_page_array {
>>
>> struct nfs_read_data {
>> struct nfs_pgio_header *header;
>> - struct list_head list;
>> struct rpc_task task;
>> struct nfs_fattr fattr; /* fattr storage */
>> struct nfs_readargs args;
>> @@ -1278,6 +1277,20 @@ struct nfs_read_data {
>> struct nfs_client *ds_clp; /* pNFS data server */
>> };
>>
>> +struct nfs_write_data {
>> + struct nfs_pgio_header *header;
>> + struct rpc_task task;
>> + struct nfs_fattr fattr;
>> + struct nfs_writeverf verf;
>> + struct nfs_writeargs args; /* argument struct */
>> + struct nfs_writeres res; /* result struct */
>> + unsigned long timestamp; /* For lease renewal */
>> + int (*write_done_cb)(struct rpc_task *, struct nfs_write_data *);
>> + __u64 mds_offset; /* Filelayout dense stripe */
>> + struct nfs_page_array pages;
>> + struct nfs_client *ds_clp; /* pNFS data server */
>> +};
>> +
>> /* used as flag bits in nfs_pgio_header */
>> enum {
>> NFS_IOHDR_ERROR = 0,
>> @@ -1291,7 +1304,10 @@ struct nfs_pgio_header {
>> struct inode *inode;
>> struct rpc_cred *cred;
>> struct list_head pages;
>> - struct list_head rpc_list;
>> + union {
>> + struct nfs_read_data *read;
>> + struct nfs_write_data *write;
>> + } data;
>
> The first 5 patches in my series makes it so we can share all of these structs. Would it be useful to put those in first?
>
> Anna
>

Yes, I think it makes sense to stage most (if not all) of your patches first then merge my patches in.

I think I'll just give it a shot and see how bad it is. I need to post a rebased version of my patchset anyway,
so I'll see if I can also rebase on top of your changes.

Any objections?

-dros

>> atomic_t refcnt;
>> struct nfs_page *req;
>> struct nfs_writeverf *verf;
>> @@ -1315,21 +1331,6 @@ struct nfs_read_header {
>> struct nfs_read_data rpc_data;
>> };
>>
>> -struct nfs_write_data {
>> - struct nfs_pgio_header *header;
>> - struct list_head list;
>> - struct rpc_task task;
>> - struct nfs_fattr fattr;
>> - struct nfs_writeverf verf;
>> - struct nfs_writeargs args; /* argument struct */
>> - struct nfs_writeres res; /* result struct */
>> - unsigned long timestamp; /* For lease renewal */
>> - int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
>> - __u64 mds_offset; /* Filelayout dense stripe */
>> - struct nfs_page_array pages;
>> - struct nfs_client *ds_clp; /* pNFS data server */
>> -};
>> -
>> struct nfs_write_header {
>> struct nfs_pgio_header header;
>> struct nfs_write_data rpc_data;
>


2014-04-23 12:20:25

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 11/17] nfs: chain calls to pg_test

On 04/23/2014 12:29 AM, Weston Andros Adamson wrote:
> Now that pg_test can change the size of the request (by returning a non-zero
> size smaller than the request), pg_test functions that call other
> pg_test functions must return the minimum of the result - or 0 if any fail.
>
> Also clean up the logic of some pg_test functions so that all checks are
> for conditions where coalescing is not possible.
>
> Signed-off-by: Weston Andros Adamson <[email protected]>
> ---
> fs/nfs/nfs4filelayout.c | 27 ++++++++++++++-------------
> fs/nfs/objlayout/objio_osd.c | 12 ++++++++----
> fs/nfs/pnfs.c | 15 ++++++++++-----
> 3 files changed, 32 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index 3b32c95..cfd76bd 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -930,26 +930,27 @@ static size_t
> filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> + unsigned int size;
> u64 p_stripe, r_stripe;
> u32 stripe_unit;
>
> - if (!pnfs_generic_pg_test(pgio, prev, req) ||
> - !nfs_generic_pg_test(pgio, prev, req))
> + /* calls nfs_generic_pg_test */
> + size = pnfs_generic_pg_test(pgio, prev, req);
> + if (!size)
> return 0;
>
> - if (!prev)
> - return req->wb_bytes;
> + if (prev) {
> + p_stripe = (u64)req_offset(prev);
> + r_stripe = (u64)req_offset(req);
> + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
>
> - p_stripe = (u64)req_offset(prev);
> - r_stripe = (u64)req_offset(req);
> - stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
> + do_div(p_stripe, stripe_unit);
> + do_div(r_stripe, stripe_unit);
>
> - do_div(p_stripe, stripe_unit);
> - do_div(r_stripe, stripe_unit);
> -
> - if (p_stripe == r_stripe)
> - return req->wb_bytes;
> - return 0;
> + if (p_stripe != r_stripe)
> + return 0;
> + }
> + return min(size, req->wb_bytes);
> }
>
> static void
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index c20352a..31de29e 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -571,13 +571,17 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
> static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
> struct nfs_page *prev, struct nfs_page *req)
> {
> - if (!pnfs_generic_pg_test(pgio, prev, req))
> + unsigned int size;
> +
> + size = pnfs_generic_pg_test(pgio, prev, req);
> +
> + if (!size)
> return 0;
>
> - if (pgio->pg_count + req->wb_bytes <=
> + if (pgio->pg_count + req->wb_bytes >
> (unsigned long)pgio->pg_layout_private)
> - return req->wb_bytes;
> - return 0;
> + return 0;

objio_osd can enjoy the new facility by returning the
remainder here:

max_io = (unsigned long)pgio->pg_layout_private;

wb_bytes = min(size, req->wb_bytes);

if (pgio->pg_count + req->wb_bytes > max_io)
wb_bytes = max_io - pgio->pg_count;

return wb_bytes;

Which reminds me that this code sucks and I need to fix it. I will do
so after you send your changes.

[I promise to test these guys soon. Can you please put them on a public tree?]

Thanks
Boaz

> + return min(size, req->wb_bytes);
> }
>
> static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 6201bf6..7c89385 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1469,8 +1469,12 @@ size_t
> pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> struct nfs_page *req)
> {
> - if (pgio->pg_lseg == NULL)
> - return nfs_generic_pg_test(pgio, prev, req);
> + unsigned int size;
> +
> + size = nfs_generic_pg_test(pgio, prev, req);
> +
> + if (!size)
> + return 0;
>
> /*
> * Test if a nfs_page is fully contained in the pnfs_layout_range.
> @@ -1486,10 +1490,11 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
> * first byte that lies outside the pnfs_layout_range. FIXME?
> *
> */
> - if (req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
> + if (req_offset(req) >= end_offset(pgio->pg_lseg->pls_range.offset,
> pgio->pg_lseg->pls_range.length))
> - return req->wb_bytes;
> - return 0;
> + return 0;
> +
> + return min(size, req->wb_bytes);
> }
> EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
>
>