2011-02-21 17:49:41

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 0/7] pnfs write functionality (wave 4)

These apply on top of Andy's most recent wave3 patches.

They can also be found at the branch wave4-submit-1 at
git://linux-nfs.org/~iisaman/linux-pnfs.git

These patches give write functionality to the pnfs file layout driver.
The primary limitation is that we have not yet modified the commit
code, so we can not yet handle COMMITs to the data server. We get
around this for the moment by sending any WRITE to a data server which
does not have commit-to-mds set as a SYNCH write. The next patchset will
address this limitation.

Fred



2011-02-22 02:14:03

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 4/7] NFSv4.1: trigger LAYOUTGET for writes

On 2011-02-21 17:06, Fred Isaman wrote:
>
> On Feb 21, 2011, at 10:49 AM, Benny Halevy wrote:
>
>> On 2011-02-21 09:49, Fred Isaman wrote:
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> ---
>>> fs/nfs/pnfs.c | 25 +++++++++++++++++++++++++
>>> fs/nfs/pnfs.h | 7 +++++++
>>> fs/nfs/write.c | 32 ++++++++++++++++++++------------
>>> 3 files changed, 52 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index a7ea646..0ed3948d 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -876,6 +876,31 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>>> pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
>>> }
>>>
>>> +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
>>> + struct nfs_page *prev,
>>> + struct nfs_page *req)
>>> +{
>>> + if (pgio->pg_count == prev->wb_bytes) {
>>> + /* This is first coelesce call for a series of nfs_pages */
>>> + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>>> + prev->wb_context,
>>> + IOMODE_RW);
>>> + }
>>> + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
>>> +}
>>> +
>>> +/*
>>> + * rsize is already set by caller to MDS rsize.
>>> + */
>>
>> This comment is confusing and looks out of place...
>
> OK. I suggest we remove similar comment in the read code too.
>

Yeah. Works for me.

>>
>>> +void
>>> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>>> +{
>>> + struct pnfs_layoutdriver_type *ld;
>>> +
>>> + ld = NFS_SERVER(inode)->pnfs_curr_ld;
>>> + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
>>> +}
>>> +
>>> /*
>>> * Call the appropriate parallel I/O subsystem read function.
>>> */
>>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>>> index acbb778..1d4e631 100644
>>> --- a/fs/nfs/pnfs.h
>>> +++ b/fs/nfs/pnfs.h
>>> @@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
>>> enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
>>> const struct rpc_call_ops *);
>>> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
>>> +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>>> int pnfs_layout_process(struct nfs4_layoutget *lgp);
>>> void pnfs_free_lseg_list(struct list_head *tmp_list);
>>> void pnfs_destroy_layout(struct nfs_inode *);
>>> @@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
>>> pgio->pg_test = NULL;
>>> }
>>>
>>> +static inline void
>>> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
>>> +{
>>> + pgio->pg_test = NULL;
>>> +}
>>> +
>>> #endif /* CONFIG_NFS_V4_1 */
>>>
>>> #endif /* FS_NFS_PNFS_H */
>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>> index 0df18ae..6cf5de6 100644
>>> --- a/fs/nfs/write.c
>>> +++ b/fs/nfs/write.c
>>> @@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
>>> } while (nbytes != 0);
>>> atomic_set(&req->wb_complete, requests);
>>>
>>> + /* We know lseg==NULL */
>>
>> Then maybe BUG_ON or WARN_ON(lseg != NULL)?
>
> OK.
>
>
>>
>>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
>>> ClearPageError(page);
>>> offset = 0;
>>> nbytes = count;
>>> @@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
>>> nbytes -= wsize;
>>> } while (nbytes != 0);
>>>
>>> + put_lseg(lseg);
>>> return ret;
>>>
>>> out_bad:
>>> @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
>>> struct nfs_page *req;
>>> struct page **pages;
>>> struct nfs_write_data *data;
>>> + int ret;
>>>
>>> data = nfs_writedata_alloc(npages);
>>> - if (!data)
>>> - goto out_bad;
>>> -
>>> + if (!data) {
>>> + while (!list_empty(head)) {
>>> + req = nfs_list_entry(head->next);
>>
>> nit: it'd be cleaner to define a nfs_list_first_entry helper in nfs_page.h
>> rather than using a combination of list helpers and open code (head->next).
>
> I would think that would be a separate patch. As this is just a simple move of pre-existing code,
> it is easier to see what is happening if I leave the copied code the same.
>
>

OK, makes sense.

Benny

>>
>>> + nfs_list_remove_request(req);
>>> + nfs_redirty_request(req);
>>> + }
>>> + ret = -ENOMEM;
>>> + goto out;
>>> + }
>>> pages = data->pagevec;
>>> while (!list_empty(head)) {
>>> req = nfs_list_entry(head->next);
>>> @@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
>>> *pages++ = req->wb_page;
>>> }
>>> req = nfs_list_entry(data->pages.next);
>>> + if ((!lseg) && list_is_singular(&data->pages))
>>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
>>>
>>> /* Set up the argument struct */
>>> - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
>>> - out_bad:
>>> - while (!list_empty(head)) {
>>> - req = nfs_list_entry(head->next);
>>> - nfs_list_remove_request(req);
>>> - nfs_redirty_request(req);
>>> - }
>>> - return -ENOMEM;
>>> + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
>>> +out:
>>> + put_lseg(lseg);
>>
>> Shouldn't you do that only if lseg was updated above?
>>
>
> No, this is the put matching the get done in the pg_test function. I'll comment that.
>
> Fred
>

2011-02-21 17:49:43

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 6/7] NFSv4.1: remove GETATTR from ds writes

Any WRITE compound directed to a data server needs to have the
GETATTR calls suppressed.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 6 +++++-
fs/nfs/nfs4xdr.c | 8 +++++---
2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bfdd36a..4ec34e2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3148,7 +3148,11 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
{
struct nfs_server *server = NFS_SERVER(data->inode);

- data->args.bitmask = server->cache_consistency_bitmask;
+ if (data->lseg) {
+ data->args.bitmask = NULL;
+ data->res.fattr = NULL;
+ } else
+ data->args.bitmask = server->cache_consistency_bitmask;
if (!data->write_done_cb)
data->write_done_cb = nfs4_write_done_cb;
data->res.server = server;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2380c45..c35880c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2275,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_write(xdr, args, &hdr);
req->rq_snd_buf.flags |= XDRBUF_WRITE;
- encode_getfattr(xdr, args->bitmask, &hdr);
+ if (args->bitmask)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}

@@ -5694,8 +5695,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_write(xdr, res);
if (status)
goto out;
- decode_getfattr(xdr, res->fattr, res->server,
- !RPC_IS_ASYNC(rqstp->rq_task));
+ if (res->fattr)
+ decode_getfattr(xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
if (!status)
status = res->count;
out:
--
1.7.2.1


2011-02-22 01:07:07

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 4/7] NFSv4.1: trigger LAYOUTGET for writes


On Feb 21, 2011, at 10:49 AM, Benny Halevy wrote:

> On 2011-02-21 09:49, Fred Isaman wrote:
>> Signed-off-by: Fred Isaman <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 25 +++++++++++++++++++++++++
>> fs/nfs/pnfs.h | 7 +++++++
>> fs/nfs/write.c | 32 ++++++++++++++++++++------------
>> 3 files changed, 52 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index a7ea646..0ed3948d 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -876,6 +876,31 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>> pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
>> }
>>
>> +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
>> + struct nfs_page *prev,
>> + struct nfs_page *req)
>> +{
>> + if (pgio->pg_count == prev->wb_bytes) {
>> + /* This is first coelesce call for a series of nfs_pages */
>> + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
>> + prev->wb_context,
>> + IOMODE_RW);
>> + }
>> + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
>> +}
>> +
>> +/*
>> + * rsize is already set by caller to MDS rsize.
>> + */
>
> This comment is confusing and looks out of place...

OK. I suggest we remove similar comment in the read code too.

>
>> +void
>> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>> +{
>> + struct pnfs_layoutdriver_type *ld;
>> +
>> + ld = NFS_SERVER(inode)->pnfs_curr_ld;
>> + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
>> +}
>> +
>> /*
>> * Call the appropriate parallel I/O subsystem read function.
>> */
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index acbb778..1d4e631 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
>> enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
>> const struct rpc_call_ops *);
>> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
>> +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>> int pnfs_layout_process(struct nfs4_layoutget *lgp);
>> void pnfs_free_lseg_list(struct list_head *tmp_list);
>> void pnfs_destroy_layout(struct nfs_inode *);
>> @@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
>> pgio->pg_test = NULL;
>> }
>>
>> +static inline void
>> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
>> +{
>> + pgio->pg_test = NULL;
>> +}
>> +
>> #endif /* CONFIG_NFS_V4_1 */
>>
>> #endif /* FS_NFS_PNFS_H */
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 0df18ae..6cf5de6 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
>> } while (nbytes != 0);
>> atomic_set(&req->wb_complete, requests);
>>
>> + /* We know lseg==NULL */
>
> Then maybe BUG_ON or WARN_ON(lseg != NULL)?

OK.


>
>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
>> ClearPageError(page);
>> offset = 0;
>> nbytes = count;
>> @@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
>> nbytes -= wsize;
>> } while (nbytes != 0);
>>
>> + put_lseg(lseg);
>> return ret;
>>
>> out_bad:
>> @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
>> struct nfs_page *req;
>> struct page **pages;
>> struct nfs_write_data *data;
>> + int ret;
>>
>> data = nfs_writedata_alloc(npages);
>> - if (!data)
>> - goto out_bad;
>> -
>> + if (!data) {
>> + while (!list_empty(head)) {
>> + req = nfs_list_entry(head->next);
>
> nit: it'd be cleaner to define a nfs_list_first_entry helper in nfs_page.h
> rather than using a combination of list helpers and open code (head->next).

I would think that would be a separate patch. As this is just a simple move of pre-existing code,
it is easier to see what is happening if I leave the copied code the same.


>
>> + nfs_list_remove_request(req);
>> + nfs_redirty_request(req);
>> + }
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> pages = data->pagevec;
>> while (!list_empty(head)) {
>> req = nfs_list_entry(head->next);
>> @@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
>> *pages++ = req->wb_page;
>> }
>> req = nfs_list_entry(data->pages.next);
>> + if ((!lseg) && list_is_singular(&data->pages))
>> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
>>
>> /* Set up the argument struct */
>> - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
>> - out_bad:
>> - while (!list_empty(head)) {
>> - req = nfs_list_entry(head->next);
>> - nfs_list_remove_request(req);
>> - nfs_redirty_request(req);
>> - }
>> - return -ENOMEM;
>> + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
>> +out:
>> + put_lseg(lseg);
>
> Shouldn't you do that only if lseg was updated above?
>

No, this is the put matching the get done in the pg_test function. I'll comment that.

Fred


2011-02-21 17:49:42

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 2/7] NFSv4.1: add callback to nfs4_write_done

Add callback that pnfs layout driver can use to do its own handling
of data server WRITE response.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4proc.c | 14 ++++++++++----
include/linux/nfs_xdr.h | 1 +
2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f91e259..bfdd36a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3122,13 +3122,10 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
}
EXPORT_SYMBOL_GPL(nfs4_reset_read);

-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;

- if (!nfs4_sequence_done(task, &data->res.seq_res))
- return -EAGAIN;
-
if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
return -EAGAIN;
@@ -3140,11 +3137,20 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}

+static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+{
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return -EAGAIN;
+ return data->write_done_cb(task, data);
+}
+
static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
{
struct nfs_server *server = NFS_SERVER(data->inode);

data->args.bitmask = server->cache_consistency_bitmask;
+ if (!data->write_done_cb)
+ data->write_done_cb = nfs4_write_done_cb;
data->res.server = server;
data->timestamp = jiffies;

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index eb0e870..21cd41d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1038,6 +1038,7 @@ struct nfs_write_data {
unsigned int npages; /* Max length of pagevec */
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
+ int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
--
1.7.2.1


2011-02-21 17:49:43

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 4/7] NFSv4.1: trigger LAYOUTGET for writes

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/pnfs.c | 25 +++++++++++++++++++++++++
fs/nfs/pnfs.h | 7 +++++++
fs/nfs/write.c | 32 ++++++++++++++++++++------------
3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a7ea646..0ed3948d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -876,6 +876,31 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
}

+static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev,
+ struct nfs_page *req)
+{
+ if (pgio->pg_count == prev->wb_bytes) {
+ /* This is first coelesce call for a series of nfs_pages */
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ prev->wb_context,
+ IOMODE_RW);
+ }
+ return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
+}
+
+/*
+ * rsize is already set by caller to MDS rsize.
+ */
+void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+{
+ struct pnfs_layoutdriver_type *ld;
+
+ ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+}
+
/*
* Call the appropriate parallel I/O subsystem read function.
*/
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index acbb778..1d4e631 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *);
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
@@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
pgio->pg_test = NULL;
}

+static inline void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+ pgio->pg_test = NULL;
+}
+
#endif /* CONFIG_NFS_V4_1 */

#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0df18ae..6cf5de6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
} while (nbytes != 0);
atomic_set(&req->wb_complete, requests);

+ /* We know lseg==NULL */
+ lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
ClearPageError(page);
offset = 0;
nbytes = count;
@@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
nbytes -= wsize;
} while (nbytes != 0);

+ put_lseg(lseg);
return ret;

out_bad:
@@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
struct nfs_page *req;
struct page **pages;
struct nfs_write_data *data;
+ int ret;

data = nfs_writedata_alloc(npages);
- if (!data)
- goto out_bad;
-
+ if (!data) {
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_redirty_request(req);
+ }
+ ret = -ENOMEM;
+ goto out;
+ }
pages = data->pagevec;
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
@@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
*pages++ = req->wb_page;
}
req = nfs_list_entry(data->pages.next);
+ if ((!lseg) && list_is_singular(&data->pages))
+ lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);

/* Set up the argument struct */
- return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
- out_bad:
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_redirty_request(req);
- }
- return -ENOMEM;
+ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
+out:
+ put_lseg(lseg);
+ return ret;
}

static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -996,7 +1004,7 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
{
size_t wsize = NFS_SERVER(inode)->wsize;

- pgio->pg_test = NULL;
+ pnfs_pageio_init_write(pgio, inode);

if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
--
1.7.2.1


2011-02-21 18:53:29

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 5/7] NFSv4.1: implement generic pnfs layer write switch

On 2011-02-21 09:49, Fred Isaman wrote:
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 1d4e631..3766afea 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -79,6 +79,8 @@ struct pnfs_layoutdriver_type {
> * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
> */
> enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
> + enum pnfs_try_status
> + (*write_pagelist) (struct nfs_write_data *nfs_data, int how);

nit: line break

> };
>
> struct pnfs_layout_hdr {
> @@ -120,6 +122,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
> enum pnfs_iomode access_type);
> void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
> void unset_pnfs_layoutdriver(struct nfs_server *);
> +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
> + const struct rpc_call_ops *, int);
> enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
> const struct rpc_call_ops *);
> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
> @@ -200,6 +204,13 @@ pnfs_try_to_read_data(struct nfs_read_data *data,
> return PNFS_NOT_ATTEMPTED;
> }
>
> +static inline enum pnfs_try_status
> +pnfs_try_to_write_data(struct nfs_write_data *data,
> + const struct rpc_call_ops *call_ops, int how)
> +{
> + return PNFS_NOT_ATTEMPTED;
> +}
> +
> static inline bool
> pnfs_roc(struct inode *ino)
> {
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 6cf5de6..b6b683d 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
> data->res.verf = &data->verf;
> nfs_fattr_init(&data->fattr);
>
> + if (data->lseg &&
> + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
> + return 0;
> +
> return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
> }
>
> diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
> index 37a1437..8866bb3 100644
> --- a/include/linux/nfs_iostat.h
> +++ b/include/linux/nfs_iostat.h
> @@ -114,6 +114,7 @@ enum nfs_stat_eventcounters {
> NFSIOS_SHORTWRITE,
> NFSIOS_DELAY,
> NFSIOS_PNFS_READ,
> + NFSIOS_PNFS_WRITE,
> __NFSIOS_COUNTSMAX,
> };
>
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 09d9681..c82ad33 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1039,6 +1039,7 @@ struct nfs_write_data {
> struct nfs_writeargs args; /* argument struct */
> struct nfs_writeres res; /* result struct */
> struct pnfs_layout_segment *lseg;
> + const struct rpc_call_ops *mds_ops;

nit: mds_ops is not really used in the patch, just initialized.
better introduce it along with its usage in patch 7/7.

Benny

> int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
> #ifdef CONFIG_NFS_V4
> unsigned long timestamp; /* For lease renewal */

2011-02-21 17:49:42

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 3/7] NFSv4.1: Send lseg down into nfs_write_rpcsetup

We grab the lseg sent in from the doio function and attach it to
each struct nfs_write_data created. This is how the lseg will be
sent to the layout driver.

Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/write.c | 7 +++++--
include/linux/nfs_xdr.h | 1 +
2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5604854..0df18ae 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -97,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)

static void nfs_writedata_release(struct nfs_write_data *wdata)
{
+ put_lseg(wdata->lseg);
put_nfs_open_context(wdata->args.context);
nfs_writedata_free(wdata);
}
@@ -840,6 +841,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
struct nfs_write_data *data,
const struct rpc_call_ops *call_ops,
unsigned int count, unsigned int offset,
+ struct pnfs_layout_segment *lseg,
int how)
{
struct inode *inode = req->wb_context->path.dentry->d_inode;
@@ -850,6 +852,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->req = req;
data->inode = inode = req->wb_context->path.dentry->d_inode;
data->cred = req->wb_context->cred;
+ data->lseg = get_lseg(lseg);

data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -930,7 +933,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
if (nbytes < wsize)
wsize = nbytes;
ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
- wsize, offset, how);
+ wsize, offset, lseg, how);
if (ret == 0)
ret = ret2;
offset += wsize;
@@ -978,7 +981,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
req = nfs_list_entry(data->pages.next);

/* Set up the argument struct */
- return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
+ return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
out_bad:
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 21cd41d..09d9681 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1038,6 +1038,7 @@ struct nfs_write_data {
unsigned int npages; /* Max length of pagevec */
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
+ struct pnfs_layout_segment *lseg;
int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
--
1.7.2.1


2011-02-21 17:49:44

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 7/7] NFSv4.1: pnfs filelayout driver write

Allows the pnfs filelayout driver to write to the data servers.

Note that COMMIT to data servers will be implemented in a future
patch. To avoid improper behavior, for the moment any WRITE to a data
server that would also require a COMMIT to the data server is sent
NFS_FILE_SYNC.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Mingyang Guo <[email protected]>
Signed-off-by: Oleg Drokin <[email protected]>
Signed-off-by: Ricardo Labiaga <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/internal.h | 5 ++
fs/nfs/nfs4filelayout.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/nfs4proc.c | 17 ++++++++
fs/nfs/write.c | 5 ++-
include/linux/nfs_xdr.h | 2 +
5 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1a3228e..d1ddc23 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -276,6 +276,10 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);

/* write.c */
+extern int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
@@ -291,6 +295,7 @@ extern int nfs4_init_client(struct nfs_client *clp,
const char *ip_addr,
rpc_authflavor_t authflavour,
int noresvport);
+extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
extern int _nfs4_call_sync(struct nfs_server *server,
struct rpc_message *msg,
struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 553adea..bfd30de 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -189,12 +189,69 @@ static void filelayout_read_release(void *data)
rdata->mds_ops->rpc_release(data);
}

+static int filelayout_write_done_cb(struct rpc_task *task,
+ struct nfs_write_data *data)
+{
+ int reset = 0;
+
+ if (filelayout_async_handle_error(task, data->args.context->state,
+ data->ds_clp, &reset) == -EAGAIN) {
+ struct nfs_client *clp;
+
+ dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+ __func__, data->ds_clp, data->ds_clp->cl_session);
+ if (reset) {
+ nfs4_reset_write(task, data);
+ filelayout_set_lo_fail(data->lseg);
+ clp = NFS_SERVER(data->inode)->nfs_client;
+ } else
+ clp = data->ds_clp;
+ nfs_restart_rpc(task, clp);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+ &wdata->args.seq_args, &wdata->res.seq_res,
+ 0, task))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ /* Note this may cause RPC to be resent */
+ wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_release(void *data)
+{
+ struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+ wdata->mds_ops->rpc_release(data);
+}
+
struct rpc_call_ops filelayout_read_call_ops = {
.rpc_call_prepare = filelayout_read_prepare,
.rpc_call_done = filelayout_read_call_done,
.rpc_release = filelayout_read_release,
};

+struct rpc_call_ops filelayout_write_call_ops = {
+ .rpc_call_prepare = filelayout_write_prepare,
+ .rpc_call_done = filelayout_write_call_done,
+ .rpc_release = filelayout_write_release,
+};
+
static enum pnfs_try_status
filelayout_read_pagelist(struct nfs_read_data *data)
{
@@ -236,10 +293,50 @@ filelayout_read_pagelist(struct nfs_read_data *data)
return PNFS_ATTEMPTED;
}

+/* Perform async writes. */
static enum pnfs_try_status
filelayout_write_pagelist(struct nfs_write_data *data, int sync)
{
- return PNFS_NOT_ATTEMPTED;
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ loff_t offset = data->args.offset;
+ u32 j, idx;
+ struct nfs_fh *fh;
+
+ /* Retrieve the correct rpc_client for the byte range */
+ j = nfs4_fl_calc_j_index(lseg, offset);
+ idx = nfs4_fl_calc_ds_index(lseg, j);
+ ds = nfs4_fl_prepare_ds(lseg, idx);
+ if (!ds) {
+ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ return PNFS_NOT_ATTEMPTED;
+ }
+ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+ data->inode->i_ino, sync, (size_t) data->args.count, offset,
+ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+ /* We can't handle commit to ds yet */
+ if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
+ data->args.stable = NFS_FILE_SYNC;
+
+ data->write_done_cb = filelayout_write_done_cb;
+ data->ds_clp = ds->ds_clp;
+ fh = nfs4_fl_select_ds_fh(lseg, j);
+ if (fh)
+ data->args.fh = fh;
+ /*
+ * Get the file offset on the dserver. Set the write offset to
+ * this offset and save the original offset.
+ */
+ data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+ data->mds_offset = offset;
+
+ /* Perform an asynchronous write */
+ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+ &filelayout_write_call_ops, sync);
+ return PNFS_ATTEMPTED;
}

/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4ec34e2..c2ba9c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3144,6 +3144,23 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
return data->write_done_cb(task, data);
}

+/* Reset the the nfs_write_data to send the write to the MDS. */
+void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
+{
+ dprintk("%s Reset task for i/o through\n", __func__);
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ data->ds_clp = NULL;
+ data->write_done_cb = nfs4_write_done_cb;
+ data->args.fh = NFS_FH(data->inode);
+ data->args.bitmask = data->res.server->cache_consistency_bitmask;
+ data->args.offset = data->mds_offset;
+ data->res.fattr = &data->fattr;
+ task->tk_ops = data->mds_ops;
+ rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_write);
+
static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
{
struct nfs_server *server = NFS_SERVER(data->inode);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b6b683d..a10f120 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -783,7 +783,7 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}

-static int nfs_initiate_write(struct nfs_write_data *data,
+int nfs_initiate_write(struct nfs_write_data *data,
struct rpc_clnt *clnt,
const struct rpc_call_ops *call_ops,
int how)
@@ -833,6 +833,7 @@ static int nfs_initiate_write(struct nfs_write_data *data,
out:
return ret;
}
+EXPORT_SYMBOL(nfs_initiate_write);

/*
* Set up the argument/result storage required for the RPC call.
@@ -1194,6 +1195,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
static unsigned long complain;

+ /* Note this will print the MDS for a DS write */
if (time_before(complain, jiffies)) {
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
@@ -1214,6 +1216,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
+ data->mds_offset += resp->count;
argp->offset += resp->count;
argp->pgbase += resp->count;
argp->count -= resp->count;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c82ad33..3440f5a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1039,11 +1039,13 @@ struct nfs_write_data {
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
struct pnfs_layout_segment *lseg;
+ struct nfs_client *ds_clp; /* pNFS data server */
const struct rpc_call_ops *mds_ops;
int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
+ __u64 mds_offset; /* Filelayout dense stripe */
struct page *page_array[NFS_PAGEVEC_SIZE];
};

--
1.7.2.1


2011-02-21 18:49:31

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 4/7] NFSv4.1: trigger LAYOUTGET for writes

On 2011-02-21 09:49, Fred Isaman wrote:
> Signed-off-by: Fred Isaman <[email protected]>
> ---
> fs/nfs/pnfs.c | 25 +++++++++++++++++++++++++
> fs/nfs/pnfs.h | 7 +++++++
> fs/nfs/write.c | 32 ++++++++++++++++++++------------
> 3 files changed, 52 insertions(+), 12 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index a7ea646..0ed3948d 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -876,6 +876,31 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
> pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
> }
>
> +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
> + struct nfs_page *prev,
> + struct nfs_page *req)
> +{
> + if (pgio->pg_count == prev->wb_bytes) {
> + /* This is first coelesce call for a series of nfs_pages */
> + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
> + prev->wb_context,
> + IOMODE_RW);
> + }
> + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
> +}
> +
> +/*
> + * rsize is already set by caller to MDS rsize.
> + */

This comment is confusing and looks out of place...

> +void
> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
> +{
> + struct pnfs_layoutdriver_type *ld;
> +
> + ld = NFS_SERVER(inode)->pnfs_curr_ld;
> + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
> +}
> +
> /*
> * Call the appropriate parallel I/O subsystem read function.
> */
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index acbb778..1d4e631 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
> enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
> const struct rpc_call_ops *);
> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
> +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
> int pnfs_layout_process(struct nfs4_layoutget *lgp);
> void pnfs_free_lseg_list(struct list_head *tmp_list);
> void pnfs_destroy_layout(struct nfs_inode *);
> @@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
> pgio->pg_test = NULL;
> }
>
> +static inline void
> +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
> +{
> + pgio->pg_test = NULL;
> +}
> +
> #endif /* CONFIG_NFS_V4_1 */
>
> #endif /* FS_NFS_PNFS_H */
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 0df18ae..6cf5de6 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
> } while (nbytes != 0);
> atomic_set(&req->wb_complete, requests);
>
> + /* We know lseg==NULL */

Then maybe BUG_ON or WARN_ON(lseg != NULL)?

> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
> ClearPageError(page);
> offset = 0;
> nbytes = count;
> @@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
> nbytes -= wsize;
> } while (nbytes != 0);
>
> + put_lseg(lseg);
> return ret;
>
> out_bad:
> @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
> struct nfs_page *req;
> struct page **pages;
> struct nfs_write_data *data;
> + int ret;
>
> data = nfs_writedata_alloc(npages);
> - if (!data)
> - goto out_bad;
> -
> + if (!data) {
> + while (!list_empty(head)) {
> + req = nfs_list_entry(head->next);

nit: it'd be cleaner to define a nfs_list_first_entry helper in nfs_page.h
rather than using a combination of list helpers and open code (head->next).

> + nfs_list_remove_request(req);
> + nfs_redirty_request(req);
> + }
> + ret = -ENOMEM;
> + goto out;
> + }
> pages = data->pagevec;
> while (!list_empty(head)) {
> req = nfs_list_entry(head->next);
> @@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
> *pages++ = req->wb_page;
> }
> req = nfs_list_entry(data->pages.next);
> + if ((!lseg) && list_is_singular(&data->pages))
> + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
>
> /* Set up the argument struct */
> - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
> - out_bad:
> - while (!list_empty(head)) {
> - req = nfs_list_entry(head->next);
> - nfs_list_remove_request(req);
> - nfs_redirty_request(req);
> - }
> - return -ENOMEM;
> + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
> +out:
> + put_lseg(lseg);

Shouldn't you do that only if lseg was updated above?

Benny

> + return ret;
> }
>
> static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
> @@ -996,7 +1004,7 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
> {
> size_t wsize = NFS_SERVER(inode)->wsize;
>
> - pgio->pg_test = NULL;
> + pnfs_pageio_init_write(pgio, inode);
>
> if (wsize < PAGE_CACHE_SIZE)
> nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);

2011-02-21 17:49:41

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 1/7] NFSv4.1: rearrange nfs_write_rpcsetup

From: Andy Adamson <[email protected]>

Reorder nfs_write_rpcsetup, preparing for a pnfs entry point.

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/write.c | 82 +++++++++++++++++++++++++++++++------------------------
1 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index aca0268..5604854 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -782,25 +782,21 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}

-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_write_rpcsetup(struct nfs_page *req,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- unsigned int count, unsigned int offset,
- int how)
+static int nfs_initiate_write(struct nfs_write_data *data,
+ struct rpc_clnt *clnt,
+ const struct rpc_call_ops *call_ops,
+ int how)
{
- struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct inode *inode = data->inode;
int priority = flush_task_priority(how);
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
- .rpc_cred = req->wb_context->cred,
+ .rpc_cred = data->cred,
};
struct rpc_task_setup task_setup_data = {
- .rpc_client = NFS_CLIENT(inode),
+ .rpc_client = clnt,
.task = &data->task,
.rpc_message = &msg,
.callback_ops = call_ops,
@@ -811,12 +807,49 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
};
int ret = 0;

+ /* Set up the initial task struct. */
+ NFS_PROTO(inode)->write_setup(data, &msg);
+
+ dprintk("NFS: %5u initiated write call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ if (how & FLUSH_SYNC) {
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0)
+ ret = task->tk_status;
+ }
+ rpc_put_task(task);
+out:
+ return ret;
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_write_rpcsetup(struct nfs_page *req,
+ struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops,
+ unsigned int count, unsigned int offset,
+ int how)
+{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */

data->req = req;
data->inode = inode = req->wb_context->path.dentry->d_inode;
- data->cred = msg.rpc_cred;
+ data->cred = req->wb_context->cred;

data->args.fh = NFS_FH(inode);
data->args.offset = req_offset(req) + offset;
@@ -837,30 +870,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);

- /* Set up the initial task struct. */
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- dprintk("NFS: %5u initiated write call "
- "(req %s/%lld, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- count,
- (unsigned long long)data->args.offset);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
- rpc_put_task(task);
-out:
- return ret;
+ return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
}

/* If a nfs_flush_* function fails, it should remove reqs from @head and
--
1.7.2.1


2011-02-21 17:49:43

by Fred Isaman

[permalink] [raw]
Subject: [PATCH 5/7] NFSv4.1: implement generic pnfs layer write switch

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: J. Bruce Fields <[email protected]>
Signed-off-by: Mike Sager <[email protected]>
Signed-off-by: Ricardo Labiaga <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
---
fs/nfs/nfs4filelayout.c | 7 +++++++
fs/nfs/pnfs.c | 24 ++++++++++++++++++++++++
fs/nfs/pnfs.h | 11 +++++++++++
fs/nfs/write.c | 4 ++++
include/linux/nfs_iostat.h | 1 +
include/linux/nfs_xdr.h | 1 +
6 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index b9f8044..553adea 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -236,6 +236,12 @@ filelayout_read_pagelist(struct nfs_read_data *data)
return PNFS_ATTEMPTED;
}

+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
/*
* filelayout_check_layout()
*
@@ -453,6 +459,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.free_lseg = filelayout_free_lseg,
.pg_test = filelayout_pg_test,
.read_pagelist = filelayout_read_pagelist,
+ .write_pagelist = filelayout_write_pagelist,
};

static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0ed3948d..e8e7bab 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -901,6 +901,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
}

+enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *wdata,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ struct inode *inode = wdata->inode;
+ enum pnfs_try_status trypnfs;
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ wdata->mds_ops = call_ops;
+
+ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+ inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
+ trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+ if (trypnfs == PNFS_NOT_ATTEMPTED) {
+ wdata->lseg = NULL;
+ put_lseg(wdata->lseg);
+ } else
+ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+
+ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+ return trypnfs;
+}
+
/*
* Call the appropriate parallel I/O subsystem read function.
*/
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1d4e631..3766afea 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -79,6 +79,8 @@ struct pnfs_layoutdriver_type {
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
*/
enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
+ enum pnfs_try_status
+ (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
};

struct pnfs_layout_hdr {
@@ -120,6 +122,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
+ const struct rpc_call_ops *, int);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *);
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
@@ -200,6 +204,13 @@ pnfs_try_to_read_data(struct nfs_read_data *data,
return PNFS_NOT_ATTEMPTED;
}

+static inline enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6cf5de6..b6b683d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);

+ if (data->lseg &&
+ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
+ return 0;
+
return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
}

diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
index 37a1437..8866bb3 100644
--- a/include/linux/nfs_iostat.h
+++ b/include/linux/nfs_iostat.h
@@ -114,6 +114,7 @@ enum nfs_stat_eventcounters {
NFSIOS_SHORTWRITE,
NFSIOS_DELAY,
NFSIOS_PNFS_READ,
+ NFSIOS_PNFS_WRITE,
__NFSIOS_COUNTSMAX,
};

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 09d9681..c82ad33 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1039,6 +1039,7 @@ struct nfs_write_data {
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
struct pnfs_layout_segment *lseg;
+ const struct rpc_call_ops *mds_ops;
int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
--
1.7.2.1


2011-02-22 19:55:39

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 7/7] NFSv4.1: pnfs filelayout driver write

On Mon, Feb 21, 2011 at 9:49 AM, Fred Isaman <[email protected]> wrote:
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b6b683d..a10f120 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -783,7 +783,7 @@ static int flush_task_priority(int how)
> ? ? ? ?return RPC_PRIORITY_NORMAL;
> ?}
>
> -static int nfs_initiate_write(struct nfs_write_data *data,
> +int nfs_initiate_write(struct nfs_write_data *data,
> ? ? ? ? ? ? ? ? ? ? ? struct rpc_clnt *clnt,
> ? ? ? ? ? ? ? ? ? ? ? const struct rpc_call_ops *call_ops,
> ? ? ? ? ? ? ? ? ? ? ? int how)
> @@ -833,6 +833,7 @@ static int nfs_initiate_write(struct nfs_write_data *data,
> ?out:
> ? ? ? ?return ret;
> ?}
> +EXPORT_SYMBOL(nfs_initiate_write);
>

This should be EXPORT_SYMBOL_GPL.

Fred

2011-02-22 01:27:14

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH 4/7] NFSv4.1: trigger LAYOUTGET for writes

On Mon, 2011-02-21 at 10:49 -0800, Benny Halevy wrote:
> On 2011-02-21 09:49, Fred Isaman wrote:
> > @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
> > struct nfs_page *req;
> > struct page **pages;
> > struct nfs_write_data *data;
> > + int ret;
> >
> > data = nfs_writedata_alloc(npages);
> > - if (!data)
> > - goto out_bad;
> > -
> > + if (!data) {
> > + while (!list_empty(head)) {
> > + req = nfs_list_entry(head->next);
>
> nit: it'd be cleaner to define a nfs_list_first_entry helper in nfs_page.h
> rather than using a combination of list helpers and open code (head->next).

No thanks! Let's not add lots of helpers for trivial stuff unless that
results in a clear gain in type safety.

Trond

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-02-22 01:17:15

by Fred Isaman

[permalink] [raw]
Subject: Re: [PATCH 5/7] NFSv4.1: implement generic pnfs layer write switch


On Feb 21, 2011, at 10:53 AM, Benny Halevy wrote:

> On 2011-02-21 09:49, Fred Isaman wrote:
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 1d4e631..3766afea 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -79,6 +79,8 @@ struct pnfs_layoutdriver_type {
>> * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
>> */
>> enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
>> + enum pnfs_try_status
>> + (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
>
> nit: line break

OK


>
>> };
>>
>> struct pnfs_layout_hdr {
>> @@ -120,6 +122,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>> enum pnfs_iomode access_type);
>> void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
>> void unset_pnfs_layoutdriver(struct nfs_server *);
>> +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
>> + const struct rpc_call_ops *, int);
>> enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
>> const struct rpc_call_ops *);
>> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
>> @@ -200,6 +204,13 @@ pnfs_try_to_read_data(struct nfs_read_data *data,
>> return PNFS_NOT_ATTEMPTED;
>> }
>>
>> +static inline enum pnfs_try_status
>> +pnfs_try_to_write_data(struct nfs_write_data *data,
>> + const struct rpc_call_ops *call_ops, int how)
>> +{
>> + return PNFS_NOT_ATTEMPTED;
>> +}
>> +
>> static inline bool
>> pnfs_roc(struct inode *ino)
>> {
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 6cf5de6..b6b683d 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
>> data->res.verf = &data->verf;
>> nfs_fattr_init(&data->fattr);
>>
>> + if (data->lseg &&
>> + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
>> + return 0;
>> +
>> return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
>> }
>>
>> diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h
>> index 37a1437..8866bb3 100644
>> --- a/include/linux/nfs_iostat.h
>> +++ b/include/linux/nfs_iostat.h
>> @@ -114,6 +114,7 @@ enum nfs_stat_eventcounters {
>> NFSIOS_SHORTWRITE,
>> NFSIOS_DELAY,
>> NFSIOS_PNFS_READ,
>> + NFSIOS_PNFS_WRITE,
>> __NFSIOS_COUNTSMAX,
>> };
>>
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index 09d9681..c82ad33 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -1039,6 +1039,7 @@ struct nfs_write_data {
>> struct nfs_writeargs args; /* argument struct */
>> struct nfs_writeres res; /* result struct */
>> struct pnfs_layout_segment *lseg;
>> + const struct rpc_call_ops *mds_ops;
>
> nit: mds_ops is not really used in the patch, just initialized.
> better introduce it along with its usage in patch 7/7.

OK.

Fred

>
> Benny
>
>> int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data);
>> #ifdef CONFIG_NFS_V4
>> unsigned long timestamp; /* For lease renewal */