2011-04-20 16:46:31

by Benny Halevy

[permalink] [raw]
Subject: [RFC 0/27] pnfs-submit for 2.6.40

I prepared an initial version of the pnfs patch series candidate for 2.6.40.
It implements basic support for layout segments and error handling
(fallback to the MDS) for non-rpc based layout drivers.

Comments welcome! :)

The patches are also available in
git://linux-nfs.org/~bhalevy/linux-pnfs.git pnfs-submit

On top of it I rebased the following branches to allow easier testing:
pnfs-submit
pnfs-obj
pnfs
pnfsd-all
pnfsd-lexp-all
pnfs-exofs-all

These branches do not include the pnfs-block implementation yet.

generic pnfs additions:
[RFC 01/27] pnfs: CB_NOTIFY_DEVICEID
[RFC 02/27] pnfs: direct i/o
[RFC 13/27] pnfs: client stats

for pnfs-obj:
[RFC 03/27] pnfs: layoutreturn
[RFC 04/27] pnfs: layoutret_on_setattr
[RFC 05/27] pnfs: Use byte-range layout segments
[RFC 06/27] pnfs: encode_layoutreturn
[RFC 07/27] pnfs: encode_layoutcommit
[RFC 08/27] pnfs: {setup,cleanup}_layoutcommit
[RFC 09/27] pnfs: support for non-rpc layout drivers
[RFC 10/27] pnfs: {,un}set_layoutdriver methods
[RFC 11/27] pnfs: per mount layout driver private data
[RFC 12/27] pnfs: alloc and free layout_hdr layoutdriver methods
[RFC 14/27] pnfsd: introduce exp_xdr.h

pnfs-obj
[RFC 15/27] pnfs-obj: pnfs_osd XDR definitions
[RFC 16/27] pnfs-obj: pnfs_osd XDR client implementations
[RFC 17/27] exofs: pnfs-tree: Remove pnfs-osd private definitions
[RFC 18/27] pnfs-obj: Define PNFS_OBJLAYOUT Kconfig option
[RFC 19/27] pnfs-obj: objlayout driver skeleton
[RFC 20/27] pnfs-obj: objio_osd device information retrieval and caching
[RFC 21/27] pnfs-obj: objio_osd real IO implementation
[RFC 22/27] sunrpc: New xdr_rewind_stream()
[RFC 23/27] pnfs-obj: objlayout_encode_layoutreturn Implementation.
[RFC 24/27] pnfs-obj: objio_osd report osd_errors for layoutreturn
[RFC 25/27] pnfs-obj: objlayout_encode_layoutcommit implementation
[RFC 26/27] pnfs-obj: objio_osd: RAID0 support
[RFC 27/27] pnfs-obj: objio_osd: groups support

* Need to move to xdr_stream based decoding.
* Boaz has some changes queued up, including using a common
I/O engine for exofs and the pnfs-obj layout driver.





2011-04-20 17:26:55

by Benny Halevy

[permalink] [raw]
Subject: [RFC 05/27] pnfs: Use byte-range layout segments

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/callback_proc.c | 4 +-
fs/nfs/pnfs.c | 175 +++++++++++++++++++++++++++++++++++++----------
fs/nfs/pnfs.h | 6 +-
fs/nfs/read.c | 10 ++-
fs/nfs/write.c | 8 ++-
5 files changed, 156 insertions(+), 47 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 99494f6..96f35f2 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
spin_lock(&ino->i_lock);
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
mark_matching_lsegs_invalid(lo, &free_me_list,
- args->cbl_range.iomode))
+ &args->cbl_range))
rv = NFS4ERR_DELAY;
else
rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+ if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
rv = NFS4ERR_DELAY;
list_del_init(&lo->plh_bulk_recall);
spin_unlock(&ino->i_lock);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 89e7725..0b4ad1f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -261,11 +261,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(put_lseg);

-static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end: NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
+/*
+ * is l2 fully contained in l1?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (start1 <= start2) && (end1 >= end2);
+}
+
+/*
+ * is l1 and l2 intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
+{
+ u64 start1 = l1->offset;
+ u64 end1 = end_offset(start1, l1->length);
+ u64 start2 = l2->offset;
+ u64 end2 = end_offset(start2, l2->length);
+
+ return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
+ (end2 == NFS4_MAX_UINT64 || end2 > start1);
+}
+
+bool
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+ struct pnfs_layout_range *recall_range)
{
- return (recall_iomode == IOMODE_ANY ||
- lseg_iomode == recall_iomode);
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ lo_seg_intersecting(lseg_range, recall_range);
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +357,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode)
+ struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
int invalid = 0, removed = 0;
@@ -309,10 +370,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return 0;
}
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+ if (should_free_lseg(&lseg->pls_range, recall_range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
- lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
lseg->pls_range.length);
invalid++;
removed += mark_lseg_invalid(lseg, tmp_list);
@@ -338,7 +400,7 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
return 0;
}
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(lseg->pls_range.iomode, range->iomode)) {
+ if (should_free_lseg(&lseg->pls_range, range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode,
@@ -383,12 +445,17 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
{
struct pnfs_layout_hdr *lo;
LIST_HEAD(tmp_list);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };

spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+ mark_matching_lsegs_invalid(lo, &tmp_list, &range);
}
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
@@ -496,7 +563,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- u32 iomode)
+ struct pnfs_layout_range *range)
{
struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
@@ -527,11 +594,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
goto out_err_free;
}

- lgp->args.minlength = NFS4_MAX_UINT64;
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range.iomode = iomode;
- lgp->args.range.offset = 0;
- lgp->args.range.length = NFS4_MAX_UINT64;
+ lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
lgp->args.inode = ino;
lgp->args.ctx = get_nfs_open_context(ctx);
@@ -545,7 +612,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
nfs4_proc_layoutget(lgp);
if (!lseg) {
/* remember that LAYOUTGET failed and suspend trying */
- set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+ set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
}

/* free xdr pages */
@@ -718,10 +785,24 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
* are seen first.
*/
static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+ struct pnfs_layout_range *l2)
{
+ s64 d;
+
+ /* higher offset > lower offset */
+ d = l1->offset - l2->offset;
+ if (d)
+ return d;
+
+ /* longer length > shorter length */
+ d = l1->length - l2->length;
+ if (d)
+ return d;
+
/* read > read/write */
- return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+ return (int)(l2->iomode == IOMODE_READ) -
+ (int)(l1->iomode == IOMODE_READ);
}

static void
@@ -735,7 +816,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,

assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(&lp->pls_range, &lseg->pls_range) > 0)
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -814,16 +895,28 @@ pnfs_find_alloc_layout(struct inode *ino)
* READ RW true
*/
static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_range *range)
{
- return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+ struct pnfs_layout_range range1;
+
+ if ((range->iomode == IOMODE_RW &&
+ lseg->pls_range.iomode != IOMODE_RW) ||
+ !lo_seg_intersecting(&lseg->pls_range, range))
+ return 0;
+
+ /* range1 covers only the first byte in the range */
+ range1 = *range;
+ range1.length = 1;
+ return lo_seg_contained(&lseg->pls_range, &range1);
}

/*
* lookup range in layout
*/
static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
{
struct pnfs_layout_segment *lseg, *ret = NULL;

@@ -832,11 +925,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
- is_matching_lseg(lseg, iomode)) {
+ is_matching_lseg(lseg, range)) {
ret = get_lseg(lseg);
break;
}
- if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+ if (cmp_layout(range, &lseg->pls_range) > 0)
break;
}

@@ -852,8 +945,15 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
+ loff_t pos,
+ u64 count,
enum pnfs_iomode iomode)
{
+ struct pnfs_layout_range arg = {
+ .iomode = iomode,
+ .offset = pos,
+ .length = count,
+ };
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
struct pnfs_layout_hdr *lo;
@@ -881,7 +981,7 @@ pnfs_update_layout(struct inode *ino,
goto out_unlock;

/* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, iomode);
+ lseg = pnfs_find_lseg(lo, &arg);
if (lseg)
goto out_unlock;

@@ -903,7 +1003,7 @@ pnfs_update_layout(struct inode *ino,
spin_unlock(&clp->cl_lock);
}

- lseg = send_layoutget(lo, ctx, iomode);
+ lseg = send_layoutget(lo, ctx, &arg);
if (!lseg && first) {
spin_lock(&clp->cl_lock);
list_del_init(&lo->plh_layouts);
@@ -930,17 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
int status = 0;

- /* Verify we got what we asked for.
- * Note that because the xdr parsing only accepts a single
- * element array, this can fail even if the server is behaving
- * correctly.
- */
- if (lgp->args.range.iomode > res->range.iomode ||
- res->range.offset != 0 ||
- res->range.length != NFS4_MAX_UINT64) {
- status = -EINVAL;
- goto out;
- }
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
if (!lseg || IS_ERR(lseg)) {
@@ -995,8 +1084,13 @@ static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
/* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context,
+ req_offset(req),
+ pgio->pg_count,
IOMODE_READ);
- }
+ } else if (pgio->pg_lseg &&
+ req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length))
+ return 0;
return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}

@@ -1017,8 +1111,13 @@ static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
/* This is first coelesce call for a series of nfs_pages */
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
prev->wb_context,
+ req_offset(req),
+ pgio->pg_count,
IOMODE_RW);
- }
+ } else if (pgio->pg_lseg &&
+ req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length))
+ return 0;
return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
}

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3506ad4..c315109 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -136,7 +136,7 @@ void get_layout_hdr(struct pnfs_layout_hdr *lo);
void put_lseg(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type);
+ loff_t pos, u64 count, enum pnfs_iomode access_type);
void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
void unset_pnfs_layoutdriver(struct nfs_server *);
enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
@@ -158,7 +158,7 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct nfs4_state *open_state);
int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- u32 iomode);
+ struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -279,7 +279,7 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)

static inline struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
- enum pnfs_iomode access_type)
+ loff_t pos, u64 count, enum pnfs_iomode access_type)
{
return NULL;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7cded2b..f43d41a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,13 +288,17 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);

BUG_ON(desc->pg_lseg != NULL);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
do {
int ret2;

+ /* FIXME: need a new layout segment? */
+
data = list_entry(list.next, struct nfs_read_data, pages);
list_del_init(&data->pages);

@@ -351,7 +355,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_READ);

ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
0, lseg);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af0c627..5d76bf5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -936,7 +936,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
atomic_set(&req->wb_complete, requests);

BUG_ON(desc->pg_lseg);
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW);
ClearPageError(page);
offset = 0;
nbytes = desc->pg_count;
@@ -1010,7 +1012,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
}
req = nfs_list_entry(data->pages.next);
if ((!lseg) && list_is_singular(&data->pages))
- lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
+ lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+ req_offset(req), desc->pg_count,
+ IOMODE_RW);

if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
--
1.7.3.4


2011-04-22 09:09:24

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 12/27] pnfs: alloc and free layout_hdr layoutdriver methods

On 2011-04-20 23:43, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
>
> Why is this needed?
>

for allocating layout-driver private data in hdr.
I'll re-send with the usage...

>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/pnfs.c | 21 ++++++++++++++++++---
>> fs/nfs/pnfs.h | 3 +++
>> 2 files changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index afc64b3..2254362 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -188,13 +188,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
>> atomic_inc(&lo->plh_refcount);
>> }
>>
>> +static struct pnfs_layout_hdr *
>> +pnfs_alloc_layout_hdr(struct inode *ino)
>> +{
>> + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
>> + return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
>> + kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
>
> BTW: GFP_KERNEL is a bug here. It should be GFP_NOFS or else we can
> recurse back into the filesystem through the page reclaim code.
>

OK, then this needs to be fixed upstream as well,
in alloc_init_layout_hdr().

Should I send a patch?

Benny

>> +}
>> +
>> +static void
>> +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
>> +{
>> + struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
>> + return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
>> +}
>> +
>> static void
>> destroy_layout_hdr(struct pnfs_layout_hdr *lo)
>> {
>> dprintk("%s: freeing layout cache %p\n", __func__, lo);
>> BUG_ON(!list_empty(&lo->plh_layouts));
>> NFS_I(lo->plh_inode)->layout = NULL;
>> - kfree(lo);
>> + pnfs_free_layout_hdr(lo);
>> }
>>
>> static void
>> @@ -857,7 +872,7 @@ alloc_init_layout_hdr(struct inode *ino)
>> {
>> struct pnfs_layout_hdr *lo;
>>
>> - lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
>> + lo = pnfs_alloc_layout_hdr(ino);
>> if (!lo)
>> return NULL;
>> atomic_set(&lo->plh_refcount, 1);
>> @@ -890,7 +905,7 @@ pnfs_find_alloc_layout(struct inode *ino)
>> if (likely(nfsi->layout == NULL)) /* Won the race? */
>> nfsi->layout = new;
>> else
>> - kfree(new);
>> + pnfs_free_layout_hdr(new);
>> return nfsi->layout;
>> }
>>
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index bb266ba..35662ac 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -83,6 +83,9 @@ struct pnfs_layoutdriver_type {
>> int (*set_layoutdriver) (struct nfs_server *);
>> int (*unset_layoutdriver) (struct nfs_server *);
>>
>> + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
>> + void (*free_layout_hdr) (struct pnfs_layout_hdr *);
>> +
>> struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
>> void (*free_lseg) (struct pnfs_layout_segment *lseg);
>>
>
>


2011-04-20 17:26:46

by Benny Halevy

[permalink] [raw]
Subject: [RFC 04/27] pnfs: layoutret_on_setattr

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 3 +++
fs/nfs/pnfs.h | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b03defb..b4df7a6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2332,6 +2332,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs4_state *state = NULL;
int status;

+ if (pnfs_ld_layoutret_on_setattr(inode))
+ pnfs_return_layout(inode, NULL, true);
+
nfs_fattr_init(fattr);

/* Search for an existing open(O_WRITE) file */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a308f3c..3506ad4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -64,12 +64,18 @@ enum {
NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
};

+enum layoutdriver_policy_flags {
+ /* Should the pNFS client commit and return the layout upon a setattr */
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+};
+
/* Per-layout driver specific registration structure */
struct pnfs_layoutdriver_type {
struct list_head pnfs_tblid;
const u32 id;
const char *name;
struct module *owner;
+ unsigned flags;
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);

@@ -228,6 +234,16 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
put_lseg(req->wb_commit_lseg);
}

+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return false;
+ return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+ PNFS_LAYOUTRET_ON_SETATTR;
+}
+
static inline int pnfs_return_layout(struct inode *ino,
struct pnfs_layout_range *range,
bool wait)
@@ -290,6 +306,12 @@ static inline int pnfs_return_layout(struct inode *ino,
}

static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+ return false;
+}
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
--
1.7.3.4


2011-04-20 17:27:29

by Benny Halevy

[permalink] [raw]
Subject: [RFC 09/27] pnfs: support for non-rpc layout drivers

Non-rpc layout drivers, such as those for objects and blocks,
implement their own I/O path and error handling logic.
Therefore, bypass the NFS-based error handling for these layout drivers.

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/internal.h | 2 +
fs/nfs/nfs4filelayout.c | 1 +
fs/nfs/nfs4proc.c | 14 +++++++++++-
fs/nfs/pnfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/pnfs.h | 7 +++++-
include/linux/nfs_xdr.h | 2 +
6 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce..1914d2f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,8 @@ extern int nfs_migrate_page(struct address_space *,
#endif

/* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
+extern void __nfs4_write_done_cb(struct nfs_write_data *);
extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
extern int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2feab7f..e67a0d4 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -859,6 +859,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
.owner = THIS_MODULE,
+ .flags = PNFS_USE_RPC_CODE,
.alloc_lseg = filelayout_alloc_lseg,
.free_lseg = filelayout_free_lseg,
.pg_test = filelayout_pg_test,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d0eb50b..cc2cdcd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3149,6 +3149,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return err;
}

+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+ nfs_invalidate_atime(data->inode);
+}
+
static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3158,7 +3163,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN;
}

- nfs_invalidate_atime(data->inode);
+ __nfs4_read_done_cb(data);
if (task->tk_status > 0)
renew_lease(server, data->timestamp);
return 0;
@@ -3198,6 +3203,11 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
}
EXPORT_SYMBOL_GPL(nfs4_reset_read);

+void __nfs4_write_done_cb(struct nfs_write_data *data)
+{
+ nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
+}
+
static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
@@ -3208,7 +3218,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
}
if (task->tk_status >= 0) {
renew_lease(NFS_SERVER(inode), data->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+ __nfs4_write_done_cb(data);
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a5050d2..18ae397 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1130,6 +1130,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
}

+/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_write_done(struct nfs_write_data *data)
+{
+ int status;
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ if (!data->pnfs_error) {
+ __nfs4_write_done_cb(data);
+ data->mds_ops->rpc_call_done(NULL, data);
+ data->mds_ops->rpc_release(data);
+ return 0;
+ }
+
+ dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+ data->pnfs_error);
+ status = nfs_initiate_write(data, NFS_CLIENT(data->inode), data->mds_ops, NFS_FILE_SYNC);
+ return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_write_done);
+
enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_write_data *wdata,
const struct rpc_call_ops *call_ops, int how)
@@ -1155,6 +1179,30 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}

/*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_read_done(struct nfs_read_data *data)
+{
+ int status;
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
+ if (!data->pnfs_error) {
+ __nfs4_read_done_cb(data);
+ data->mds_ops->rpc_call_done(NULL, data);
+ data->mds_ops->rpc_release(data);
+ return 0;
+ }
+
+ dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+ data->pnfs_error);
+ status = nfs_initiate_read(data, NFS_CLIENT(data->inode), data->mds_ops);
+ return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_read_done);
+
+/*
* Call the appropriate parallel I/O subsystem read function.
*/
enum pnfs_try_status
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9f8e970..18b84ce 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,8 +65,11 @@ enum {
};

enum layoutdriver_policy_flags {
+ /* Should the full nfs rpc cleanup code be used after io */
+ PNFS_USE_RPC_CODE = 1 << 0,
+
/* Should the pNFS client commit and return the layout upon a setattr */
- PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
+ PNFS_LAYOUTRET_ON_SETATTR = 1 << 1,
};

/* Per-layout driver specific registration structure */
@@ -182,6 +185,8 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
+int pnfs_write_done(struct nfs_write_data *);
+int pnfs_read_done(struct nfs_read_data *);

static inline int lo_fail_bit(u32 iomode)
{
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 01eb1ae..41f896a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1108,6 +1108,7 @@ struct nfs_read_data {
const struct rpc_call_ops *mds_ops;
int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
__u64 mds_offset;
+ int pnfs_error;
struct page *page_array[NFS_PAGEVEC_SIZE];
};

@@ -1133,6 +1134,7 @@ struct nfs_write_data {
unsigned long timestamp; /* For lease renewal */
#endif
__u64 mds_offset; /* Filelayout dense stripe */
+ int pnfs_error;
struct page *page_array[NFS_PAGEVEC_SIZE];
};

--
1.7.3.4


2011-04-20 17:27:20

by Benny Halevy

[permalink] [raw]
Subject: [RFC 08/27] pnfs: {setup,cleanup}_layoutcommit

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 2 ++
fs/nfs/nfs4xdr.c | 1 +
fs/nfs/pnfs.c | 21 +++++++++++++++++++++
fs/nfs/pnfs.h | 8 ++++++++
include/linux/nfs_xdr.h | 1 +
5 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b4df7a6..d0eb50b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5807,6 +5807,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
if (nfs4_setup_sequence(server, &data->args.seq_args,
&data->res.seq_res, 1, task))
return;
+ data->res.status = -1;
rpc_call_start(task);
}

@@ -5841,6 +5842,7 @@ static void nfs4_layoutcommit_release(void *calldata)
{
struct nfs4_layoutcommit_data *data = calldata;

+ pnfs_cleanup_layoutcommit(data->args.inode, data);
/* Matched by references in pnfs_set_layoutcommit */
put_lseg(data->lseg);
put_rpccred(data->cred);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4f7bef9..23e608f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5307,6 +5307,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
int status;

status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+ res->status = status;
if (status)
return status;

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0b4ad1f..a5050d2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1181,6 +1181,19 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
return trypnfs;
}

+void pnfs_cleanup_layoutcommit(struct inode *inode,
+ struct nfs4_layoutcommit_data *data)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ /* TODO: Maybe we should avoid this by allowing the layout driver
+ * to directly xdr its layout on the wire.
+ */
+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+ nfss->pnfs_curr_ld->cleanup_layoutcommit(
+ NFS_I(inode)->layout, data);
+}
+
/*
* Currently there is only one (whole file) write lseg.
*/
@@ -1277,6 +1290,14 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
data->args.lastbytewritten = end_pos - 1;
data->res.server = NFS_SERVER(inode);

+ /* Call layout driver to set the arguments */
+ if (NFS_SERVER(inode)->pnfs_curr_ld->setup_layoutcommit) {
+ status = NFS_SERVER(inode)->pnfs_curr_ld->setup_layoutcommit(
+ NFS_I(inode)->layout, &data->args);
+ if (status)
+ goto out;
+ }
+
status = nfs4_proc_layoutcommit(data, sync);
out:
dprintk("<-- %s status %d\n", __func__, status);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 011885e..9f8e970 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -99,10 +99,16 @@ struct pnfs_layoutdriver_type {
/* device notification methods */
void (*delete_deviceid)(struct nfs4_deviceid *);

+ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutcommit_args *args);
+
void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);

+ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutcommit_data *data);
+
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
@@ -151,6 +157,8 @@ enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
const struct rpc_call_ops *, int);
enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
const struct rpc_call_ops *);
+void pnfs_cleanup_layoutcommit(struct inode *,
+ struct nfs4_layoutcommit_data *);
void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
int pnfs_layout_process(struct nfs4_layoutget *lgp);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 455ddfb..01eb1ae 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -255,6 +255,7 @@ struct nfs4_layoutcommit_res {
struct nfs_fattr *fattr;
const struct nfs_server *server;
struct nfs4_sequence_res seq_res;
+ int status;
};

struct nfs4_layoutcommit_data {
--
1.7.3.4


2011-04-20 17:27:37

by Benny Halevy

[permalink] [raw]
Subject: [RFC 10/27] pnfs: {,un}set_layoutdriver methods

For managing per nfs_server layout driver data

[was: pass mntfh down the init_pnfs path]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 13 ++++++++++++-
fs/nfs/pnfs.h | 4 ++++
2 files changed, 16 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 18ae397..afc64b3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -75,8 +75,11 @@ find_pnfs_driver(u32 id)
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
- if (nfss->pnfs_curr_ld)
+ if (nfss->pnfs_curr_ld) {
+ if (nfss->pnfs_curr_ld->unset_layoutdriver)
+ nfss->pnfs_curr_ld->unset_layoutdriver(nfss);
module_put(nfss->pnfs_curr_ld->owner);
+ }
nfss->pnfs_curr_ld = NULL;
}

@@ -115,6 +118,14 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
}
server->pnfs_curr_ld = ld_type;

+ if (ld_type->set_layoutdriver &&
+ ld_type->set_layoutdriver(server)) {
+ dprintk("%s: Error initializing mount point for layout driver %u.\n",
+ __func__, id);
+ module_put(ld_type->owner);
+ goto out_no_driver;
+ }
+
dprintk("%s: pNFS module for %u set\n", __func__, id);
return;

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 18b84ce..bb266ba 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -79,6 +79,10 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
+
+ int (*set_layoutdriver) (struct nfs_server *);
+ int (*unset_layoutdriver) (struct nfs_server *);
+
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);

--
1.7.3.4


2011-04-20 20:34:16

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 09/27] pnfs: support for non-rpc layout drivers

On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
> Non-rpc layout drivers, such as those for objects and blocks,
> implement their own I/O path and error handling logic.
> Therefore bypass NFS-based error handling for these layout drivers.
>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/internal.h | 2 +
> fs/nfs/nfs4filelayout.c | 1 +
> fs/nfs/nfs4proc.c | 14 +++++++++++-
> fs/nfs/pnfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/nfs/pnfs.h | 7 +++++-
> include/linux/nfs_xdr.h | 2 +
> 6 files changed, 71 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index ce118ce..1914d2f 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -310,6 +310,8 @@ extern int nfs_migrate_page(struct address_space *,
> #endif
>
> /* nfs4proc.c */
> +extern void __nfs4_read_done_cb(struct nfs_read_data *);
> +extern void __nfs4_write_done_cb(struct nfs_write_data *);
> extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
> extern int nfs4_init_client(struct nfs_client *clp,
> const struct rpc_timeout *timeparms,
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index 2feab7f..e67a0d4 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -859,6 +859,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
> .id = LAYOUT_NFSV4_1_FILES,
> .name = "LAYOUT_NFSV4_1_FILES",
> .owner = THIS_MODULE,
> + .flags = PNFS_USE_RPC_CODE,

This isn't being used anywhere, so why do I need it in this patch?

> .alloc_lseg = filelayout_alloc_lseg,
> .free_lseg = filelayout_free_lseg,
> .pg_test = filelayout_pg_test,
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index d0eb50b..cc2cdcd 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -3149,6 +3149,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
> return err;
> }
>
> +void __nfs4_read_done_cb(struct nfs_read_data *data)
^^^^^^^^^^^^^^^^^^^^^^^^^^^ why the wrapper?
> +{
> + nfs_invalidate_atime(data->inode);
> +}
> +
> static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
> {
> struct nfs_server *server = NFS_SERVER(data->inode);
> @@ -3158,7 +3163,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
> return -EAGAIN;
> }
>
> - nfs_invalidate_atime(data->inode);
> + __nfs4_read_done_cb(data);
> if (task->tk_status > 0)
> renew_lease(server, data->timestamp);
> return 0;
> @@ -3198,6 +3203,11 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
> }
> EXPORT_SYMBOL_GPL(nfs4_reset_read);
>
> +void __nfs4_write_done_cb(struct nfs_write_data *data)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Again, why the wrapper?

> +{
> + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
> +}
> +
> static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
> {
> struct inode *inode = data->inode;
> @@ -3208,7 +3218,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
> }
> if (task->tk_status >= 0) {
> renew_lease(NFS_SERVER(inode), data->timestamp);
> - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
> + __nfs4_write_done_cb(data);
> }
> return 0;
> }
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index a5050d2..18ae397 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1130,6 +1130,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
> pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
> }
>
> +/*
> + * Called by non rpc-based layout drivers
> + */
> +int
> +pnfs_write_done(struct nfs_write_data *data)
^^^^^^^^^^^^^^^^^^ If this is not generic to all pnfs layout drivers,
then why the apparently generic name?

Why isn't this being introduced together with a driver that actually
uses the functionality? There is no way to review it outside of that
context.

> +{
> + int status;
> +
> + put_lseg(data->lseg);
> + data->lseg = NULL;
> + if (!data->pnfs_error) {
> + __nfs4_write_done_cb(data);
> + data->mds_ops->rpc_call_done(NULL, data);
> + data->mds_ops->rpc_release(data);
> + return 0;
> + }
> +
> + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
> + data->pnfs_error);
> + status = nfs_initiate_write(data, NFS_CLIENT(data->inode), data->mds_ops, NFS_FILE_SYNC);
> + return status ? : -EAGAIN;
> +}
> +EXPORT_SYMBOL_GPL(pnfs_write_done);
> +
> enum pnfs_try_status
> pnfs_try_to_write_data(struct nfs_write_data *wdata,
> const struct rpc_call_ops *call_ops, int how)
> @@ -1155,6 +1179,30 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
> }
>
> /*
> + * Called by non rpc-based layout drivers
> + */
> +int
> +pnfs_read_done(struct nfs_read_data *data)
> +{
> + int status;
> +
> + put_lseg(data->lseg);
> + data->lseg = NULL;
> + if (!data->pnfs_error) {
> + __nfs4_read_done_cb(data);
> + data->mds_ops->rpc_call_done(NULL, data);
> + data->mds_ops->rpc_release(data);
> + return 0;
> + }
> +
> + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
> + data->pnfs_error);
> + status = nfs_initiate_read(data, NFS_CLIENT(data->inode), data->mds_ops);
> + return status ? : -EAGAIN;
> +}
> +EXPORT_SYMBOL_GPL(pnfs_read_done);
> +
> +/*
> * Call the appropriate parallel I/O subsystem read function.
> */
> enum pnfs_try_status
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 9f8e970..18b84ce 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -65,8 +65,11 @@ enum {
> };
>
> enum layoutdriver_policy_flags {
> + /* Should the full nfs rpc cleanup code be used after io */
> + PNFS_USE_RPC_CODE = 1 << 0,
> +
> /* Should the pNFS client commit and return the layout upon a setattr */
> - PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
> + PNFS_LAYOUTRET_ON_SETATTR = 1 << 1,
> };
>
> /* Per-layout driver specific registration structure */
> @@ -182,6 +185,8 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
> int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
> +int pnfs_write_done(struct nfs_write_data *);
> +int pnfs_read_done(struct nfs_read_data *);
>
> static inline int lo_fail_bit(u32 iomode)
> {
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 01eb1ae..41f896a 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1108,6 +1108,7 @@ struct nfs_read_data {
> const struct rpc_call_ops *mds_ops;
> int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
> __u64 mds_offset;
> + int pnfs_error;
> struct page *page_array[NFS_PAGEVEC_SIZE];
> };
>
> @@ -1133,6 +1134,7 @@ struct nfs_write_data {
> unsigned long timestamp; /* For lease renewal */
> #endif
> __u64 mds_offset; /* Filelayout dense stripe */
> + int pnfs_error;
> struct page *page_array[NFS_PAGEVEC_SIZE];
> };
>

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 20:49:23

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 15/27] pnfs-obj: pnfs_osd XDR definitions

On Wed, 2011-04-20 at 20:28 +0300, Benny Halevy wrote:
> * Add the pnfs_osd_xdr.h header
>
> * Define the pnfs_osd_layout structure including all its
> sub-types and constants.
> * Declare the pnfs_osd_xdr_encode/decode_layout API + all needed
> inline helpers.
>
> * Define the pnfs_osd_deviceaddr structure and all its subtypes and
> constants.
> * Declare API for encoding/decoding of a pnfs_osd_deviceaddr to/from
> XDR stream.
>
> * Define the pnfs_osd_ioerr structure, its substructures and constants.
> * Declare API for encoding/decoding of a pnfs_osd_ioerr to/from
> XDR stream.
>
> * Define the pnfs_osd_layoutupdate structure and its substructures.
> * Declare API for encoding/decoding of a pnfs_osd_layoutupdate to/from
> XDR stream.
>
> [Some extra debug-prints]
> Signed-off-by: Boaz Harrosh <[email protected]>
> [objlayout driver skeleton]
> [use __be32]
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> include/linux/pnfs_osd_xdr.h | 439 ++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 439 insertions(+), 0 deletions(-)
> create mode 100644 include/linux/pnfs_osd_xdr.h
>
> diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
> new file mode 100644
> index 0000000..6f2cedf
> --- /dev/null
> +++ b/include/linux/pnfs_osd_xdr.h
> @@ -0,0 +1,439 @@
> +/*
> + * pnfs_osd_xdr.h
> + *
> + * pNFS-osd on-the-wire data structures
> + *
> + * Copyright (C) 2007-2009 Panasas Inc.
> + * All rights reserved.
> + *
> + * Benny Halevy <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2
> + * See the file COPYING included with this distribution for more details.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the Panasas company nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
> + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
> + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +#ifndef __PNFS_OSD_XDR_H__
> +#define __PNFS_OSD_XDR_H__
> +
> +#include <linux/nfs_fs.h>
> +#include <linux/nfs_page.h>
> +#include <linux/exp_xdr.h>
> +#include <scsi/osd_protocol.h>
> +
> +#define PNFS_OSD_OSDNAME_MAXSIZE 256
> +
> +/*
> + * START OF "GENERIC" DECODE ROUTINES.
> + * These may look a little ugly since they are imported from a "generic"
> + * set of XDR encode/decode routines which are intended to be shared by
> + * all of our NFSv4 implementations (OpenBSD, MacOS X...).
> + *
> + * If the pain of reading these is too great, it should be a straightforward
> + * task to translate them into Linux-specific versions which are more
> + * consistent with the style used in NFSv2/v3...

Please do... We just got rid of these in the NFS code. The last thing we
want to do is reintroduce them in new code.

> + */
> +#define READ32(x) (x) = ntohl(*p++)
> +#define READ64(x) do { \
> + (x) = (u64)ntohl(*p++) << 32; \
> + (x) |= ntohl(*p++); \
> +} while (0)
> +#define COPYMEM(x, nbytes) do { \
> + memcpy((x), p, nbytes); \
> + p += XDR_QUADLEN(nbytes); \
> +} while (0)
> +
> +/*
> + * draft-ietf-nfsv4-minorversion-22
> + * draft-ietf-nfsv4-pnfs-obj-12
> + */
> +
> +/* Layout Structure */
> +
> +enum pnfs_osd_raid_algorithm4 {
> + PNFS_OSD_RAID_0 = 1,
> + PNFS_OSD_RAID_4 = 2,
> + PNFS_OSD_RAID_5 = 3,
> + PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
> +};
> +
> +/* struct pnfs_osd_data_map4 {
> + * uint32_t odm_num_comps;
> + * length4 odm_stripe_unit;
> + * uint32_t odm_group_width;
> + * uint32_t odm_group_depth;
> + * uint32_t odm_mirror_cnt;
> + * pnfs_osd_raid_algorithm4 odm_raid_algorithm;
> + * };
> + */
> +struct pnfs_osd_data_map {
> + u32 odm_num_comps;
> + u64 odm_stripe_unit;
> + u32 odm_group_width;
> + u32 odm_group_depth;
> + u32 odm_mirror_cnt;
> + u32 odm_raid_algorithm;
> +};
> +
> +static inline int
> +pnfs_osd_data_map_xdr_sz(void)
> +{
> + return 1 + 2 + 1 + 1 + 1 + 1;
> +}
> +
> +static inline size_t
> +pnfs_osd_data_map_incore_sz(void)
> +{
> + return sizeof(struct pnfs_osd_data_map);

Why the wrapper?

> +}
> +
> +/* struct pnfs_osd_objid4 {
> + * deviceid4 oid_device_id;
> + * uint64_t oid_partition_id;
> + * uint64_t oid_object_id;
> + * };
> + */
> +struct pnfs_osd_objid {
> + struct nfs4_deviceid oid_device_id;
> + u64 oid_partition_id;
> + u64 oid_object_id;
> +};
> +
> +/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */
> +#define _DEVID_LO(oid_device_id) \
> + (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data)
> +
> +#define _DEVID_HI(oid_device_id) \
> + (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1)
> +
> +static inline int
> +pnfs_osd_objid_xdr_sz(void)
> +{
> + return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
> +}
> +
> +static inline size_t
> +pnfs_osd_objid_incore_sz(void)
> +{
> + return sizeof(struct pnfs_osd_objid);
> +}
> +
> +enum pnfs_osd_version {
> + PNFS_OSD_MISSING = 0,
> + PNFS_OSD_VERSION_1 = 1,
> + PNFS_OSD_VERSION_2 = 2
> +};
> +
> +struct pnfs_osd_opaque_cred {
> + u32 cred_len;
> + u8 *cred;
> +};
> +
> +static inline int
> +pnfs_osd_opaque_cred_xdr_sz(__be32 *p)
> +{
> + u32 *start = p;
> + u32 n;
> +
> + READ32(n);
> + p += XDR_QUADLEN(n);
> + return p - start;
> +}
> +
> +static inline size_t
> +pnfs_osd_opaque_cred_incore_sz(__be32 *p)
> +{
> + u32 n;
> +
> + READ32(n);
> + return XDR_QUADLEN(n) * 4;
> +}
> +
> +enum pnfs_osd_cap_key_sec {
> + PNFS_OSD_CAP_KEY_SEC_NONE = 0,
> + PNFS_OSD_CAP_KEY_SEC_SSV = 1,
> +};
> +
> +/* struct pnfs_osd_object_cred4 {
> + * pnfs_osd_objid4 oc_object_id;
> + * pnfs_osd_version4 oc_osd_version;
> + * pnfs_osd_cap_key_sec4 oc_cap_key_sec;
> + * opaque oc_capability_key<>;
> + * opaque oc_capability<>;
> + * };
> + */
> +struct pnfs_osd_object_cred {
> + struct pnfs_osd_objid oc_object_id;
> + u32 oc_osd_version;
> + u32 oc_cap_key_sec;
> + struct pnfs_osd_opaque_cred oc_cap_key;
> + struct pnfs_osd_opaque_cred oc_cap;
> +};
> +
> +static inline int
> +pnfs_osd_object_cred_xdr_sz(__be32 *p)
> +{
> + __be32 *start = p;
> +
> + p += pnfs_osd_objid_xdr_sz() + 2;
> + p += pnfs_osd_opaque_cred_xdr_sz(p);
> + p += pnfs_osd_opaque_cred_xdr_sz(p);
> + return p - start;
> +}
> +
> +static inline size_t
> +pnfs_osd_object_cred_incore_sz(__be32 *p)
> +{
> + size_t sz = sizeof(struct pnfs_osd_object_cred);
> +
> + p += pnfs_osd_objid_xdr_sz() + 2;
> + sz += pnfs_osd_opaque_cred_incore_sz(p);
> + p += pnfs_osd_opaque_cred_xdr_sz(p);
> + sz += pnfs_osd_opaque_cred_incore_sz(p);
> + return sz;
> +}
> +
> +/* struct pnfs_osd_layout4 {
> + * pnfs_osd_data_map4 olo_map;
> + * uint32_t olo_comps_index;
> + * pnfs_osd_object_cred4 olo_components<>;
> + * };
> + */
> +struct pnfs_osd_layout {
> + struct pnfs_osd_data_map olo_map;
> + u32 olo_comps_index;
> + u32 olo_num_comps;
> + struct pnfs_osd_object_cred *olo_comps;
> +};
> +
> +static inline int
> +pnfs_osd_layout_xdr_sz(__be32 *p)
> +{
> + __be32 *start = p;
> + u32 n;
> +
> + p += pnfs_osd_data_map_xdr_sz() + 1;
> + READ32(n);
> + while ((int)(n--) > 0)
> + p += pnfs_osd_object_cred_xdr_sz(p);
> + return p - start;
> +}
> +
> +static inline size_t
> +pnfs_osd_layout_incore_sz(__be32 *p)
> +{
> + u32 n;
> + size_t sz;
> +
> + p += pnfs_osd_data_map_xdr_sz() + 1;
> + READ32(n);
> + sz = sizeof(struct pnfs_osd_layout);
> + while ((int)(n--) > 0) {
> + sz += pnfs_osd_object_cred_incore_sz(p);
> + p += pnfs_osd_object_cred_xdr_sz(p);
> + }
> + return sz;
> +}
> +
> +/* Device Address */
> +
> +enum pnfs_osd_targetid_type {
> + OBJ_TARGET_ANON = 1,
> + OBJ_TARGET_SCSI_NAME = 2,
> + OBJ_TARGET_SCSI_DEVICE_ID = 3,
> +};
> +
> +/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
> + * case OBJ_TARGET_SCSI_NAME:
> + * string oti_scsi_name<>;
> + *
> + * case OBJ_TARGET_SCSI_DEVICE_ID:
> + * opaque oti_scsi_device_id<>;
> + *
> + * default:
> + * void;
> + * };
> + *
> + * union pnfs_osd_targetaddr4 switch (bool ota_available) {
> + * case TRUE:
> + * netaddr4 ota_netaddr;
> + * case FALSE:
> + * void;
> + * };
> + *
> + * struct pnfs_osd_deviceaddr4 {
> + * pnfs_osd_targetid4 oda_targetid;
> + * pnfs_osd_targetaddr4 oda_targetaddr;
> + * uint64_t oda_lun;
> + * opaque oda_systemid<>;
> + * pnfs_osd_object_cred4 oda_root_obj_cred;
> + * opaque oda_osdname<>;
> + * };
> + */
> +struct pnfs_osd_targetid {
> + u32 oti_type;
> + struct nfs4_string oti_scsi_device_id;
> +};
> +
> +enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
> +
> +/* struct netaddr4 {
> + * // see struct rpcb in RFC1833
> + * string r_netid<>; // network id
> + * string r_addr<>; // universal address
> + * };
> + */
> +struct pnfs_osd_net_addr {
> + struct nfs4_string r_netid;
> + struct nfs4_string r_addr;
> +};
> +
> +struct pnfs_osd_targetaddr {
> + u32 ota_available;
> + struct pnfs_osd_net_addr ota_netaddr;
> +};
> +
> +enum {
> + NETWORK_ID_MAX = 16 / 4,
> + UNIVERSAL_ADDRESS_MAX = 64 / 4,
> + PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
> +};
> +
> +struct pnfs_osd_deviceaddr {
> + struct pnfs_osd_targetid oda_targetid;
> + struct pnfs_osd_targetaddr oda_targetaddr;
> + u8 oda_lun[8];
> + struct nfs4_string oda_systemid;
> + struct pnfs_osd_object_cred oda_root_obj_cred;
> + struct nfs4_string oda_osdname;
> +};
> +
> +enum {
> + ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
> + PNFS_OSD_DEVICEADDR_MAX =
> + PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
> + 2 /*oda_lun*/ +
> + 1 + OSD_SYSTEMID_LEN +
> + 1 + ODA_OSDNAME_MAX,
> +};
> +
> +/* LAYOUTCOMMIT: layoutupdate */
> +
> +/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
> + * case TRUE:
> + * int64_t dsu_delta;
> + * case FALSE:
> + * void;
> + * };
> + *
> + * struct pnfs_osd_layoutupdate4 {
> + * pnfs_osd_deltaspaceused4 olu_delta_space_used;
> + * bool olu_ioerr_flag;
> + * };
> + */
> +struct pnfs_osd_layoutupdate {
> + u32 dsu_valid;
> + s64 dsu_delta;
> + u32 olu_ioerr_flag;
> +};
> +
> +/* LAYOUTRETURN: I/O Error Report */
> +
> +enum pnfs_osd_errno {
> + PNFS_OSD_ERR_EIO = 1,
> + PNFS_OSD_ERR_NOT_FOUND = 2,
> + PNFS_OSD_ERR_NO_SPACE = 3,
> + PNFS_OSD_ERR_BAD_CRED = 4,
> + PNFS_OSD_ERR_NO_ACCESS = 5,
> + PNFS_OSD_ERR_UNREACHABLE = 6,
> + PNFS_OSD_ERR_RESOURCE = 7
> +};
> +
> +/* struct pnfs_osd_ioerr4 {
> + * pnfs_osd_objid4 oer_component;
> + * length4 oer_comp_offset;
> + * length4 oer_comp_length;
> + * bool oer_iswrite;
> + * pnfs_osd_errno4 oer_errno;
> + * };
> + */
> +struct pnfs_osd_ioerr {
> + struct pnfs_osd_objid oer_component;
> + u64 oer_comp_offset;
> + u64 oer_comp_length;
> + u32 oer_iswrite;
> + u32 oer_errno;
> +};
> +
> +static inline unsigned
> +pnfs_osd_ioerr_xdr_sz(void)
> +{
> + return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1;
> +}
> +
> +/* OSD XDR API */
> +
> +/* Layout helpers */
> +extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout(
> + struct pnfs_osd_layout *layout, __be32 *p);
> +
> +extern int pnfs_osd_xdr_encode_layout(
> + struct exp_xdr_stream *xdr,
> + struct pnfs_osd_layout *layout);
> +
> +/* Device Info helpers */
> +
> +/* First pass calculate total size for space needed */
> +extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(__be32 *p);
> +
> +/* Note: some strings pointed to inside @deviceaddr might point
> + * to space inside @p. @p should stay valid while @deviceaddr
> + * is in use.
> + * It is assumed that @deviceaddr points to bigger memory of size
> + * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz()
> + */
> +extern void pnfs_osd_xdr_decode_deviceaddr(
> + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
> +
> +/* For Servers */
> +extern int pnfs_osd_xdr_encode_deviceaddr(
> + struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr);
> +
> +/* layoutupdate (layout_commit) xdr helpers */
> +extern int
> +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
> + struct pnfs_osd_layoutupdate *lou);
> +extern __be32 *
> +pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p);
> +
> +/* osd_ioerror encoding/decoding (layout_return) */
> +extern int
> +pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr);
> +extern __be32 *
> +pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p);
> +
> +#endif /* __PNFS_OSD_XDR_H__ */

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 17:26:38

by Benny Halevy

[permalink] [raw]
Subject: [RFC 03/27] pnfs: layoutreturn

Signed-off-by: Alexandros Batsakis <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Marc Eshel <[email protected]>
Signed-off-by: Zhang Jingwang <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/inode.c | 3 +-
fs/nfs/nfs4proc.c | 97 ++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4xdr.c | 118 ++++++++++++++++++++++++++++++++++++++++++++--
fs/nfs/pnfs.c | 96 ++++++++++++++++++++++++++++++++++++++
fs/nfs/pnfs.h | 22 +++++++++
include/linux/nfs4.h | 1 +
include/linux/nfs_xdr.h | 23 +++++++++
7 files changed, 353 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31a..73a2529 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1424,9 +1424,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
- pnfs_destroy_layout(NFS_I(inode));
+ pnfs_return_layout(inode, NULL, true);
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
/* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9bf41ea..b03defb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5662,6 +5662,103 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
return status;
}

+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s\n", __func__);
+ if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+ &lrp->res.seq_res, 0, task))
+ return;
+ rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+ struct nfs_server *server;
+
+ dprintk("--> %s\n", __func__);
+
+ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+ return;
+
+ if (lrp->args.return_type == RETURN_FILE)
+ server = NFS_SERVER(lrp->args.inode);
+ else
+ server = NULL;
+ if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ nfs_restart_rpc(task, lrp->clp);
+ return;
+ }
+ if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
+ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+ spin_lock(&lo->plh_inode->i_lock);
+ if (lrp->res.lrs_present)
+ pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ else
+ BUG_ON(!list_empty(&lo->plh_segs));
+ spin_unlock(&lo->plh_inode->i_lock);
+ }
+ dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+ struct nfs4_layoutreturn *lrp = calldata;
+
+ dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
+ if (lrp->args.return_type == RETURN_FILE) {
+ struct inode *ino = lrp->args.inode;
+ struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
+
+ put_layout_hdr(lo);
+ }
+ kfree(calldata);
+ dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+ .rpc_call_prepare = nfs4_layoutreturn_prepare,
+ .rpc_call_done = nfs4_layoutreturn_done,
+ .rpc_release = nfs4_layoutreturn_release,
+};
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+ .rpc_argp = &lrp->args,
+ .rpc_resp = &lrp->res,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = lrp->clp->cl_rpcclient,
+ .rpc_message = &msg,
+ .callback_ops = &nfs4_layoutreturn_call_ops,
+ .callback_data = lrp,
+ .flags = RPC_TASK_ASYNC,
+ };
+ int status = 0;
+
+ dprintk("--> %s\n", __func__);
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ if (!issync)
+ goto out;
+ status = nfs4_wait_for_completion_rpc_task(task);
+ if (status != 0)
+ goto out;
+ status = task->tk_status;
+out:
+ dprintk("<-- %s\n", __func__);
+ rpc_put_task(task);
+ return status;
+}
+
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index dddfb57..53ea3e5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,12 @@ static int nfs4_stat_to_errno(int);
1 /* layoutupdate4 layout type */ + \
1 /* NULL filelayout layoutupdate4 payload */)
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
-
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ 1 /* FIXME: opaque lrf_body always empty at
+ *the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+ 1 + decode_stateid_maxsz)
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
@@ -760,7 +765,14 @@ static int nfs4_stat_to_errno(int);
decode_putfh_maxsz + \
decode_layoutcommit_maxsz + \
decode_getattr_maxsz)
-
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz)

const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
compound_encode_hdr_maxsz +
@@ -1890,6 +1902,37 @@ encode_layoutcommit(struct xdr_stream *xdr,
hdr->replen += decode_layoutcommit_maxsz;
return 0;
}
+
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+ nfs4_stateid stateid;
+ __be32 *p;
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+ *p++ = cpu_to_be32(args->reclaim);
+ *p++ = cpu_to_be32(args->layout_type);
+ *p++ = cpu_to_be32(args->range.iomode);
+ *p = cpu_to_be32(args->return_type);
+ if (args->return_type == RETURN_FILE) {
+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ spin_lock(&args->inode->i_lock);
+ memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data,
+ NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
+ p = xdr_encode_opaque_fixed(p, &stateid.data,
+ NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
+ }
+ hdr->nops++;
+ hdr->replen += decode_layoutreturn_maxsz;
+}
#endif /* CONFIG_NFS_V4_1 */

/*
@@ -2707,9 +2750,9 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
/*
* Encode LAYOUTCOMMIT request
*/
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
- struct xdr_stream *xdr,
- struct nfs4_layoutcommit_args *args)
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutcommit_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2721,7 +2764,24 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
encode_layoutcommit(xdr, args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
- return 0;
+}
+
+/*
+ * Encode LAYOUTRETURN request
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+ encode_layoutreturn(xdr, args, &hdr);
+ encode_nops(&hdr);
}
#endif /* CONFIG_NFS_V4_1 */

@@ -5202,6 +5262,27 @@ out_overflow:
return -EIO;
}

+static int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ __be32 *p;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+ if (status)
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->lrs_present = be32_to_cpup(p);
+ if (res->lrs_present)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_layoutcommit(struct xdr_stream *xdr,
struct rpc_rqst *req,
struct nfs4_layoutcommit_res *res)
@@ -6319,6 +6400,30 @@ out:
}

/*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_layoutreturn(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode LAYOUTCOMMIT response
*/
static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6544,6 +6649,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
+ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
#endif /* CONFIG_NFS_V4_1 */
};

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d9ab972..89e7725 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -321,6 +321,36 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return invalid - removed;
}

+/* Returns false if there was nothing to do, true otherwise */
+static bool
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
+ struct pnfs_layout_range *range)
+{
+ struct pnfs_layout_segment *lseg, *next;
+ bool rv = false;
+
+ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
+ __func__, lo, range->offset, range->length, range->iomode);
+ assert_spin_locked(&lo->plh_inode->i_lock);
+ if (list_empty(&lo->plh_segs)) {
+ if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+ put_layout_hdr_locked(lo);
+ return 0;
+ }
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (should_free_lseg(lseg->pls_range.iomode, range->iomode)) {
+ dprintk("%s: freeing lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
+ lseg->pls_range.length);
+ mark_lseg_invalid(lseg, tmp_list);
+ rv = true;
+ }
+ dprintk("%s:Return %d\n", __func__, rv);
+ return rv;
+}
+
/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
@@ -539,6 +569,72 @@ out_err_free:
return NULL;
}

+static int
+return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+{
+ struct nfs4_layoutreturn *lrp;
+ struct nfs_server *server = NFS_SERVER(ino);
+ int status = -ENOMEM;
+
+ dprintk("--> %s\n", __func__);
+
+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+ if (lrp == NULL) {
+ put_layout_hdr(NFS_I(ino)->layout);
+ goto out;
+ }
+ lrp->args.reclaim = 0;
+ lrp->args.layout_type = server->pnfs_curr_ld->id;
+ lrp->args.return_type = RETURN_FILE;
+ lrp->args.range = *range;
+ lrp->args.inode = ino;
+ lrp->clp = server->nfs_client;
+
+ status = nfs4_proc_layoutreturn(lrp, wait);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
+/* Initiates a LAYOUTRETURN(FILE) */
+int
+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+{
+ struct pnfs_layout_hdr *lo = NULL;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_range arg;
+ LIST_HEAD(tmp_list);
+ int status = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ arg.iomode = range ? range->iomode : IOMODE_ANY;
+ arg.offset = 0;
+ arg.length = NFS4_MAX_UINT64;
+
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s: no layout segments to return\n", __func__);
+ goto out;
+ }
+ /* Reference matched in nfs4_layoutreturn_release */
+ get_layout_hdr(lo);
+ spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&tmp_list);
+
+ /* Return layout even if layoutcommit fails */
+ status = pnfs_layoutcommit_inode(ino, wait);
+ if (status)
+ dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
+ __func__, status);
+ status = return_layout(ino, &arg, wait);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
bool pnfs_roc(struct inode *ino)
{
struct pnfs_layout_hdr *lo;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4cb0a0d..a308f3c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -123,6 +123,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);

/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -158,6 +159,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);

static inline int lo_fail_bit(u32 iomode)
{
@@ -226,6 +228,19 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
put_lseg(req->wb_commit_lseg);
}

+static inline int pnfs_return_layout(struct inode *ino,
+ struct pnfs_layout_range *range,
+ bool wait)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_server *nfss = NFS_SERVER(ino);
+
+ if (pnfs_enabled_sb(nfss) && nfsi->layout)
+ return _pnfs_return_layout(ino, range, wait);
+
+ return 0;
+}
+
#else /* CONFIG_NFS_V4_1 */

static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -267,6 +282,13 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
return PNFS_NOT_ATTEMPTED;
}

+static inline int pnfs_return_layout(struct inode *ino,
+ struct pnfs_layout_range *range,
+ bool wait)
+{
+ return 0;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 178fafe..9376eaf 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -562,6 +562,7 @@ enum {
NFSPROC4_CLNT_LAYOUTGET,
NFSPROC4_CLNT_GETDEVICEINFO,
NFSPROC4_CLNT_LAYOUTCOMMIT,
+ NFSPROC4_CLNT_LAYOUTRETURN,
};

/* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 78b101e..455ddfb 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -266,6 +266,29 @@ struct nfs4_layoutcommit_data {
struct nfs4_layoutcommit_res res;
};

+struct nfs4_layoutreturn_args {
+ __u32 reclaim;
+ __u32 layout_type;
+ __u32 return_type;
+ struct pnfs_layout_range range;
+ struct inode *inode;
+ struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_layoutreturn_res {
+ struct nfs4_sequence_res seq_res;
+ u32 lrs_present;
+ nfs4_stateid stateid;
+};
+
+struct nfs4_layoutreturn {
+ struct nfs4_layoutreturn_args args;
+ struct nfs4_layoutreturn_res res;
+ struct rpc_cred *cred;
+ struct nfs_client *clp;
+ int rpc_status;
+};
+
/*
* Arguments to the open call.
*/
--
1.7.3.4


2011-04-22 09:06:03

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 11/27] pnfs: per mount layout driver private data

On 2011-04-20 23:36, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> include/linux/nfs_fs_sb.h | 3 +++
>> 1 files changed, 3 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 216cea5..c5b3fd0 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -142,6 +142,9 @@ struct nfs_server {
>> filesystem */
>> struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
>> struct rpc_wait_queue roc_rpcwaitq;
>> + void *pnfs_ld_data; /* Per-mount data */
>> + unsigned int ds_rsize; /* Data server read size */
>> + unsigned int ds_wsize; /* Data server write size */
>
> Shouldn't the rsize and wsize be part of the layout driver private data?
>

Oops, they're not used anymore (they were previously
used by the files layout). I'll just get rid of them.

Benny

>>
>> /* the following fields are protected by nfs_client->cl_lock */
>> struct rb_root state_owners;
>


2011-04-22 08:23:31

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 04/27] pnfs: layoutret_on_setattr

On 2011-04-20 23:03, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
>> From: Andy Adamson <[email protected]>
>>
>> Signed-off-by: Andy Adamson <[email protected]>
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/nfs4proc.c | 3 +++
>> fs/nfs/pnfs.h | 22 ++++++++++++++++++++++
>> 2 files changed, 25 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index b03defb..b4df7a6 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -2332,6 +2332,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
>> struct nfs4_state *state = NULL;
>> int status;
>>
>> + if (pnfs_ld_layoutret_on_setattr(inode))
>> + pnfs_return_layout(inode, NULL, true);
>
> There is nothing that prevents further reads and writes being scheduled
> after this, so what is the plan to prevent them from being sent to the
> MDS?

The idea is that the client will acquire a fresh layout for further
I/Os. Sending I/Os to the MDS at any time should be fine,
but layoutgets should be synchronized with LAYOUTRETURN+SETATTR.

> Also, why are we doing this in the case of a file time update or a
> modebits update? It seems relevant only for the case of a size update.
>

I agree regarding file time updates, but in the case of modebits update
the client will lose its layout too as its associated capabilities will
be revoked due to the change in permissions.

Benny

>> +
>> nfs_fattr_init(fattr);
>>
>> /* Search for an existing open(O_WRITE) file */
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index a308f3c..3506ad4 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -64,12 +64,18 @@ enum {
>> NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
>> };
>>
>> +enum layoutdriver_policy_flags {
>> + /* Should the pNFS client commit and return the layout upon a setattr */
>> + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
>> +};
>> +
>> /* Per-layout driver specific registration structure */
>> struct pnfs_layoutdriver_type {
>> struct list_head pnfs_tblid;
>> const u32 id;
>> const char *name;
>> struct module *owner;
>> + unsigned flags;
>> struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
>> void (*free_lseg) (struct pnfs_layout_segment *lseg);
>>
>> @@ -228,6 +234,16 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
>> put_lseg(req->wb_commit_lseg);
>> }
>>
>> +/* Should the pNFS client commit and return the layout upon a setattr */
>> +static inline bool
>> +pnfs_ld_layoutret_on_setattr(struct inode *inode)
>> +{
>> + if (!pnfs_enabled_sb(NFS_SERVER(inode)))
>> + return false;
>> + return NFS_SERVER(inode)->pnfs_curr_ld->flags &
>> + PNFS_LAYOUTRET_ON_SETATTR;
>> +}
>> +
>> static inline int pnfs_return_layout(struct inode *ino,
>> struct pnfs_layout_range *range,
>> bool wait)
>> @@ -290,6 +306,12 @@ static inline int pnfs_return_layout(struct inode *ino,
>> }
>>
>> static inline bool
>> +pnfs_ld_layoutret_on_setattr(struct inode *inode)
>> +{
>> + return false;
>> +}
>> +
>> +static inline bool
>> pnfs_roc(struct inode *ino)
>> {
>> return false;
>


2011-04-20 17:26:19

by Benny Halevy

[permalink] [raw]
Subject: [RFC 01/27] pnfs: CB_NOTIFY_DEVICEID

From: Marc Eshel <[email protected]>

Note: This functionality is incomplete as all layout segments referring to
the 'to be removed device id' need to be reaped, and all in flight I/O drained.

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/callback.h | 20 +++++++++
fs/nfs/callback_proc.c | 50 +++++++++++++++++++++++
fs/nfs/callback_xdr.c | 96 +++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/nfs4filelayout.c | 1 +
fs/nfs/nfs4filelayout.h | 1 +
fs/nfs/nfs4filelayoutdev.c | 38 +++++++++++++++++-
fs/nfs/pnfs.h | 3 +
7 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce..892128f 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,26 @@ extern unsigned nfs4_callback_layoutrecall(

extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
extern void nfs4_cb_take_slot(struct nfs_client *clp);
+
+struct cb_devicenotifyitem {
+ uint32_t cbd_notify_type;
+ uint32_t cbd_layout_type;
+ struct nfs4_deviceid cbd_dev_id;
+ uint32_t cbd_immediate;
+};
+
+/* XXX: Should be dynamic up to max compound size */
+#define NFS4_DEV_NOTIFY_MAXENTRIES 10
+struct cb_devicenotifyargs {
+ struct sockaddr *addr;
+ int ndevs;
+ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES];
+};
+
+extern __be32 nfs4_callback_devicenotify(
+ struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps);
+
#endif /* CONFIG_NFS_V4_1 */
extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dcce..99494f6 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -241,6 +241,56 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
do_callback_layoutrecall(clp, &args);
}

+/* CB_NOTIFY_DEVICEID: hand each notified device id to the matching layout driver's delete_deviceid(); returns an NFS4 status as __be32 */
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+ void *dummy, struct cb_process_state *cps)
+{
+ int i;
+ u32 res = 0;
+ struct nfs_client *clp = cps->clp;
+ struct nfs_server *server = NULL;
+
+ dprintk("%s: -->\n", __func__);
+
+ if (!clp) {
+ res = NFS4ERR_OP_NOT_IN_SESSION;
+ goto out;
+ }
+
+ for (i = 0; i < args->ndevs; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ /* reuse the server matched for the previous item while the layout type stays the same */
+ if (!server || server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+ if (server->pnfs_curr_ld &&
+ server->pnfs_curr_ld->id == dev->cbd_layout_type) {
+ rcu_read_unlock();
+ goto found;
+ }
+ rcu_read_unlock();
+ dprintk("%s: layout type %u not found\n", __func__, dev->cbd_layout_type);
+ server = NULL; /* after a full walk the cursor is not a real entry; never dereference it on the next pass */
+ continue;
+ }
+
+ found:
+ if (!server->pnfs_curr_ld->delete_deviceid) {
+ res = NFS4ERR_NOTSUPP;
+ break;
+ }
+ if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+ dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+ "deleting instead\n", __func__);
+ server->pnfs_curr_ld->delete_deviceid(&dev->cbd_dev_id);
+ }
+
+out:
+ dprintk("%s: exit with status = %u\n", __func__, res);
+ return cpu_to_be32(res);
+}
+
int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
{
if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62..5ec2c12 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@

#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
4 + 1 + 3)
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
return status;
}

+/* Decode CB_NOTIFY_DEVICEID arguments into @args (at most NFS4_DEV_NOTIFY_MAXENTRIES items); returns 0 or a network-order NFS4ERR_* status */
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct cb_devicenotifyargs *args)
+{
+ __be32 *p;
+ __be32 status = 0;
+ u32 tmp;
+ int n, i;
+ args->ndevs = 0;
+
+ args->addr = svc_addr(rqstp);
+
+ /* Num of device notifications */
+ p = read_buf(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_RESOURCE);
+ goto out;
+ }
+ n = ntohl(*p++);
+ if (n <= 0)
+ goto out;
+
+ /* XXX: need to possibly return error in this case */
+ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
+ dprintk("%s: Processing (%d) notifications out of (%d)\n",
+ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
+ n = NFS4_DEV_NOTIFY_MAXENTRIES;
+ }
+
+ /* Decode each dev notification */
+ for (i = 0; i < n; i++) {
+ struct cb_devicenotifyitem *dev = &args->devs[i];
+
+ p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_RESOURCE);
+ goto out;
+ }
+
+ tmp = ntohl(*p++); /* bitmap size */
+ if (tmp != 1) {
+ status = htonl(NFS4ERR_INVAL);
+ goto out;
+ }
+ dev->cbd_notify_type = ntohl(*p++);
+ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+ status = htonl(NFS4ERR_INVAL);
+ goto out;
+ }
+
+ tmp = ntohl(*p++); /* opaque size */
+ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+ (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+ status = htonl(NFS4ERR_INVAL);
+ goto out;
+ }
+ dev->cbd_layout_type = ntohl(*p++);
+ memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { /* keyed on the notify type, matching the size check above: only CHANGE has the extra "immediate" word */
+ p = read_buf(xdr, sizeof(uint32_t));
+ if (unlikely(p == NULL)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out;
+ }
+ dev->cbd_immediate = ntohl(*p++);
+ } else {
+ dev->cbd_immediate = 0;
+ }
+
+ args->ndevs++;
+
+ dprintk("%s: type %d layout 0x%x immediate %d\n",
+ __func__, dev->cbd_notify_type, dev->cbd_layout_type,
+ dev->cbd_immediate);
+ }
+out:
+ dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs);
+ return status;
+}
+
static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
case OP_CB_RECALL_ANY:
case OP_CB_RECALL_SLOT:
case OP_CB_LAYOUTRECALL:
+ case OP_CB_NOTIFY_DEVICEID:
*op = &callback_ops[op_nr];
break;

- case OP_CB_NOTIFY_DEVICEID:
case OP_CB_NOTIFY:
case OP_CB_PUSH_DELEG:
case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
(callback_decode_arg_t)decode_layoutrecall_args,
.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
},
+ [OP_CB_NOTIFY_DEVICEID] = {
+ .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+ .decode_args =
+ (callback_decode_arg_t)decode_devicenotify_args,
+ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+ },
[OP_CB_SEQUENCE] = {
.process_op = (callback_process_op_t)nfs4_callback_sequence,
.decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e6e0c294..2feab7f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -867,6 +867,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .delete_deviceid = filelayout_delete_deviceid,
};

static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 7c44579..8be70ab 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -105,5 +105,6 @@ nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+void filelayout_delete_deviceid(struct nfs4_deviceid *);

#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index de5350f..601aaea 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -601,7 +601,7 @@ void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
- hlist_del_rcu(&dsaddr->node);
+ hlist_del_init_rcu(&dsaddr->node);
spin_unlock(&filelayout_deviceid_lock);

synchronize_rcu();
@@ -631,6 +631,42 @@ fail:
return NULL;
}

+static struct nfs4_file_layout_dsaddr *
+nfs4_fl_unhash_deviceid(struct nfs4_deviceid *id)
+{
+ struct nfs4_file_layout_dsaddr *d;
+ struct hlist_node *n;
+ long hash = nfs4_fl_deviceid_hash(id);
+
+ dprintk("%s: hash %ld\n", __func__, hash);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node)
+ if (!memcmp(&d->deviceid, id, sizeof(*id)))
+ goto found;
+ rcu_read_unlock();
+ return NULL;
+
+found:
+ rcu_read_unlock();
+ spin_lock(&filelayout_deviceid_lock);
+ hlist_del_init_rcu(&d->node);
+ spin_unlock(&filelayout_deviceid_lock);
+ synchronize_rcu();
+
+ return d;
+}
+
+void
+filelayout_delete_deviceid(struct nfs4_deviceid *id)
+{
+ struct nfs4_file_layout_dsaddr *d;
+
+ d = nfs4_fl_unhash_deviceid(id);
+ /* balance the initial ref taken in decode_and_add_device */
+ if (d && atomic_dec_and_test(&d->ref))
+ nfs4_fl_free_deviceid(d);
+}
+
/*
* Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
* Then: ((res + fsi) % dsaddr->stripe_count)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index bc48272..4cb0a0d 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -89,6 +89,9 @@ struct pnfs_layoutdriver_type {
*/
enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+
+ /* device notification methods */
+ void (*delete_deviceid)(struct nfs4_deviceid *);
};

struct pnfs_layout_hdr {
--
1.7.3.4


2011-04-20 17:29:17

by Benny Halevy

[permalink] [raw]
Subject: [RFC 24/27] pnfs-obj: objio_osd report osd_errors for layoutreturn

From: Boaz Harrosh <[email protected]>

* Allocate io-error descriptors space as part of io_state
* Use generic objlayout error reporting at end of io.

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 10 +++++++++-
1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 00e6084..027ba38 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -325,13 +325,17 @@ int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
struct objio_state *ios;
const unsigned first_size = sizeof(*ios) +
objio_seg->num_comps * sizeof(ios->per_dev[0]);
+ const unsigned sec_size = objio_seg->num_comps *
+ sizeof(ios->ol_state.ioerrs[0]);

dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
- ios = kzalloc(first_size, GFP_KERNEL);
+ ios = kzalloc(first_size + sec_size, GFP_KERNEL);
if (unlikely(!ios))
return -ENOMEM;

ios->objio_seg = objio_seg;
+ ios->ol_state.ioerrs = ((void *)ios) + first_size;
+ ios->ol_state.num_comps = objio_seg->num_comps;

*outp = &ios->ol_state;
return 0;
@@ -418,6 +422,10 @@ static int _io_check(struct objio_state *ios, bool is_write)

continue; /* we recovered */
}
+ objlayout_io_set_result(&ios->ol_state, i,
+ osd_pri_2_pnfs_err(osi.osd_err_pri),
+ ios->ol_state.offset, ios->length,
+ is_write);

if (osi.osd_err_pri >= oep) {
oep = osi.osd_err_pri;
--
1.7.3.4


2011-04-20 17:29:03

by Benny Halevy

[permalink] [raw]
Subject: [RFC 22/27] sunrpc: New xdr_rewind_stream()

In a long encoded xdr stream, we might run out of allocated xdr space.
In some situations it is possible to reset the xdr buffer to a previous
good state and send a partial list, which is better than just BUGing as
today or completely failing the xdr.

* define such API that can move the xdr pointer to a good known
state before the failed encoding.

Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
include/linux/sunrpc/xdr.h | 1 +
net/sunrpc/xdr.c | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index fc84b7a..bf17e38 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -213,6 +213,7 @@ typedef int (*kxdrdproc_t)(void *rqstp, struct xdr_stream *xdr, void *obj);

extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
+extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q);
extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
unsigned int base, unsigned int len);
extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 679cd67..3e0d79e 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -518,6 +518,27 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
EXPORT_SYMBOL_GPL(xdr_reserve_space);

/**
+ * xdr_rewind_stream - rewind a stream back to some checkpoint
+ * @xdr: pointer to xdr_stream
+ * @q: some checkpoint at historical place of @xdr
+ *
+ * Restores an xdr stream to some historical point. @q must be
+ * a logical xdr point in the past that was sampled by @q = @xdr->p.
+ */
+__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
+{
+ size_t nbytes = (xdr->p - q) << 2;
+
+ BUG_ON(xdr->p < q);
+ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
+ xdr->p = q;
+ xdr->iov->iov_len -= nbytes;
+ xdr->buf->len -= nbytes;
+ return q;
+}
+EXPORT_SYMBOL_GPL(xdr_rewind_stream);
+
+/**
* xdr_write_pages - Insert a list of pages into an XDR buffer for sending
* @xdr: pointer to xdr_stream
* @pages: list of pages
--
1.7.3.4


2011-04-20 17:28:13

by Benny Halevy

[permalink] [raw]
Subject: [RFC 15/27] pnfs-obj: pnfs_osd XDR definitions

* Add the pnfs_osd_xdr.h header

* Define the pnfs_osd_layout structure including all its
sub-types and constants.
* Declare the pnfs_osd_xdr_encode/decode_layout API + all needed
inline helpers.

* Define the pnfs_osd_deviceaddr structure and all its subtypes and
constants.
* Declare API for encoding/decoding of a pnfs_osd_deviceaddr to/from
XDR stream.

* Define the pnfs_osd_ioerr structure, its substructures and constants.
* Declare API for encoding/decoding of a pnfs_osd_ioerr to/from
XDR stream.

* Define the pnfs_osd_layoutupdate structure and its substructures.
* Declare API for encoding/decoding of a pnfs_osd_layoutupdate to/from
XDR stream.

[Some extra debug-prints]
Signed-off-by: Boaz Harrosh <[email protected]>
[objlayout driver skeleton]
[use __be32]
Signed-off-by: Benny Halevy <[email protected]>
---
include/linux/pnfs_osd_xdr.h | 439 ++++++++++++++++++++++++++++++++++++++++++
1 files changed, 439 insertions(+), 0 deletions(-)
create mode 100644 include/linux/pnfs_osd_xdr.h

diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
new file mode 100644
index 0000000..6f2cedf
--- /dev/null
+++ b/include/linux/pnfs_osd_xdr.h
@@ -0,0 +1,439 @@
+/*
+ * pnfs_osd_xdr.h
+ *
+ * pNFS-osd on-the-wire data structures
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __PNFS_OSD_XDR_H__
+#define __PNFS_OSD_XDR_H__
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/exp_xdr.h>
+#include <scsi/osd_protocol.h>
+
+#define PNFS_OSD_OSDNAME_MAXSIZE 256
+
+/*
+ * START OF "GENERIC" DECODE ROUTINES.
+ * These may look a little ugly since they are imported from a "generic"
+ * set of XDR encode/decode routines which are intended to be shared by
+ * all of our NFSv4 implementations (OpenBSD, MacOS X...).
+ *
+ * If the pain of reading these is too great, it should be a straightforward
+ * task to translate them into Linux-specific versions which are more
+ * consistent with the style used in NFSv2/v3...
+ */
+#define READ32(x) (x) = ntohl(*p++)
+#define READ64(x) do { \
+ (x) = (u64)ntohl(*p++) << 32; \
+ (x) |= ntohl(*p++); \
+} while (0)
+#define COPYMEM(x, nbytes) do { \
+ memcpy((x), p, nbytes); \
+ p += XDR_QUADLEN(nbytes); \
+} while (0)
+
+/*
+ * draft-ietf-nfsv4-minorversion-22
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/* Layout Structure */
+
+enum pnfs_osd_raid_algorithm4 {
+ PNFS_OSD_RAID_0 = 1,
+ PNFS_OSD_RAID_4 = 2,
+ PNFS_OSD_RAID_5 = 3,
+ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
+};
+
+/* struct pnfs_osd_data_map4 {
+ * uint32_t odm_num_comps;
+ * length4 odm_stripe_unit;
+ * uint32_t odm_group_width;
+ * uint32_t odm_group_depth;
+ * uint32_t odm_mirror_cnt;
+ * pnfs_osd_raid_algorithm4 odm_raid_algorithm;
+ * };
+ */
+struct pnfs_osd_data_map {
+ u32 odm_num_comps;
+ u64 odm_stripe_unit;
+ u32 odm_group_width;
+ u32 odm_group_depth;
+ u32 odm_mirror_cnt;
+ u32 odm_raid_algorithm;
+};
+
+static inline int
+pnfs_osd_data_map_xdr_sz(void)
+{
+ return 1 + 2 + 1 + 1 + 1 + 1;
+}
+
+static inline size_t
+pnfs_osd_data_map_incore_sz(void)
+{
+ return sizeof(struct pnfs_osd_data_map);
+}
+
+/* struct pnfs_osd_objid4 {
+ * deviceid4 oid_device_id;
+ * uint64_t oid_partition_id;
+ * uint64_t oid_object_id;
+ * };
+ */
+struct pnfs_osd_objid {
+ struct nfs4_deviceid oid_device_id;
+ u64 oid_partition_id;
+ u64 oid_object_id;
+};
+
+/* For printout. I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */
+#define _DEVID_LO(oid_device_id) \
+ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data)
+
+#define _DEVID_HI(oid_device_id) \
+ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1)
+
+static inline int
+pnfs_osd_objid_xdr_sz(void)
+{
+ return (NFS4_DEVICEID4_SIZE / 4) + 2 + 2;
+}
+
+static inline size_t
+pnfs_osd_objid_incore_sz(void)
+{
+ return sizeof(struct pnfs_osd_objid);
+}
+
+enum pnfs_osd_version {
+ PNFS_OSD_MISSING = 0,
+ PNFS_OSD_VERSION_1 = 1,
+ PNFS_OSD_VERSION_2 = 2
+};
+
+struct pnfs_osd_opaque_cred {
+ u32 cred_len;
+ u8 *cred;
+};
+
+static inline int
+pnfs_osd_opaque_cred_xdr_sz(__be32 *p)
+{
+ __be32 *start = p; /* wire pointer: keep the __be32 annotation, as the other *_xdr_sz() helpers in this header do */
+ u32 n; /* byte length of the opaque, read from the stream */
+
+ READ32(n);
+ p += XDR_QUADLEN(n);
+ return p - start; /* length word plus padded payload, counted in 32-bit XDR words */
+}
+
+static inline size_t
+pnfs_osd_opaque_cred_incore_sz(__be32 *p)
+{
+ u32 n;
+
+ READ32(n);
+ return XDR_QUADLEN(n) * 4;
+}
+
+enum pnfs_osd_cap_key_sec {
+ PNFS_OSD_CAP_KEY_SEC_NONE = 0,
+ PNFS_OSD_CAP_KEY_SEC_SSV = 1,
+};
+
+/* struct pnfs_osd_object_cred4 {
+ * pnfs_osd_objid4 oc_object_id;
+ * pnfs_osd_version4 oc_osd_version;
+ * pnfs_osd_cap_key_sec4 oc_cap_key_sec;
+ * opaque oc_capability_key<>;
+ * opaque oc_capability<>;
+ * };
+ */
+struct pnfs_osd_object_cred {
+ struct pnfs_osd_objid oc_object_id;
+ u32 oc_osd_version;
+ u32 oc_cap_key_sec;
+ struct pnfs_osd_opaque_cred oc_cap_key;
+ struct pnfs_osd_opaque_cred oc_cap;
+};
+
+static inline int
+pnfs_osd_object_cred_xdr_sz(__be32 *p)
+{
+ __be32 *start = p;
+
+ p += pnfs_osd_objid_xdr_sz() + 2;
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ return p - start;
+}
+
+static inline size_t
+pnfs_osd_object_cred_incore_sz(__be32 *p)
+{
+ size_t sz = sizeof(struct pnfs_osd_object_cred);
+
+ p += pnfs_osd_objid_xdr_sz() + 2;
+ sz += pnfs_osd_opaque_cred_incore_sz(p);
+ p += pnfs_osd_opaque_cred_xdr_sz(p);
+ sz += pnfs_osd_opaque_cred_incore_sz(p);
+ return sz;
+}
+
+/* struct pnfs_osd_layout4 {
+ * pnfs_osd_data_map4 olo_map;
+ * uint32_t olo_comps_index;
+ * pnfs_osd_object_cred4 olo_components<>;
+ * };
+ */
+struct pnfs_osd_layout {
+ struct pnfs_osd_data_map olo_map;
+ u32 olo_comps_index;
+ u32 olo_num_comps;
+ struct pnfs_osd_object_cred *olo_comps;
+};
+
+static inline int
+pnfs_osd_layout_xdr_sz(__be32 *p)
+{
+ __be32 *start = p;
+ u32 n;
+
+ p += pnfs_osd_data_map_xdr_sz() + 1;
+ READ32(n);
+ while ((int)(n--) > 0)
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ return p - start;
+}
+
+/*
+ * In-core size, in bytes, of the pnfs_osd_layout4 encoded at @p:
+ * sizeof(struct pnfs_osd_layout) plus the in-core size of every component.
+ * NOTE(review): as with pnfs_osd_layout_xdr_sz(), the component count comes
+ * from the buffer and is not checked against any buffer end here.
+ */
+static inline size_t
+pnfs_osd_layout_incore_sz(__be32 *p)
+{
+ u32 n;
+ size_t sz;
+
+ /* skip olo_map and the single olo_comps_index word */
+ p += pnfs_osd_data_map_xdr_sz() + 1;
+ /* presumably decodes the olo_components<> count and advances p */
+ READ32(n);
+ sz = sizeof(struct pnfs_osd_layout);
+ /* the (int) cast makes a count with the top bit set run zero iterations */
+ while ((int)(n--) > 0) {
+ sz += pnfs_osd_object_cred_incore_sz(p);
+ /* step over this component to reach the next one */
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ }
+ return sz;
+}
+
+/* Device Address */
+
+enum pnfs_osd_targetid_type {
+ OBJ_TARGET_ANON = 1,
+ OBJ_TARGET_SCSI_NAME = 2,
+ OBJ_TARGET_SCSI_DEVICE_ID = 3,
+};
+
+/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) {
+ * case OBJ_TARGET_SCSI_NAME:
+ * string oti_scsi_name<>;
+ *
+ * case OBJ_TARGET_SCSI_DEVICE_ID:
+ * opaque oti_scsi_device_id<>;
+ *
+ * default:
+ * void;
+ * };
+ *
+ * union pnfs_osd_targetaddr4 switch (bool ota_available) {
+ * case TRUE:
+ * netaddr4 ota_netaddr;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct pnfs_osd_deviceaddr4 {
+ * pnfs_osd_targetid4 oda_targetid;
+ * pnfs_osd_targetaddr4 oda_targetaddr;
+ * uint64_t oda_lun;
+ * opaque oda_systemid<>;
+ * pnfs_osd_object_cred4 oda_root_obj_cred;
+ * opaque oda_osdname<>;
+ * };
+ */
+struct pnfs_osd_targetid {
+ u32 oti_type;
+ struct nfs4_string oti_scsi_device_id;
+};
+
+enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
+
+/* struct netaddr4 {
+ * // see struct rpcb in RFC1833
+ * string r_netid<>; // network id
+ * string r_addr<>; // universal address
+ * };
+ */
+struct pnfs_osd_net_addr {
+ struct nfs4_string r_netid;
+ struct nfs4_string r_addr;
+};
+
+struct pnfs_osd_targetaddr {
+ u32 ota_available;
+ struct pnfs_osd_net_addr ota_netaddr;
+};
+
+enum {
+ NETWORK_ID_MAX = 16 / 4,
+ UNIVERSAL_ADDRESS_MAX = 64 / 4,
+ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
+};
+
+struct pnfs_osd_deviceaddr {
+ struct pnfs_osd_targetid oda_targetid;
+ struct pnfs_osd_targetaddr oda_targetaddr;
+ u8 oda_lun[8];
+ struct nfs4_string oda_systemid;
+ struct pnfs_osd_object_cred oda_root_obj_cred;
+ struct nfs4_string oda_osdname;
+};
+
+enum {
+ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
+ PNFS_OSD_DEVICEADDR_MAX =
+ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
+ 2 /*oda_lun*/ +
+ 1 + OSD_SYSTEMID_LEN +
+ 1 + ODA_OSDNAME_MAX,
+};
+
+/* LAYOUTCOMMIT: layoutupdate */
+
+/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
+ * case TRUE:
+ * int64_t dsu_delta;
+ * case FALSE:
+ * void;
+ * };
+ *
+ * struct pnfs_osd_layoutupdate4 {
+ * pnfs_osd_deltaspaceused4 olu_delta_space_used;
+ * bool olu_ioerr_flag;
+ * };
+ */
+struct pnfs_osd_layoutupdate {
+ u32 dsu_valid;
+ s64 dsu_delta;
+ u32 olu_ioerr_flag;
+};
+
+/* LAYOUTRETURN: I/O Error Report */
+
+enum pnfs_osd_errno {
+ PNFS_OSD_ERR_EIO = 1,
+ PNFS_OSD_ERR_NOT_FOUND = 2,
+ PNFS_OSD_ERR_NO_SPACE = 3,
+ PNFS_OSD_ERR_BAD_CRED = 4,
+ PNFS_OSD_ERR_NO_ACCESS = 5,
+ PNFS_OSD_ERR_UNREACHABLE = 6,
+ PNFS_OSD_ERR_RESOURCE = 7
+};
+
+/* struct pnfs_osd_ioerr4 {
+ * pnfs_osd_objid4 oer_component;
+ * length4 oer_comp_offset;
+ * length4 oer_comp_length;
+ * bool oer_iswrite;
+ * pnfs_osd_errno4 oer_errno;
+ * };
+ */
+struct pnfs_osd_ioerr {
+ struct pnfs_osd_objid oer_component;
+ u64 oer_comp_offset;
+ u64 oer_comp_length;
+ u32 oer_iswrite;
+ u32 oer_errno;
+};
+
+/* Size of one encoded pnfs_osd_ioerr4, counted in 32-bit XDR words */
+static inline unsigned
+pnfs_osd_ioerr_xdr_sz(void)
+{
+	/* oer_comp_offset (2) + oer_comp_length (2) + oer_iswrite + oer_errno */
+	const unsigned fixed_words = 2 + 2 + 1 + 1;
+
+	return pnfs_osd_objid_xdr_sz() + fixed_words;
+}
+
+/* OSD XDR API */
+
+/* Layout helpers */
+extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout(
+ struct pnfs_osd_layout *layout, __be32 *p);
+
+extern int pnfs_osd_xdr_encode_layout(
+ struct exp_xdr_stream *xdr,
+ struct pnfs_osd_layout *layout);
+
+/* Device Info helpers */
+
+/* First pass calculate total size for space needed */
+extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(__be32 *p);
+
+/* Note: some strings pointed to inside @deviceaddr might point
+ * to space inside @p. @p should stay valid while @deviceaddr
+ * is in use.
+ * It is assumed that @deviceaddr points to bigger memory of size
+ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz()
+ */
+extern void pnfs_osd_xdr_decode_deviceaddr(
+ struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p);
+
+/* For Servers */
+extern int pnfs_osd_xdr_encode_deviceaddr(
+ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr);
+
+/* layoutupdate (layout_commit) xdr helpers */
+extern int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+ struct pnfs_osd_layoutupdate *lou);
+extern __be32 *
+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p);
+
+/* osd_ioerror encoding/decoding (layout_return) */
+extern int
+pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr);
+extern __be32 *
+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p);
+
+#endif /* __PNFS_OSD_XDR_H__ */
--
1.7.3.4


2011-04-20 17:28:34

by Benny Halevy

[permalink] [raw]
Subject: [RFC 18/27] pnfs-obj: Define PNFS_OBJLAYOUT Kconfig option

* Define the PNFS_OBJLAYOUT Kconfig option in the nfs
master Kconfig file.
* Add the objlayout driver to the Kernel's Kbuild system.
* Add the fs/nfs/objlayout/Kbuild file for building the
objlayoutdriver.ko driver

[pnfs-obj: remove of CONFIG_PNFS fallout]
Signed-off-by: Boaz Harrosh <[email protected]>
[added "unsure" clause]
[depend on NFS_V4_1]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/Kconfig | 10 ++++++++++
fs/nfs/Makefile | 2 ++
fs/nfs/objlayout/Kbuild | 5 +++++
3 files changed, 17 insertions(+), 0 deletions(-)
create mode 100644 fs/nfs/objlayout/Kbuild

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba30665..8151554 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
config PNFS_FILE_LAYOUT
tristate

+config PNFS_OBJLAYOUT
+ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+ help
+ Say M here if you want your pNFS client to support the Objects Layout Driver.
+ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+ upper level driver (SCSI_OSD_ULD).
+
+ If unsure, say N.
+
config ROOT_NFS
bool "Root file system on NFS"
depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9..c9574f0 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -21,3 +21,5 @@ nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o

obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 0000000..c326738
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := pnfs_osd_xdr_cli.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
--
1.7.3.4


2011-04-20 17:28:06

by Benny Halevy

[permalink] [raw]
Subject: [RFC 14/27] pnfsd: introduce exp_xdr.h

Containing xdr encoding helpers.

[nfsd: fix exp_xdr_encode_u64 parameter type]
Reported-by: J. Bruce Fields <[email protected]>
[exportfs: exp_xdr.h: Use #include <linux/string.h> instead of <asm/string.h>]
Signed-off-by: Benny Halevy <[email protected]>
---
include/linux/exp_xdr.h | 141 +++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 141 insertions(+), 0 deletions(-)
create mode 100644 include/linux/exp_xdr.h

diff --git a/include/linux/exp_xdr.h b/include/linux/exp_xdr.h
new file mode 100644
index 0000000..b69c309
--- /dev/null
+++ b/include/linux/exp_xdr.h
@@ -0,0 +1,141 @@
+#ifndef _LINUX_EXP_XDR_H
+#define _LINUX_EXP_XDR_H
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/string.h>
+
+/* Encode cursor over a buffer of 32-bit XDR words */
+struct exp_xdr_stream {
+ __be32 *p; /* current position; advanced by exp_xdr_reserve_space() */
+ __be32 *end; /* a reservation that would move p past this fails */
+};
+
+/**
+ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes
+ * @nbytes: number of bytes to encode
+ *
+ * Rounds up to whole 32-bit words: 0 -> 0, 1..4 -> 1, 5..8 -> 2, ...
+ */
+static inline size_t
+exp_xdr_qwords(__u32 nbytes)
+{
+	/*
+	 * Deliberately not DIV_ROUND_UP(nbytes, 4): adding 3 to a __u32
+	 * wraps for nbytes > 0xfffffffc and would report 0 words.
+	 */
+	return (size_t)(nbytes >> 2) + ((nbytes & 3) != 0);
+}
+
+/**
+ * exp_xdr_qbytes - Convert a count of quad-words into a count of bytes
+ * @qwords: number of quad-words (32-bit words) to encode
+ */
+static inline size_t
+exp_xdr_qbytes(size_t qwords)
+{
+	return qwords * sizeof(__be32);
+}
+
+/**
+ * exp_xdr_reserve_space - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @nbytes: number of bytes to reserve
+ *
+ * Checks that we have enough buffer space to encode 'nbytes' more
+ * bytes of data, rounded up to a whole number of 32-bit words.
+ * If so, advances the stream and returns the start of the reserved
+ * area; otherwise returns NULL and leaves the stream untouched.
+ */
+static inline __be32 *
+exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr->p;
+	size_t qwords;
+
+	/*
+	 * Round up in size_t: going through a 32-bit byte count would
+	 * silently truncate large requests and let them pass the check.
+	 */
+	qwords = nbytes / 4 + (nbytes % 4 != 0);
+
+	/*
+	 * Compare against the room that is left rather than forming
+	 * p + qwords, which may overflow the pointer.
+	 */
+	if (unlikely(p > xdr->end || qwords > (size_t)(xdr->end - p)))
+		return NULL;
+	xdr->p = p + qwords;
+	return p;
+}
+
+/**
+ * exp_xdr_reserve_qwords - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @qwords: number of quad words (u32's) to reserve
+ *
+ * Returns the start of the reserved area and advances the stream, or
+ * returns NULL, leaving the stream untouched, when fewer than @qwords
+ * words are left.
+ */
+static inline __be32 *
+exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords)
+{
+	__be32 *p = xdr->p;
+
+	/*
+	 * Check in words directly: converting to bytes and back can
+	 * overflow (qwords << 2) or be truncated to 32 bits, and either
+	 * would let an oversized request through.
+	 */
+	if (unlikely(p > xdr->end || qwords > (size_t)(xdr->end - p)))
+		return NULL;
+	xdr->p = p + qwords;
+	return p;
+}
+
+/**
+ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ *
+ * Stores @val big-endian at @p and returns the word that follows it.
+ */
+static inline __be32 *
+exp_xdr_encode_u32(__be32 *p, __u32 val)
+{
+	*p++ = cpu_to_be32(val);
+	return p;
+}
+
+/**
+ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ *
+ * Stores @val big-endian at @p and returns the position two words on.
+ */
+static inline __be32 *
+exp_xdr_encode_u64(__be32 *p, __u64 val)
+{
+	put_unaligned_be64(val, p);
+	p += 2;
+	return p;
+}
+
+/**
+ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the array of bytes
+ * @nbytes: number of bytes to encode
+ *
+ * Copies @nbytes bytes to @p, zero-fills up to the next 32-bit boundary
+ * and returns the position after the padding. Returns @p unchanged when
+ * @nbytes is 0.
+ */
+static inline __be32 *
+exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes)
+{
+	char *dst = (char *)p;
+	unsigned int nwords, tail;
+
+	if (unlikely(nbytes == 0))
+		return p;
+
+	nwords = exp_xdr_qwords(nbytes);
+	tail = exp_xdr_qbytes(nwords) - nbytes;
+
+	memcpy(dst, ptr, nbytes);
+	if (tail)
+		memset(dst + nbytes, 0, tail);
+	return p + nwords;
+}
+
+/**
+ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the opaque array
+ * @nbytes: number of bytes to encode
+ *
+ * Writes the 32-bit byte count, then the opaque data itself, and
+ * returns the position after the encoded data.
+ */
+static inline __be32 *
+exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes)
+{
+	__be32 *data = exp_xdr_encode_u32(p, nbytes);
+
+	return exp_xdr_encode_bytes(data, ptr, nbytes);
+}
+
+/**
+ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream
+ * @lenp: pointer to the opaque length destination
+ * @endp: pointer to the end of the opaque array
+ *
+ * Encodes the 32-bit opaque size in bytes given the start and end pointers.
+ * The opaque data is taken to start at the word following @lenp.
+ * Returns the position after the data rounded up to whole 32-bit words;
+ * the bytes between @endp and that position are not written here.
+ */
+static inline __be32 *
+exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp)
+{
+ /* bytes between the word after the length slot and @endp */
+ size_t nbytes = (char *)endp - (char *)(lenp + 1);
+
+ exp_xdr_encode_u32(lenp, nbytes);
+ return lenp + 1 + exp_xdr_qwords(nbytes);
+}
+#endif /* _LINUX_EXP_XDR_H */
--
1.7.3.4


2011-04-20 17:29:11

by Benny Halevy

[permalink] [raw]
Subject: [RFC 23/27] pnfs-obj: objlayout_encode_layoutreturn Implementation.

From: Boaz Harrosh <[email protected]>

An io_state pre-allocates an error information structure for each
possible osd-device that might error during IO. When IO is done if all
was well the io_state is freed. (as today). If the I/O has ended with an
error, the io_state is queued on a per-layout err_list. When eventually
encode_layoutreturn() is called, each error is properly encoded on the
XDR buffer and only then the io_state is removed from err_list and
de-allocated.

It is up to the io_engine to fill in the segment that faulted and the type
of osd_error that occurred, by calling objlayout_io_set_result() for
each failing device.

Signed-off-by: Boaz Harrosh <[email protected]>
[use new alloc/free_layout API]
[apply types rename]
[convert to new pnfs-submit changes]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 2 +
fs/nfs/objlayout/objlayout.c | 227 +++++++++++++++++++++++++++++++++++++++++-
fs/nfs/objlayout/objlayout.h | 14 +++
3 files changed, 242 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 75d4ebb..00e6084 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -745,6 +745,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {

.read_pagelist = objlayout_read_pagelist,
.write_pagelist = objlayout_write_pagelist,
+
+ .encode_layoutreturn = objlayout_encode_layoutreturn,
};

void *objio_init_mt(void)
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 7c4c744..322ffa3 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -39,6 +39,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

+#include <scsi/osd_initiator.h>
#include "objlayout.h"

#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -54,6 +55,10 @@ objlayout_alloc_layout_hdr(struct inode *inode)
struct objlayout *objlay;

objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
+ if (objlay) {
+ spin_lock_init(&objlay->lock);
+ INIT_LIST_HEAD(&objlay->err_list);
+ }
dprintk("%s: Return %p\n", __func__, objlay);
return &objlay->pnfs_layout;
}
@@ -68,6 +73,7 @@ objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)

dprintk("%s: objlay %p\n", __func__, objlay);

+ WARN_ON(!list_empty(&objlay->err_list));
kfree(objlay);
}

@@ -204,6 +210,7 @@ objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
pgbase &= ~PAGE_MASK;
}

+ INIT_LIST_HEAD(&state->err_list);
state->objlseg = objlseg;
state->rpcdata = rpcdata;
state->pages = pages;
@@ -234,7 +241,54 @@ objlayout_iodone(struct objlayout_io_state *state)
{
dprintk("%s: state %p status\n", __func__, state);

- objlayout_free_io_state(state);
+	if (likely(state->status >= 0)) {
+		objlayout_free_io_state(state);
+	} else {
+		struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout);
+
+		/*
+		 * Keep the failed io_state for layoutreturn. list_add() takes
+		 * the new entry first and the list head second; with the two
+		 * swapped the head's links are overwritten and every io_state
+		 * queued earlier is lost.
+		 */
+		spin_lock(&objlay->lock);
+		list_add(&state->err_list, &objlay->err_list);
+		spin_unlock(&objlay->lock);
+	}
+}
+
+/*
+ * objlayout_io_set_result - Record the I/O result of one osd component.
+ *
+ * A non-zero @osd_error for component @index is stored, together with the
+ * affected range and direction, in the io_state's error array so it can be
+ * reported at layout-return. A zero @osd_error clears the slot's errno.
+ * @index must be below state->num_comps.
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+			int osd_error, u64 offset, u64 length, bool is_write)
+{
+	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
+	struct pnfs_osd_layout *layout;
+
+	BUG_ON(index >= state->num_comps);
+
+	if (!osd_error) {
+		/* User need not call if no error is reported */
+		ioerr->oer_errno = 0;
+		return;
+	}
+
+	layout = (typeof(layout))state->objlseg->pnfs_osd_layout;
+
+	ioerr->oer_component = layout->olo_comps[index].oc_object_id;
+	ioerr->oer_comp_offset = offset;
+	ioerr->oer_comp_length = length;
+	ioerr->oer_iswrite = is_write;
+	ioerr->oer_errno = osd_error;
+
+	dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+		"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+		__func__, index, ioerr->oer_errno,
+		ioerr->oer_iswrite,
+		_DEVID_LO(&ioerr->oer_component.oid_device_id),
+		_DEVID_HI(&ioerr->oer_component.oid_device_id),
+		ioerr->oer_component.oid_partition_id,
+		ioerr->oer_component.oid_object_id,
+		ioerr->oer_comp_offset,
+		ioerr->oer_comp_length);
+}

/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
@@ -401,6 +455,177 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
return PNFS_ATTEMPTED;
}

+/*
+ * Map a pnfs_osd_errno value to its OSD_ERR_PRI_* priority, used to pick
+ * which error to keep when several are merged. 0 (no error) maps to 0.
+ */
+static int
+err_prio(u32 oer_errno)
+{
+	switch (oer_errno) {
+	case 0:
+		return 0;
+	case PNFS_OSD_ERR_EIO:
+		return OSD_ERR_PRI_EIO;
+	case PNFS_OSD_ERR_NOT_FOUND:
+		return OSD_ERR_PRI_NOT_FOUND;
+	case PNFS_OSD_ERR_NO_SPACE:
+		return OSD_ERR_PRI_NO_SPACE;
+	case PNFS_OSD_ERR_BAD_CRED:
+		return OSD_ERR_PRI_BAD_CRED;
+	case PNFS_OSD_ERR_NO_ACCESS:
+		return OSD_ERR_PRI_NO_ACCESS;
+	case PNFS_OSD_ERR_UNREACHABLE:
+		return OSD_ERR_PRI_UNREACHABLE;
+	case PNFS_OSD_ERR_RESOURCE:
+		return OSD_ERR_PRI_RESOURCE;
+	default:
+		break;
+	}
+
+	/* unknown code: complain, then rank it like a plain EIO */
+	WARN_ON(1);
+	return OSD_ERR_PRI_EIO;
+}
+
+/*
+ * Fold @src_err into the accumulated @dest_err.
+ *
+ * An empty accumulator (oer_errno == 0) becomes a copy of @src_err with
+ * the device id blanked. Otherwise:
+ *  - partition/object ids that differ between the two are zeroed,
+ *  - the range becomes the union of both ranges,
+ *  - a write error replaces a read error; between two errors of the same
+ *    direction the one with the higher err_prio() is kept.
+ */
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+	    const struct pnfs_osd_ioerr *src_err)
+{
+	u64 dest_end, src_end;
+
+	if (!dest_err->oer_errno) {
+		*dest_err = *src_err;
+		/* accumulated device must be blank */
+		memset(&dest_err->oer_component.oid_device_id, 0,
+		       sizeof(dest_err->oer_component.oid_device_id));
+
+		return;
+	}
+
+	if (dest_err->oer_component.oid_partition_id !=
+	    src_err->oer_component.oid_partition_id)
+		dest_err->oer_component.oid_partition_id = 0;
+
+	if (dest_err->oer_component.oid_object_id !=
+	    src_err->oer_component.oid_object_id)
+		dest_err->oer_component.oid_object_id = 0;
+
+	/*
+	 * Take both ends before the offset is lowered: computing the old
+	 * end from the new offset and the old length would place it too
+	 * low and shrink the merged range.
+	 */
+	dest_end = end_offset(dest_err->oer_comp_offset,
+			      dest_err->oer_comp_length);
+	src_end = end_offset(src_err->oer_comp_offset,
+			     src_err->oer_comp_length);
+	if (dest_end < src_end)
+		dest_end = src_end;
+
+	if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+		dest_err->oer_comp_offset = src_err->oer_comp_offset;
+
+	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+
+	if (src_err->oer_iswrite == dest_err->oer_iswrite) {
+		/* same direction: only a higher priority error takes over */
+		if (err_prio(src_err->oer_errno) >
+		    err_prio(dest_err->oer_errno))
+			dest_err->oer_errno = src_err->oer_errno;
+	} else if (src_err->oer_iswrite) {
+		/* a write error always supersedes an accumulated read error */
+		dest_err->oer_iswrite = true;
+		dest_err->oer_errno = src_err->oer_errno;
+	}
+}
+
+/*
+ * Merge every error still queued on objlay->err_list into a single
+ * descriptor and encode it on @xdr. Each io_state is unlinked and freed
+ * as it is consumed. Failure to encode the merged descriptor is a BUG.
+ */
+static void
+encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr)
+{
+	struct pnfs_osd_ioerr merged = {.oer_errno = 0};
+	struct objlayout_io_state *cur, *next;
+	int err;
+
+	list_for_each_entry_safe(cur, next, &objlay->err_list, err_list) {
+		unsigned i;
+
+		for (i = 0; i < cur->num_comps; i++) {
+			struct pnfs_osd_ioerr *ioerr = &cur->ioerrs[i];
+
+			/* only slots that recorded an error take part */
+			if (ioerr->oer_errno) {
+				printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+					"dev(%llx:%llx) par=0x%llx obj=0x%llx "
+					"offset=0x%llx length=0x%llx\n",
+					__func__, i, ioerr->oer_errno,
+					ioerr->oer_iswrite,
+					_DEVID_LO(&ioerr->oer_component.oid_device_id),
+					_DEVID_HI(&ioerr->oer_component.oid_device_id),
+					ioerr->oer_component.oid_partition_id,
+					ioerr->oer_component.oid_object_id,
+					ioerr->oer_comp_offset,
+					ioerr->oer_comp_length);
+
+				merge_ioerr(&merged, ioerr);
+			}
+		}
+		list_del(&cur->err_list);
+		objlayout_free_io_state(cur);
+	}
+
+	err = pnfs_osd_xdr_encode_ioerr(xdr, &merged);
+	BUG_ON(err);
+}
+
+/*
+ * objlayout_encode_layoutreturn - encode the queued I/O errors for
+ * LAYOUTRETURN.
+ *
+ * Reserves a 4-byte length word, then encodes every recorded error of
+ * every io_state on objlay->err_list, unlinking and freeing each io_state
+ * once all of its errors are encoded. If the stream runs out of room, the
+ * last descriptor that did fit is replaced by one descriptor merging all
+ * the remaining errors. Finally the length word is set to the number of
+ * bytes encoded after it. Not being able to encode even one descriptor
+ * is a BUG.
+ */
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+			      struct xdr_stream *xdr,
+			      const struct nfs4_layoutreturn_args *args)
+{
+	struct objlayout *objlay = OBJLAYOUT(pnfslay);
+	struct objlayout_io_state *state, *tmp;
+	__be32 *start, *uninitialized_var(last_xdr);
+
+	dprintk("%s: Begin\n", __func__);
+	start = xdr_reserve_space(xdr, 4);
+	BUG_ON(!start);
+
+	spin_lock(&objlay->lock);
+
+	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+		unsigned i;
+		int res = 0;
+
+		for (i = 0; i < state->num_comps && !res; i++) {
+			struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+
+			if (!ioerr->oer_errno)
+				continue;
+
+			dprintk("%s: err[%d]: errno=%d is_write=%d "
+				"dev(%llx:%llx) par=0x%llx obj=0x%llx "
+				"offset=0x%llx length=0x%llx\n",
+				__func__, i, ioerr->oer_errno,
+				ioerr->oer_iswrite,
+				_DEVID_LO(&ioerr->oer_component.oid_device_id),
+				_DEVID_HI(&ioerr->oer_component.oid_device_id),
+				ioerr->oer_component.oid_partition_id,
+				ioerr->oer_component.oid_object_id,
+				ioerr->oer_comp_offset,
+				ioerr->oer_comp_length);
+
+			/* where this descriptor starts, in case it does not fit */
+			last_xdr = xdr->p;
+			res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
+		}
+		if (unlikely(res)) {
+			/* no space for even one error descriptor */
+			BUG_ON(last_xdr == start + 1);
+
+			/* we've encountered a situation with lots and lots of
+			 * errors and no space to encode them all. Use the last
+			 * available slot to report the union of all the
+			 * remaining errors.
+			 *
+			 * pnfs_osd_ioerr_xdr_sz() already counts 32-bit words
+			 * and last_xdr is a __be32 pointer, so step back by
+			 * exactly that many words; dividing by 4 would rewind
+			 * less than one descriptor and leave no room for the
+			 * merged one.
+			 */
+			xdr_rewind_stream(xdr, last_xdr -
+					  pnfs_osd_ioerr_xdr_sz());
+			encode_accumulated_error(objlay, xdr);
+			goto loop_done;
+		}
+		list_del(&state->err_list);
+		objlayout_free_io_state(state);
+	}
+loop_done:
+	spin_unlock(&objlay->lock);
+
+	/* opaque length: bytes encoded after the length word itself */
+	*start = cpu_to_be32((xdr->p - start - 1) * 4);
+	dprintk("%s: Return\n", __func__);
+}
+
struct objlayout_deviceinfo {
struct page *page;
struct pnfs_osd_deviceaddr da; /* This must be last */
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 7a63d34..65f8d44 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -61,6 +61,10 @@ struct objlayout_segment {
*/
struct objlayout {
struct pnfs_layout_hdr pnfs_layout;
+
+ /* for layout_return */
+ spinlock_t lock;
+ struct list_head err_list;
};

static inline struct objlayout *
@@ -87,6 +91,16 @@ struct objlayout_io_state {
int status; /* res */
int eof; /* res */
int committed; /* res */
+
+ /* Error reporting (layout_return) */
+ struct list_head err_list;
+ unsigned num_comps;
+ /* Pointer to array of error descriptors of size num_comps.
+ * It should contain as many entries as devices in the osd_layout
+ * that participate in the I/O. It is up to the io_engine to allocate
+ * needed space and set num_comps.
+ */
+ struct pnfs_osd_ioerr *ioerrs;
};

/*
--
1.7.3.4


2011-04-22 09:11:05

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 15/27] pnfs-obj: pnfs_osd XDR definitions

On 2011-04-20 23:49, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:28 +0300, Benny Halevy wrote:
>> * Add the pnfs_osd_xdr.h header
>>
>> * defintions the pnfs_osd_layout structure including all it's
>> sub-types and constants.
>> * Declare the pnfs_osd_xdr_encode/decode_layout API + all needed
>> inline helpers.
>>
>> * Define the pnfs_osd_deviceaddr structure and all its subtypes and
>> constants.
>> * Declare API for encoding/decoding of a pnfs_osd_deviceaddr to/from
>> XDR stream.
>>
>> * Define the pnfs_osd_ioerr structure, its substructures and constants.
>> * Declare API for encoding/decoding of a pnfs_osd_ioerr to/from
>> XDR stream.
>>
>> * Define the pnfs_osd_layoutupdate structure and its substructures.
>> * Declare API for encoding/decoding of a pnfs_osd_layoutupdate to/from
>> XDR stream.
>>
>> [Some extra debug-prints]
>> Signed-off-by: Boaz Harrosh <[email protected]>
>> [objlayout driver skeleton]
>> [use __be32]
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> include/linux/pnfs_osd_xdr.h | 439 ++++++++++++++++++++++++++++++++++++++++++
>> 1 files changed, 439 insertions(+), 0 deletions(-)
>> create mode 100644 include/linux/pnfs_osd_xdr.h
>>
>> diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h
>> new file mode 100644
>> index 0000000..6f2cedf
>> --- /dev/null
>> +++ b/include/linux/pnfs_osd_xdr.h
>> @@ -0,0 +1,439 @@
>> +/*
>> + * pnfs_osd_xdr.h
>> + *
>> + * pNFS-osd on-the-wire data structures
>> + *
>> + * Copyright (C) 2007-2009 Panasas Inc.
>> + * All rights reserved.
>> + *
>> + * Benny Halevy <[email protected]>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2
>> + * See the file COPYING included with this distribution for more details.
>> + *
>> + * Redistribution and use in source and binary forms, with or without
>> + * modification, are permitted provided that the following conditions
>> + * are met:
>> + *
>> + * 1. Redistributions of source code must retain the above copyright
>> + * notice, this list of conditions and the following disclaimer.
>> + * 2. Redistributions in binary form must reproduce the above copyright
>> + * notice, this list of conditions and the following disclaimer in the
>> + * documentation and/or other materials provided with the distribution.
>> + * 3. Neither the name of the Panasas company nor the names of its
>> + * contributors may be used to endorse or promote products derived
>> + * from this software without specific prior written permission.
>> + *
>> + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
>> + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
>> + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
>> + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
>> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
>> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
>> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
>> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
>> + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
>> + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> + */
>> +#ifndef __PNFS_OSD_XDR_H__
>> +#define __PNFS_OSD_XDR_H__
>> +
>> +#include <linux/nfs_fs.h>
>> +#include <linux/nfs_page.h>
>> +#include <linux/exp_xdr.h>
>> +#include <scsi/osd_protocol.h>
>> +
>> +#define PNFS_OSD_OSDNAME_MAXSIZE 256
>> +
>> +/*
>> + * START OF "GENERIC" DECODE ROUTINES.
>> + * These may look a little ugly since they are imported from a "generic"
>> + * set of XDR encode/decode routines which are intended to be shared by
>> + * all of our NFSv4 implementations (OpenBSD, MacOS X...).
>> + *
>> + * If the pain of reading these is too great, it should be a straightforward
>> + * task to translate them into Linux-specific versions which are more
>> + * consistent with the style used in NFSv2/v3...
>
> Please do... We just got rid of these in the NFS code. The last thing we
> want to do is reintroduce them in new code.
>

Yeah, this is on our queue already
(as mentioned in [RFC 0/27] :)

Benny

2011-04-22 08:05:01

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 3/6] SQUASHME: remove wait parameter from the layoutreturn path.

all call sites are sync now
squash into "pnfs: layoutreturn"

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 11 ++++-------
fs/nfs/pnfs.c | 10 +++++-----
fs/nfs/pnfs.h | 12 +++++-------
3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3e1843d..4f637e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2333,7 +2333,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;

if (pnfs_ld_layoutret_on_setattr(inode))
- pnfs_return_layout(inode, NULL, true);
+ pnfs_return_layout(inode, NULL);

nfs_fattr_init(fattr);

@@ -5739,7 +5739,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
.rpc_release = nfs4_layoutreturn_release,
};

-int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
{
struct rpc_task *task;
struct rpc_message msg = {
@@ -5752,22 +5752,19 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
- .flags = RPC_TASK_ASYNC,
};
- int status = 0;
+ int status;

dprintk("--> %s\n", __func__);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- if (!issync)
- goto out;
status = nfs4_wait_for_completion_rpc_task(task);
if (status != 0)
goto out;
status = task->tk_status;
out:
- dprintk("<-- %s\n", __func__);
+ dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bacde63..1ec5bb8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -667,7 +667,7 @@ out_err_free:
}

static int
-return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+return_layout(struct inode *ino, struct pnfs_layout_range *range)
{
struct nfs4_layoutreturn *lrp;
struct nfs_server *server = NFS_SERVER(ino);
@@ -687,7 +687,7 @@ return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
lrp->args.inode = ino;
lrp->clp = server->nfs_client;

- status = nfs4_proc_layoutreturn(lrp, wait);
+ status = nfs4_proc_layoutreturn(lrp);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
@@ -695,7 +695,7 @@ out:

/* Initiates a LAYOUTRETURN(FILE) */
int
-_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
+_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range)
{
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
@@ -722,11 +722,11 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wai
pnfs_free_lseg_list(&tmp_list);

/* Return layout even if layoutcommit fails */
- status = pnfs_layoutcommit_inode(ino, wait);
+ status = pnfs_layoutcommit_inode(ino, true);
if (status)
dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
__func__, status);
- status = return_layout(ino, &arg, wait);
+ status = return_layout(ino, &arg);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b0f9b79..b5d1d22 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -164,7 +164,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
-extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);

/* pnfs.c */
void get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -202,7 +202,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
-int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
+int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *);
int pnfs_write_done(struct nfs_write_data *);
int pnfs_read_done(struct nfs_read_data *);

@@ -284,14 +284,13 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
}

static inline int pnfs_return_layout(struct inode *ino,
- struct pnfs_layout_range *range,
- bool wait)
+ struct pnfs_layout_range *range)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_server *nfss = NFS_SERVER(ino);

if (pnfs_enabled_sb(nfss) && nfsi->layout)
- return _pnfs_return_layout(ino, range, wait);
+ return _pnfs_return_layout(ino, range);

return 0;
}
@@ -338,8 +337,7 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
}

static inline int pnfs_return_layout(struct inode *ino,
- struct pnfs_layout_range *range,
- bool wait)
+ struct pnfs_layout_range *range)
{
return 0;
}
--
1.7.3.4


2011-04-20 17:28:20

by Benny Halevy

[permalink] [raw]
Subject: [RFC 16/27] pnfs-obj: pnfs_osd XDR client implementations

* Add the fs/nfs/objlayout/pnfs_osd_xdr_cli.c file, which will
include the XDR encode/decode implementations for the pNFS
client objlayout driver.

* Implement the pnfs_osd_xdr_decode_layout() API

* Implementation of pnfs_osd_xdr_deviceaddr_incore_sz() and
pnfs_osd_xdr_decode_deviceaddr(). It is a two pass operation.
The former is first called to determine the memory allocation
size needed and the latter is called to fill in the
pnfs_osd_deviceaddr structure.

* Implementation of pnfs_osd_xdr_encode_ioerr() into an
XDR stream. Used in the LAYOUT_RETURN operation.

* Implementation of pnfs_osd_xdr_encode_layoutupdate() into an
XDR stream. Used in the LAYOUT_COMMIT operation.

[Some extra debug-prints]
Signed-off-by: Boaz Harrosh <[email protected]>
[use NFSDBG_PNFS_LD also in pnfs_osd_xdr_cli.c]
[use __be32]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 435 +++++++++++++++++++++++++++++++++++
1 files changed, 435 insertions(+), 0 deletions(-)
create mode 100644 fs/nfs/objlayout/pnfs_osd_xdr_cli.c

diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 0000000..3463364
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,435 @@
+/*
+ * pnfs_osd_xdr_cli.c
+ *
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/pnfs_osd_xdr.h>
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * The following implementation is based on these Internet Drafts:
+ *
+ * draft-ietf-nfsv4-minorversion-21
+ * draft-ietf-nfsv4-pnfs-obj-12
+ */
+
+/*
+ * struct pnfs_osd_objid {
+ * struct pnfs_deviceid oid_device_id;
+ * u64 oid_partition_id;
+ * u64 oid_object_id;
+ * };
+ *
+ * Decode one pnfs_osd_objid from the XDR words at @p into @objid: the raw
+ * device id bytes first, then the partition id and the object id, in that
+ * wire order.
+ *
+ * Returns the value of @p once all three fields have been consumed.
+ *
+ * NOTE(review): COPYMEM and READ64 are macros defined outside this file
+ * that implicitly operate on the local variable "p"; presumably each one
+ * advances it past the data it reads -- confirm against their
+ * definitions. No bounds checking is visible here.
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data));
+ READ64(objid->oid_partition_id);
+ READ64(objid->oid_object_id);
+ return p;
+}
+/*
+ * Decode one variable-length opaque credential from the XDR words at @p.
+ *
+ * The 32-bit length is stored in opaque_cred->cred_len and that many
+ * bytes are then copied to opaque_cred->cred, so the caller must point
+ * ->cred at writable storage before calling.
+ *
+ * Returns the value of @p after decoding.
+ *
+ * NOTE(review): cred_len comes straight from the wire and is not checked
+ * against the size of the destination here -- confirm the caller sized
+ * the buffer from the same data. READ32/COPYMEM are macros defined
+ * outside this file that implicitly use the local "p".
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_opaque_cred(__be32 *p,
+ struct pnfs_osd_opaque_cred *opaque_cred)
+{
+ READ32(opaque_cred->cred_len);
+ COPYMEM(opaque_cred->cred, opaque_cred->cred_len);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_object_cred {
+ * struct pnfs_osd_objid oc_object_id;
+ * u32 oc_osd_version;
+ * u32 oc_cap_key_sec;
+ * struct pnfs_osd_opaque_cred oc_cap_key;
+ * struct pnfs_osd_opaque_cred oc_cap;
+ * };
+ *
+ * Decode one object credential from the XDR words at @p into @comp.
+ *
+ * @credp is an in/out cursor into caller-provided storage for the
+ * credential bytes. oc_cap_key.cred is pointed at *credp and filled,
+ * the cursor is moved forward by XDR_QUADLEN(cred_len) 32-bit words,
+ * oc_cap.cred is pointed at the new position and filled, and the cursor
+ * is moved forward again. The final cursor is written back to *credp so
+ * that the next component continues where this one stopped.
+ *
+ * Returns the value of @p after decoding.
+ *
+ * NOTE(review): the cursor is advanced through a (u32 *) cast, so this
+ * presumably expects *credp to be 4-byte aligned on entry -- confirm.
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp,
+ u8 **credp)
+{
+ u8 *cred;
+
+ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id);
+ READ32(comp->oc_osd_version);
+ READ32(comp->oc_cap_key_sec);
+
+ cred = *credp;
+ comp->oc_cap_key.cred = cred;
+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key);
+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len));
+ comp->oc_cap.cred = cred;
+ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap);
+ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len));
+ *credp = cred;
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_data_map {
+ *	u32 odm_num_comps;
+ *	u64 odm_stripe_unit;
+ *	u32 odm_group_width;
+ *	u32 odm_group_depth;
+ *	u32 odm_mirror_cnt;
+ *	u32 odm_raid_algorithm;
+ * };
+ *
+ * Decode the data map from the XDR words at @p into @data_map, field by
+ * field in the order listed above, and log the decoded values.
+ *
+ * Returns the value of @p after decoding. The return type is __be32 *,
+ * the same type as the @p cursor it hands back and as every other
+ * decoder in this file returns.
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+	READ32(data_map->odm_num_comps);
+	READ64(data_map->odm_stripe_unit);
+	READ32(data_map->odm_group_width);
+	READ32(data_map->odm_group_depth);
+	READ32(data_map->odm_mirror_cnt);
+	READ32(data_map->odm_raid_algorithm);
+	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+		__func__,
+		data_map->odm_num_comps,
+		(unsigned long long)data_map->odm_stripe_unit,
+		data_map->odm_group_width,
+		data_map->odm_group_depth,
+		data_map->odm_mirror_cnt,
+		data_map->odm_raid_algorithm);
+	return p;
+}
+
+/*
+ * Decode an object layout from the XDR words at @p into @layout.
+ *
+ * The component array is placed directly behind *@layout, and the
+ * credential bytes of all components directly behind that array. The
+ * caller must therefore supply one buffer large enough for all three
+ * parts; nothing in this function checks that size.
+ *
+ * Returns @layout.
+ */
+struct pnfs_osd_layout *
+pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, __be32 *p)
+{
+	/* unsigned, like the count it is compared against (logged with %u) */
+	unsigned int i;
+	__be32 *start = p;
+	struct pnfs_osd_object_cred *comp;
+	u8 *cred;
+
+	p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map);
+	READ32(layout->olo_comps_index);
+	READ32(layout->olo_num_comps);
+
+	/* components follow the header; credential bytes follow the components */
+	layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1);
+	comp = layout->olo_comps;
+	cred = (u8 *)(comp + layout->olo_num_comps);
+	dprintk("%s: comps_index=%u num_comps=%u\n",
+		__func__, layout->olo_comps_index, layout->olo_num_comps);
+	for (i = 0; i < layout->olo_num_comps; i++) {
+		p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred);
+		/* u64 values are cast to match %llx, as the data map dprintk does */
+		dprintk("%s: comp[%u]=dev(%llx:%llx) par=0x%llx obj=0x%llx "
+			"key_len=%u cap_len=%u\n",
+			__func__, i,
+			_DEVID_LO(&comp->oc_object_id.oid_device_id),
+			_DEVID_HI(&comp->oc_object_id.oid_device_id),
+			(unsigned long long)comp->oc_object_id.oid_partition_id,
+			(unsigned long long)comp->oc_object_id.oid_object_id,
+			comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+		comp++;
+	}
+	dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__,
+		(char *)p - (char *)start, cred, (char *)cred - (char *)layout);
+	return layout;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, most
+ * of the actual fields are left inside the rpc buffer and are only
+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ * should not be freed while the returned information is in use.
+ *
+ * The __xdr_read_calc_*() helpers below serve two passes. Called with a
+ * NULL destination they only walk the XDR data and advance *freespace by
+ * the amount of extra storage a real decode would consume ("calc").
+ * Called with a destination they fill it in ("read").
+ *
+ * __xdr_read_calc_nfs4_string() handles one length-prefixed string. If
+ * the byte following the string in the XDR buffer is already zero the
+ * string is used in place; otherwise len + 1 bytes are taken from
+ * *freespace and, when @str is given, the string is copied there and
+ * NUL-terminated. Returns @p advanced by XDR_QUADLEN(len) words past
+ * the length word.
+ *
+ * NOTE(review): data[len] is read before anything bounds the access; it
+ * may be the first byte of the next XDR item or lie past the end of the
+ * buffer -- confirm that callers guarantee it is readable.
+ */
+
+__be32 *__xdr_read_calc_nfs4_string(
+ __be32 *p, struct nfs4_string *str, u8 **freespace)
+{
+ u32 len;
+ char *data;
+ bool need_copy;
+
+ READ32(len);
+ data = (char *)p;
+
+ if (data[len]) { /* not NUL-terminated in place: take len + 1 bytes of free space */
+ data = *freespace;
+ *freespace += len + 1;
+ need_copy = true;
+ } else {
+ need_copy = false;
+ }
+
+ if (str) {
+ str->len = len;
+ str->data = data;
+ if (need_copy) {
+ memcpy(data, p, len);
+ data[len] = 0;
+ }
+ }
+
+ p += XDR_QUADLEN(len);
+ return p;
+}
+/*
+ * Walk over one length-prefixed opaque item in the XDR words at @p.
+ *
+ * When @str is non-NULL its len is set to the decoded length and its
+ * data pointer is aimed at the bytes inside the XDR buffer itself; no
+ * copy is made and no terminator is added. When @str is NULL the item
+ * is only skipped.
+ *
+ * Returns @p advanced by XDR_QUADLEN(len) words past the length word.
+ */
+__be32 *__xdr_read_calc_u8_opaque(
+ __be32 *p, struct nfs4_string *str)
+{
+ u32 len;
+
+ READ32(len);
+
+ if (str) {
+ str->len = len;
+ str->data = (char *)p;
+ }
+
+ p += XDR_QUADLEN(len);
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ * u32 oti_type;
+ * struct nfs4_string oti_scsi_device_id;
+ * };
+ *
+ * Walk over one target id in the XDR words at @p. The type word is
+ * always read and, when @targetid is non-NULL, stored in oti_type. Only
+ * for OBJ_TARGET_SCSI_NAME and OBJ_TARGET_SCSI_DEVICE_ID is an opaque
+ * item consumed after it (into oti_scsi_device_id when @targetid is
+ * given); for any other type nothing further is read.
+ *
+ * @freespace is accepted for symmetry with the other helpers but is not
+ * used by this function.
+ *
+ * Returns the value of @p after the consumed data.
+ */
+__be32 *__xdr_read_calc_targetid(
+ __be32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace)
+{
+ u32 oti_type;
+
+ READ32(oti_type);
+ if (targetid)
+ targetid->oti_type = oti_type;
+
+ switch (oti_type) {
+ case OBJ_TARGET_SCSI_NAME:
+ case OBJ_TARGET_SCSI_DEVICE_ID:
+ p = __xdr_read_calc_u8_opaque(p,
+ targetid ? &targetid->oti_scsi_device_id : NULL);
+ }
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ * struct nfs4_string r_netid;
+ * struct nfs4_string r_addr;
+ * };
+ *
+ * Walk over one network address in the XDR words at @p: two strings,
+ * r_netid followed by r_addr, each handled by
+ * __xdr_read_calc_nfs4_string(). With a NULL @netaddr the strings are
+ * only measured (*freespace may still advance); otherwise the two
+ * members of @netaddr are filled in.
+ *
+ * Returns the value of @p after both strings.
+ */
+__be32 *__xdr_read_calc_net_addr(
+ __be32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace)
+{
+
+ p = __xdr_read_calc_nfs4_string(p,
+ netaddr ? &netaddr->r_netid : NULL,
+ freespace);
+
+ p = __xdr_read_calc_nfs4_string(p,
+ netaddr ? &netaddr->r_addr : NULL,
+ freespace);
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ * u32 ota_available;
+ * struct pnfs_osd_net_addr ota_netaddr;
+ * };
+ *
+ * Walk over one target address in the XDR words at @p. The
+ * ota_available word is always read and, when @targetaddr is non-NULL,
+ * stored. A network address follows on the wire only when that word is
+ * non-zero; in that case it is handed to __xdr_read_calc_net_addr().
+ *
+ * Returns the value of @p after the consumed data.
+ */
+__be32 *__xdr_read_calc_targetaddr(
+ __be32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
+{
+ u32 ota_available;
+
+ READ32(ota_available);
+ if (targetaddr)
+ targetaddr->ota_available = ota_available;
+
+ if (ota_available) {
+ p = __xdr_read_calc_net_addr(p,
+ targetaddr ? &targetaddr->ota_netaddr : NULL,
+ freespace);
+ }
+
+ return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ * struct pnfs_osd_targetid oda_targetid;
+ * struct pnfs_osd_targetaddr oda_targetaddr;
+ * u8 oda_lun[8];
+ * struct nfs4_string oda_systemid;
+ * struct pnfs_osd_object_cred oda_root_obj_cred;
+ * struct nfs4_string oda_osdname;
+ * };
+ *
+ * Walk over a whole device address in the XDR words at @p, in the field
+ * order listed above.
+ *
+ * With a NULL @deviceaddr this is the sizing pass: nothing is stored,
+ * the fixed-size LUN is skipped by its XDR length, and *freespace is
+ * advanced by whatever extra storage the strings and the root object
+ * credential will need. With a non-NULL @deviceaddr the structure is
+ * filled in and *freespace is used as the cursor into the storage that
+ * the sizing pass accounted for.
+ *
+ * Returns the value of @p after the consumed data.
+ *
+ * NOTE(review): pnfs_osd_object_cred_incore_sz() and
+ * pnfs_osd_object_cred_xdr_sz() are defined outside this file;
+ * presumably they return a byte count and a count of 32-bit words
+ * respectively, matching how their results are added to a u8 * and to a
+ * __be32 * here -- confirm.
+ */
+__be32 *__xdr_read_calc_deviceaddr(
+ __be32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
+{
+ p = __xdr_read_calc_targetid(p,
+ deviceaddr ? &deviceaddr->oda_targetid : NULL,
+ freespace);
+
+ p = __xdr_read_calc_targetaddr(p,
+ deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
+ freespace);
+
+ if (deviceaddr)
+ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
+ else
+ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
+
+ p = __xdr_read_calc_u8_opaque(p,
+ deviceaddr ? &deviceaddr->oda_systemid : NULL);
+
+ if (deviceaddr) {
+ p = pnfs_osd_xdr_decode_object_cred(p,
+ &deviceaddr->oda_root_obj_cred, freespace);
+ } else {
+ *freespace += pnfs_osd_object_cred_incore_sz(p);
+ p += pnfs_osd_object_cred_xdr_sz(p);
+ }
+
+ p = __xdr_read_calc_u8_opaque(p,
+ deviceaddr ? &deviceaddr->oda_osdname : NULL);
+
+ return p;
+}
+
+/*
+ * First pass of the two-pass device address decode: work out how many
+ * bytes the caller has to allocate before calling
+ * pnfs_osd_xdr_decode_deviceaddr() on the same XDR data.
+ *
+ * The result is the fixed structure size plus the variable-length tail
+ * measured by a dry run of __xdr_read_calc_deviceaddr().
+ */
+size_t pnfs_osd_xdr_deviceaddr_incore_sz(__be32 *p)
+{
+	/* starts at NULL, so after the dry run its value is the tail size */
+	u8 *tail_bytes = NULL;
+
+	__xdr_read_calc_deviceaddr(p, NULL, &tail_bytes);
+
+	return sizeof(struct pnfs_osd_deviceaddr) + (size_t)tail_bytes;
+}
+
+/*
+ * Second pass of the two-pass device address decode: fill in @deviceaddr
+ * from the XDR words at @p.
+ *
+ * Variable-length data is stored directly behind the structure, so
+ * @deviceaddr must point at a buffer of at least the size reported by
+ * pnfs_osd_xdr_deviceaddr_incore_sz() for the same XDR data.
+ */
+void pnfs_osd_xdr_decode_deviceaddr(
+	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+	u8 *tail = (u8 *)&deviceaddr[1];
+
+	__xdr_read_calc_deviceaddr(p, deviceaddr, &tail);
+}
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *	u32 dsu_valid;
+ *	s64 dsu_delta;
+ *	u32 olu_ioerr_flag;
+ * };
+ *
+ * Encode @lou into @xdr. Used in the LAYOUT_COMMIT operation.
+ *
+ * dsu_delta is only put on the wire when dsu_valid is set, so the
+ * encoded size is 16 bytes with a delta and 8 bytes without one. Only
+ * the bytes actually written are reserved; reserving a fixed 16 would
+ * leave 8 unwritten bytes in the stream whenever dsu_valid is 0.
+ *
+ * Returns 0 on success, or -E2BIG if @xdr has no room left.
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+				 struct pnfs_osd_layoutupdate *lou)
+{
+	/* dsu_valid (4) + olu_ioerr_flag (4), plus dsu_delta (8) when valid */
+	size_t nbytes = lou->dsu_valid ? 16 : 8;
+	__be32 *p = xdr_reserve_space(xdr, nbytes);
+
+	if (!p)
+		return -E2BIG;
+
+	*p++ = cpu_to_be32(lou->dsu_valid);
+	if (lou->dsu_valid)
+		p = xdr_encode_hyper(p, lou->dsu_delta);
+	*p = cpu_to_be32(lou->olu_ioerr_flag);
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct pnfs_deviceid oid_device_id;
+ *	u64 oid_partition_id;
+ *	u64 oid_object_id;
+ * };
+ *
+ * Encode @object_id into @xdr: the raw device id bytes, then the
+ * partition id and the object id as 64-bit values. 32 bytes are
+ * reserved for the three fields together.
+ *
+ * Returns 0 on success, or -E2BIG if @xdr has no room left.
+ */
+static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
+					    struct pnfs_osd_objid *object_id)
+{
+	__be32 *wp = xdr_reserve_space(xdr, 32);
+
+	if (!wp)
+		return -E2BIG;
+
+	wp = xdr_encode_opaque_fixed(wp, &object_id->oid_device_id.data,
+				     sizeof(object_id->oid_device_id.data));
+	wp = xdr_encode_hyper(wp, object_id->oid_partition_id);
+	xdr_encode_hyper(wp, object_id->oid_object_id);
+
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *	struct pnfs_osd_objid oer_component;
+ *	u64 oer_comp_offset;
+ *	u64 oer_comp_length;
+ *	u32 oer_iswrite;
+ *	u32 oer_errno;
+ * };
+ *
+ * Encode one I/O error report into @xdr. Used in the LAYOUT_RETURN
+ * operation. The component's object id goes first, followed by 24
+ * bytes holding the offset, the length, the write flag and the error
+ * number.
+ *
+ * Returns 0 on success. If the object id cannot be encoded its error is
+ * passed on; -E2BIG is returned if @xdr has no room for the rest.
+ */
+int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
+			      struct pnfs_osd_ioerr *ioerr)
+{
+	__be32 *wp;
+	int err = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
+
+	if (err)
+		return err;
+
+	wp = xdr_reserve_space(xdr, 24);
+	if (!wp)
+		return -E2BIG;
+
+	wp = xdr_encode_hyper(wp, ioerr->oer_comp_offset);
+	wp = xdr_encode_hyper(wp, ioerr->oer_comp_length);
+	wp[0] = cpu_to_be32(ioerr->oer_iswrite);
+	wp[1] = cpu_to_be32(ioerr->oer_errno);
+
+	return 0;
+}
--
1.7.3.4


2011-04-20 17:27:51

by Benny Halevy

[permalink] [raw]
Subject: [RFC 12/27] pnfs: alloc and free layout_hdr layoutdriver methods

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 21 ++++++++++++++++++---
fs/nfs/pnfs.h | 3 +++
2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index afc64b3..2254362 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -188,13 +188,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
atomic_inc(&lo->plh_refcount);
}

+/*
+ * Allocate the layout header for @ino. A layout driver that provides an
+ * alloc_layout_hdr method gets to do the allocation itself; otherwise a
+ * zeroed generic pnfs_layout_hdr is allocated here.
+ *
+ * Returns the new header, or whatever failure value the chosen allocator
+ * returned (NULL from kzalloc).
+ */
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+
+	if (ld->alloc_layout_hdr)
+		return ld->alloc_layout_hdr(ino);
+
+	return kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+}
+
+/*
+ * Release a layout header obtained from pnfs_alloc_layout_hdr().
+ *
+ * The choice mirrors the allocation side: if the layout driver supplied
+ * alloc_layout_hdr, the header came from the driver and is handed back
+ * to its free_layout_hdr method; otherwise it came from kzalloc() and is
+ * kfree()d. A driver that sets alloc_layout_hdr is therefore expected
+ * to set free_layout_hdr as well.
+ *
+ * Written as a plain if/else: a void function must not "return" an
+ * expression.
+ */
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+
+	if (ld->alloc_layout_hdr)
+		ld->free_layout_hdr(lo);
+	else
+		kfree(lo);
+}
+
static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL;
- kfree(lo);
+ pnfs_free_layout_hdr(lo);
}

static void
@@ -857,7 +872,7 @@ alloc_init_layout_hdr(struct inode *ino)
{
struct pnfs_layout_hdr *lo;

- lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
+ lo = pnfs_alloc_layout_hdr(ino);
if (!lo)
return NULL;
atomic_set(&lo->plh_refcount, 1);
@@ -890,7 +905,7 @@ pnfs_find_alloc_layout(struct inode *ino)
if (likely(nfsi->layout == NULL)) /* Won the race? */
nfsi->layout = new;
else
- kfree(new);
+ pnfs_free_layout_hdr(new);
return nfsi->layout;
}

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index bb266ba..35662ac 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -83,6 +83,9 @@ struct pnfs_layoutdriver_type {
int (*set_layoutdriver) (struct nfs_server *);
int (*unset_layoutdriver) (struct nfs_server *);

+ struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
+ void (*free_layout_hdr) (struct pnfs_layout_hdr *);
+
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
void (*free_lseg) (struct pnfs_layout_segment *lseg);

--
1.7.3.4


2011-04-20 17:27:04

by Benny Halevy

[permalink] [raw]
Subject: [RFC 06/27] pnfs: encode_layoutreturn

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
[fixup layout header pointer for encode_layoutreturn]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4xdr.c | 9 +++++++--
fs/nfs/pnfs.h | 4 ++++
2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 53ea3e5..6b64dd8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1927,8 +1927,13 @@ encode_layoutreturn(struct xdr_stream *xdr,
spin_unlock(&args->inode->i_lock);
p = xdr_encode_opaque_fixed(p, &stateid.data,
NFS4_STATEID_SIZE);
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(0);
+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+ NFS_I(args->inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
+ }
}
hdr->nops++;
hdr->replen += decode_layoutreturn_maxsz;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c315109..51dcbc1 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -98,6 +98,10 @@ struct pnfs_layoutdriver_type {

/* device notification methods */
void (*delete_deviceid)(struct nfs4_deviceid *);
+
+ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args);
};

struct pnfs_layout_hdr {
--
1.7.3.4


2011-04-22 08:05:39

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 5/6] SQUASHME: remove range from nfs4_layoutreturn_args

Currently we always return the layout for the whole file.

squash into "pnfs: layoutreturn" and "pnfs: layoutret_on_setattr"

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/inode.c | 2 +-
fs/nfs/nfs4proc.c | 2 +-
fs/nfs/nfs4xdr.c | 6 +++---
fs/nfs/pnfs.c | 26 ++++++++++----------------
fs/nfs/pnfs.h | 10 ++++------
include/linux/nfs_xdr.h | 1 -
6 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9a48d1c..e9c6d9f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1426,7 +1426,7 @@ void nfs4_evict_inode(struct inode *inode)
{
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
- pnfs_return_layout(inode, NULL);
+ pnfs_return_layout(inode);
pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 059a74c..801121f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2333,7 +2333,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;

if (pnfs_ld_layoutret_on_setattr(inode))
- pnfs_return_layout(inode, NULL);
+ pnfs_return_layout(inode);

nfs_fattr_init(fattr);

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8c0e589..7e77a03 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1962,11 +1962,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
*p++ = cpu_to_be32(OP_LAYOUTRETURN);
*p++ = cpu_to_be32(args->reclaim);
*p++ = cpu_to_be32(args->layout_type);
- *p++ = cpu_to_be32(args->range.iomode);
+ *p++ = cpu_to_be32(IOMODE_ANY);
*p = cpu_to_be32(RETURN_FILE);
p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
- p = xdr_encode_hyper(p, args->range.offset);
- p = xdr_encode_hyper(p, args->range.length);
+ p = xdr_encode_hyper(p, 0);
+ p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
spin_lock(&args->inode->i_lock);
memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data, NFS4_STATEID_SIZE);
spin_unlock(&args->inode->i_lock);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b2066d2..ec2418b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -416,21 +416,22 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
* Must be called under the i_lock (unless from the nfs4_evict_inode path)
*/
static bool
-pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
- struct pnfs_layout_range *range)
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
{
struct pnfs_layout_segment *lseg, *next;
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .length = NFS4_MAX_UINT64,
+ };
bool rv = false;

- dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
- __func__, lo, range->offset, range->length, range->iomode);
if (list_empty(&lo->plh_segs)) {
if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
put_layout_hdr_locked(lo);
return 0;
}
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(&lseg->pls_range, range)) {
+ if (should_free_lseg(&lseg->pls_range, &range)) {
dprintk("%s: freeing lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode,
@@ -439,7 +440,6 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
mark_lseg_invalid(lseg, tmp_list);
rv = true;
}
- dprintk("%s:Return %d\n", __func__, rv);
return rv;
}

@@ -667,7 +667,7 @@ out_err_free:
}

static int
-return_layout(struct inode *ino, struct pnfs_layout_range *range)
+return_layout(struct inode *ino)
{
struct nfs4_layoutreturn *lrp;
struct nfs_server *server = NFS_SERVER(ino);
@@ -682,7 +682,6 @@ return_layout(struct inode *ino, struct pnfs_layout_range *range)
}
lrp->args.reclaim = 0;
lrp->args.layout_type = server->pnfs_curr_ld->id;
- lrp->args.range = *range;
lrp->args.inode = ino;
lrp->clp = server->nfs_client;

@@ -694,23 +693,18 @@ out:

/* Initiates a LAYOUTRETURN(FILE) */
int
-_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range)
+_pnfs_return_layout(struct inode *ino)
{
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
- struct pnfs_layout_range arg;
LIST_HEAD(tmp_list);
int status = 0;

dprintk("--> %s\n", __func__);

- arg.iomode = range ? range->iomode : IOMODE_ANY;
- arg.offset = 0;
- arg.length = NFS4_MAX_UINT64;
-
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
+ if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list)) {
spin_unlock(&ino->i_lock);
dprintk("%s: no layout segments to return\n", __func__);
goto out;
@@ -725,7 +719,7 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range)
if (status)
dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
__func__, status);
- status = return_layout(ino, &arg);
+ status = return_layout(ino);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b5d1d22..2c8da75 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -202,7 +202,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
-int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *);
+int _pnfs_return_layout(struct inode *);
int pnfs_write_done(struct nfs_write_data *);
int pnfs_read_done(struct nfs_read_data *);

@@ -283,14 +283,13 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
PNFS_LAYOUTRET_ON_SETATTR;
}

-static inline int pnfs_return_layout(struct inode *ino,
- struct pnfs_layout_range *range)
+static inline int pnfs_return_layout(struct inode *ino)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_server *nfss = NFS_SERVER(ino);

if (pnfs_enabled_sb(nfss) && nfsi->layout)
- return _pnfs_return_layout(ino, range);
+ return _pnfs_return_layout(ino);

return 0;
}
@@ -336,8 +335,7 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
return PNFS_NOT_ATTEMPTED;
}

-static inline int pnfs_return_layout(struct inode *ino,
- struct pnfs_layout_range *range)
+static inline int pnfs_return_layout(struct inode *ino)
{
return 0;
}
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6506432..2bd48a2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -272,7 +272,6 @@ struct nfs4_layoutcommit_data {
struct nfs4_layoutreturn_args {
__u32 reclaim;
__u32 layout_type;
- struct pnfs_layout_range range;
struct inode *inode;
struct nfs4_sequence_args seq_args;
};
--
1.7.3.4


2011-04-22 08:05:59

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 6/6] SQUASHME: no need to send layoutcommit from _pnfs_return_layout

squash into "pnfs: layoutreturn"

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 7 ++-----
1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ec2418b..27b0593 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -714,11 +714,8 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);

- /* Return layout even if layoutcommit fails */
- status = pnfs_layoutcommit_inode(ino, true);
- if (status)
- dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
- __func__, status);
+ WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+
status = return_layout(ino);
out:
dprintk("<-- %s status: %d\n", __func__, status);
--
1.7.3.4


2011-04-22 08:31:51

by Benny Halevy

[permalink] [raw]
Subject: Re: [PATCH 3/6] SQUASHME: remove wait parameter from the layoutreturn path.

On 2011-04-22 11:04, Benny Halevy wrote:
> all call sites are sync now
> squash into "pnfs: layoutreturn"
>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 11 ++++-------
> fs/nfs/pnfs.c | 10 +++++-----
> fs/nfs/pnfs.h | 12 +++++-------
> 3 files changed, 14 insertions(+), 19 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 3e1843d..4f637e9 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -2333,7 +2333,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
> int status;
>
> if (pnfs_ld_layoutret_on_setattr(inode))
> - pnfs_return_layout(inode, NULL, true);
> + pnfs_return_layout(inode, NULL);
>
> nfs_fattr_init(fattr);
>
> @@ -5739,7 +5739,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
> .rpc_release = nfs4_layoutreturn_release,
> };
>
> -int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
> +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
> {
> struct rpc_task *task;
> struct rpc_message msg = {
> @@ -5752,22 +5752,19 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
> .rpc_message = &msg,
> .callback_ops = &nfs4_layoutreturn_call_ops,
> .callback_data = lrp,
> - .flags = RPC_TASK_ASYNC,
> };
> - int status = 0;
> + int status;
>
> dprintk("--> %s\n", __func__);
> task = rpc_run_task(&task_setup_data);
> if (IS_ERR(task))
> return PTR_ERR(task);
> - if (!issync)
> - goto out;
> status = nfs4_wait_for_completion_rpc_task(task);

sorry, no need to wait if the task isn't ASYNC.

Benny

> if (status != 0)
> goto out;
> status = task->tk_status;
> out:
> - dprintk("<-- %s\n", __func__);
> + dprintk("<-- %s status=%d\n", __func__, status);
> rpc_put_task(task);
> return status;
> }
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index bacde63..1ec5bb8 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -667,7 +667,7 @@ out_err_free:
> }
>
> static int
> -return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
> +return_layout(struct inode *ino, struct pnfs_layout_range *range)
> {
> struct nfs4_layoutreturn *lrp;
> struct nfs_server *server = NFS_SERVER(ino);
> @@ -687,7 +687,7 @@ return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
> lrp->args.inode = ino;
> lrp->clp = server->nfs_client;
>
> - status = nfs4_proc_layoutreturn(lrp, wait);
> + status = nfs4_proc_layoutreturn(lrp);
> out:
> dprintk("<-- %s status: %d\n", __func__, status);
> return status;
> @@ -695,7 +695,7 @@ out:
>
> /* Initiates a LAYOUTRETURN(FILE) */
> int
> -_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
> +_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range)
> {
> struct pnfs_layout_hdr *lo = NULL;
> struct nfs_inode *nfsi = NFS_I(ino);
> @@ -722,11 +722,11 @@ _pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wai
> pnfs_free_lseg_list(&tmp_list);
>
> /* Return layout even if layoutcommit fails */
> - status = pnfs_layoutcommit_inode(ino, wait);
> + status = pnfs_layoutcommit_inode(ino, true);
> if (status)
> dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
> __func__, status);
> - status = return_layout(ino, &arg, wait);
> + status = return_layout(ino, &arg);
> out:
> dprintk("<-- %s status: %d\n", __func__, status);
> return status;
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index b0f9b79..b5d1d22 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -164,7 +164,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
> extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
> struct pnfs_device *dev);
> extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
> -extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
> +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
>
> /* pnfs.c */
> void get_layout_hdr(struct pnfs_layout_hdr *lo);
> @@ -202,7 +202,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
> bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
> -int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
> +int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *);
> int pnfs_write_done(struct nfs_write_data *);
> int pnfs_read_done(struct nfs_read_data *);
>
> @@ -284,14 +284,13 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
> }
>
> static inline int pnfs_return_layout(struct inode *ino,
> - struct pnfs_layout_range *range,
> - bool wait)
> + struct pnfs_layout_range *range)
> {
> struct nfs_inode *nfsi = NFS_I(ino);
> struct nfs_server *nfss = NFS_SERVER(ino);
>
> if (pnfs_enabled_sb(nfss) && nfsi->layout)
> - return _pnfs_return_layout(ino, range, wait);
> + return _pnfs_return_layout(ino, range);
>
> return 0;
> }
> @@ -338,8 +337,7 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
> }
>
> static inline int pnfs_return_layout(struct inode *ino,
> - struct pnfs_layout_range *range,
> - bool wait)
> + struct pnfs_layout_range *range)
> {
> return 0;
> }


2011-04-20 20:04:01

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 04/27] pnfs: layoutret_on_setattr

On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Andy Adamson <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 3 +++
> fs/nfs/pnfs.h | 22 ++++++++++++++++++++++
> 2 files changed, 25 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index b03defb..b4df7a6 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -2332,6 +2332,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
> struct nfs4_state *state = NULL;
> int status;
>
> + if (pnfs_ld_layoutret_on_setattr(inode))
> + pnfs_return_layout(inode, NULL, true);

There is nothing that prevents further reads and writes being scheduled
after this, so what is the plan to prevent them from being sent to the
MDS?
Also, why are we doing this in the case of a file time update or a
modebits update? It seems relevant only for the case of a size update.

> +
> nfs_fattr_init(fattr);
>
> /* Search for an existing open(O_WRITE) file */
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index a308f3c..3506ad4 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -64,12 +64,18 @@ enum {
> NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
> };
>
> +enum layoutdriver_policy_flags {
> + /* Should the pNFS client commit and return the layout upon a setattr */
> + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
> +};
> +
> /* Per-layout driver specific registration structure */
> struct pnfs_layoutdriver_type {
> struct list_head pnfs_tblid;
> const u32 id;
> const char *name;
> struct module *owner;
> + unsigned flags;
> struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
> void (*free_lseg) (struct pnfs_layout_segment *lseg);
>
> @@ -228,6 +234,16 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
> put_lseg(req->wb_commit_lseg);
> }
>
> +/* Should the pNFS client commit and return the layout upon a setattr */
> +static inline bool
> +pnfs_ld_layoutret_on_setattr(struct inode *inode)
> +{
> + if (!pnfs_enabled_sb(NFS_SERVER(inode)))
> + return false;
> + return NFS_SERVER(inode)->pnfs_curr_ld->flags &
> + PNFS_LAYOUTRET_ON_SETATTR;
> +}
> +
> static inline int pnfs_return_layout(struct inode *ino,
> struct pnfs_layout_range *range,
> bool wait)
> @@ -290,6 +306,12 @@ static inline int pnfs_return_layout(struct inode *ino,
> }
>
> static inline bool
> +pnfs_ld_layoutret_on_setattr(struct inode *inode)
> +{
> + return false;
> +}
> +
> +static inline bool
> pnfs_roc(struct inode *ino)
> {
> return false;

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-22 08:49:04

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 07/27] pnfs: encode_layoutcommit

On 2011-04-20 23:18, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/nfs4xdr.c | 16 +++++++++++++---
>> fs/nfs/pnfs.h | 4 ++++
>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 6b64dd8..4f7bef9 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -1877,6 +1877,7 @@ encode_layoutget(struct xdr_stream *xdr,
>>
>> static int
>> encode_layoutcommit(struct xdr_stream *xdr,
>> + struct inode *inode,
>> const struct nfs4_layoutcommit_args *args,
>> struct compound_hdr *hdr)
>> {
>> @@ -1885,7 +1886,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
>> dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
>> NFS_SERVER(args->inode)->pnfs_curr_ld->id);
>>
>> - p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
>> + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
>> *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
>> /* Only whole file layouts */
>> p = xdr_encode_hyper(p, 0); /* offset */
>> @@ -1896,7 +1897,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
>> p = xdr_encode_hyper(p, args->lastbytewritten);
>> *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
>> *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
>> - *p++ = cpu_to_be32(0); /* no file layout payload */
>> +
>> + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
>> + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
>> + NFS_I(inode)->layout, xdr, args);
>> + else {
>> + p = reserve_space(xdr, 4);
>> + *p = cpu_to_be32(0); /* no layout-type payload */
>> + }
>>
>> hdr->nops++;
>> hdr->replen += decode_layoutcommit_maxsz;
>> @@ -2759,6 +2767,8 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
>> struct xdr_stream *xdr,
>> struct nfs4_layoutcommit_args *args)
>> {
>> + struct nfs4_layoutcommit_data *data =
>> + container_of(args, struct nfs4_layoutcommit_data, args);
>> struct compound_hdr hdr = {
>> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
>> };
>> @@ -2766,7 +2776,7 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
>> encode_compound_hdr(xdr, req, &hdr);
>> encode_sequence(xdr, &args->seq_args, &hdr);
>> encode_putfh(xdr, NFS_FH(args->inode), &hdr);
>> - encode_layoutcommit(xdr, args, &hdr);
>> + encode_layoutcommit(xdr, data->args.inode, args, &hdr);
>> encode_getfattr(xdr, args->bitmask, &hdr);
>> encode_nops(&hdr);
>> }
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 51dcbc1..011885e 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -99,6 +99,10 @@ struct pnfs_layoutdriver_type {
>> /* device notification methods */
>> void (*delete_deviceid)(struct nfs4_deviceid *);
>>
>> + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
>> + struct xdr_stream *xdr,
>> + const struct nfs4_layoutcommit_args *args);
>
> This too is way too ugly. Can't the layout payload be pre-encoded by the
> layout driver?
>
>

The contents for the objects layout driver are dynamic, representing
attribute-level metadata rather than something derived from the layout
that can be pre-encoded. In addition, I'd like to avoid having to
allocate and free a buffer for this data, and would rather have the
layout driver encode whatever it needs in-line.

Benny

2011-04-20 20:36:17

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 11/27] pnfs: per mount layout driver private data

On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> include/linux/nfs_fs_sb.h | 3 +++
> 1 files changed, 3 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 216cea5..c5b3fd0 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -142,6 +142,9 @@ struct nfs_server {
> filesystem */
> struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
> struct rpc_wait_queue roc_rpcwaitq;
> + void *pnfs_ld_data; /* Per-mount data */
> + unsigned int ds_rsize; /* Data server read size */
> + unsigned int ds_wsize; /* Data server write size */

Shouldn't the rsize and wsize be part of the layout driver private data?

>
> /* the following fields are protected by nfs_client->cl_lock */
> struct rb_root state_owners;

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 17:28:56

by Benny Halevy

[permalink] [raw]
Subject: [RFC 21/27] pnfs-obj: objio_osd real IO implementation

From: Boaz Harrosh <[email protected]>

Using the in-kernel osd library, implement read/write
of data from/to osd-objects according to the information specified
in the objects-layout.

TODO: Only a limited mirror arrangement is implemented. Striping/RAID
will come in later patches.

[pnfs-obj: objio: cleanup un-indent _read_mirrors]
Signed-off-by: Boaz Harrosh <[email protected]>
[added FIXME comment]
[use REQ flags rather than BIO flags]
As they were converged starting 2.6.36
[fix REQ flags usage]
fixes "pnfs-obj use REQ flags rather than BIO flags"

Cc: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 418 +++++++++++++++++++++++++++++++++++++++++-
1 files changed, 415 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index a24bf82..75d4ebb 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -142,11 +142,26 @@ struct objio_segment {
struct osd_dev *ods[1];
};

+struct objio_state;
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+
struct objio_state {
/* Generic layer */
struct objlayout_io_state ol_state;

struct objio_segment *objio_seg;
+
+ struct kref kref;
+ objio_done_fn done;
+ void *private;
+
+ unsigned long length;
+ unsigned numdevs; /* Actually used devs in this IO */
+ /* A per-device variable array of size numdevs */
+ struct _objio_per_comp {
+ struct bio *bio;
+ struct osd_request *or;
+ } per_dev[];
};

/* Send and wait for a get_device_info of devices in the layout,
@@ -232,6 +247,38 @@ out:
return err;
}

+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+ struct pnfs_osd_data_map *data_map = &layout->olo_map;
+
+/* FIXME: Only Mirror arangment for now. if not so, do not mount */
+ if (data_map->odm_group_width || data_map->odm_group_depth) {
+ printk(KERN_ERR "Group width/depth not supported\n");
+ return -ENOTSUPP;
+ }
+ if (data_map->odm_num_comps != layout->olo_num_comps) {
+ printk(KERN_ERR "odm_num_comps(%u) != olo_num_comps(%u)\n",
+ data_map->odm_num_comps, layout->olo_num_comps);
+ return -ENOTSUPP;
+ }
+ if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+ printk(KERN_ERR "Only RAID_0 for now\n");
+ return -ENOTSUPP;
+ }
+ if (data_map->odm_num_comps != data_map->odm_mirror_cnt + 1) {
+ printk(KERN_ERR "Mirror only!, num_comps=%u mirrors=%u\n",
+ data_map->odm_num_comps, data_map->odm_mirror_cnt);
+ return -ENOTSUPP;
+ }
+
+ if (data_map->odm_stripe_unit != PAGE_SIZE) {
+ printk(KERN_ERR "Stripe Unit != PAGE_SIZE not supported\n");
+ return -ENOTSUPP;
+ }
+
+ return 0;
+}
+
int objio_alloc_lseg(void **outp,
struct pnfs_layout_hdr *pnfslay,
struct pnfs_layout_segment *lseg,
@@ -240,6 +287,10 @@ int objio_alloc_lseg(void **outp,
struct objio_segment *objio_seg;
int err;

+ err = _verify_data_map(layout);
+ if (unlikely(err))
+ return err;
+
objio_seg = kzalloc(sizeof(*objio_seg) +
(layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
GFP_KERNEL);
@@ -272,8 +323,11 @@ int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
{
struct objio_segment *objio_seg = seg;
struct objio_state *ios;
+ const unsigned first_size = sizeof(*ios) +
+ objio_seg->num_comps * sizeof(ios->per_dev[0]);

- ios = kzalloc(sizeof(*ios), GFP_KERNEL);
+ dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
+ ios = kzalloc(first_size, GFP_KERNEL);
if (unlikely(!ios))
return -ENOMEM;

@@ -291,20 +345,378 @@ void objio_free_io_state(struct objlayout_io_state *ol_state)
kfree(ios);
}

+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+ switch (oep) {
+ case OSD_ERR_PRI_NO_ERROR:
+ return (enum pnfs_osd_errno)0;
+
+ case OSD_ERR_PRI_CLEAR_PAGES:
+ BUG_ON(1);
+ return 0;
+
+ case OSD_ERR_PRI_RESOURCE:
+ return PNFS_OSD_ERR_RESOURCE;
+ case OSD_ERR_PRI_BAD_CRED:
+ return PNFS_OSD_ERR_BAD_CRED;
+ case OSD_ERR_PRI_NO_ACCESS:
+ return PNFS_OSD_ERR_NO_ACCESS;
+ case OSD_ERR_PRI_UNREACHABLE:
+ return PNFS_OSD_ERR_UNREACHABLE;
+ case OSD_ERR_PRI_NOT_FOUND:
+ return PNFS_OSD_ERR_NOT_FOUND;
+ case OSD_ERR_PRI_NO_SPACE:
+ return PNFS_OSD_ERR_NO_SPACE;
+ default:
+ WARN_ON(1);
+ /* fallthrough */
+ case OSD_ERR_PRI_EIO:
+ return PNFS_OSD_ERR_EIO;
+ }
+}
+
+static void _clear_bio(struct bio *bio)
+{
+ struct bio_vec *bv;
+ unsigned i;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ unsigned this_count = bv->bv_len;
+
+ if (likely(PAGE_SIZE == this_count))
+ clear_highpage(bv->bv_page);
+ else
+ zero_user(bv->bv_page, bv->bv_offset, this_count);
+ }
+}
+
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+ enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+ int lin_ret = 0;
+ int i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_sense_info osi;
+ struct osd_request *or = ios->per_dev[i].or;
+ int ret;
+
+ if (!or)
+ continue;
+
+ ret = osd_req_decode_sense(or, &osi);
+ if (likely(!ret))
+ continue;
+
+ if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+ /* start read offset passed endof file */
+ BUG_ON(is_write);
+ _clear_bio(ios->per_dev[i].bio);
+ dprintk("%s: start read offset passed end of file "
+ "offset=0x%llx, length=0x%lx\n", __func__,
+ _LLU(ios->ol_state.offset), ios->length);
+
+ continue; /* we recovered */
+ }
+
+ if (osi.osd_err_pri >= oep) {
+ oep = osi.osd_err_pri;
+ lin_ret = ret;
+ }
+ }
+
+ return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+static void _io_free(struct objio_state *ios)
+{
+ unsigned i;
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct _objio_per_comp *per_dev = &ios->per_dev[i];
+
+ if (per_dev->or) {
+ osd_end_request(per_dev->or);
+ per_dev->or = NULL;
+ }
+
+ if (per_dev->bio) {
+ bio_put(per_dev->bio);
+ per_dev->bio = NULL;
+ }
+ }
+}
+
+static int _io_rw_pagelist(struct objio_state *ios)
+{
+ u64 length = ios->ol_state.count;
+ unsigned pgbase = ios->ol_state.pgbase;
+ unsigned nr_pages = ios->ol_state.nr_pages;
+ struct page **pages = ios->ol_state.pages;
+ struct bio *master_bio;
+ unsigned bio_size = min_t(unsigned, nr_pages, BIO_MAX_PAGES_KMALLOC);
+
+ master_bio = bio_kmalloc(GFP_KERNEL, bio_size);
+ if (unlikely(!master_bio)) {
+ dprintk("%s: Faild to alloc bio pages=%d\n",
+ __func__, bio_size);
+ return -ENOMEM;
+ }
+
+ ios->per_dev[0].bio = master_bio;
+
+ while (length) {
+ unsigned cur_len, added_len;
+
+ cur_len = min_t(u64, length, PAGE_SIZE - pgbase);
+
+ added_len = bio_add_pc_page(
+ osd_request_queue(ios->objio_seg->ods[0]),
+ master_bio, *pages, cur_len, pgbase);
+ if (unlikely(cur_len != added_len))
+ break;
+
+ pgbase = 0;
+ ++pages;
+ length -= cur_len;
+ ios->length += cur_len;
+ }
+
+ /* this should never happen */
+ WARN_ON(!ios->length);
+
+ return 0;
+}
+
+static ssize_t _sync_done(struct objio_state *ios)
+{
+ struct completion *waiting = ios->private;
+
+ complete(waiting);
+ return 0;
+}
+
+static void _last_io(struct kref *kref)
+{
+ struct objio_state *ios = container_of(kref, struct objio_state, kref);
+
+ ios->done(ios);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+ struct objio_state *ios = p;
+
+ kref_put(&ios->kref, _last_io);
+}
+
+static ssize_t _io_exec(struct objio_state *ios)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t status = 0; /* sync status */
+ unsigned i;
+ objio_done_fn saved_done_fn = ios->done;
+ bool sync = ios->ol_state.sync;
+
+ if (sync) {
+ ios->done = _sync_done;
+ ios->private = &wait;
+ }
+
+ kref_init(&ios->kref);
+
+ for (i = 0; i < ios->numdevs; i++) {
+ struct osd_request *or = ios->per_dev[i].or;
+
+ if (!or)
+ continue;
+
+ kref_get(&ios->kref);
+ osd_execute_request_async(or, _done_io, ios);
+ }
+
+ kref_put(&ios->kref, _last_io);
+
+ if (sync) {
+ wait_for_completion(&wait);
+ status = saved_done_fn(ios);
+ }
+
+ return status;
+}
+
/*
* read
*/
+static ssize_t _read_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, false);
+
+ _io_free(ios);
+
+ if (likely(!ret))
+ status = ios->length;
+ else
+ status = ret;
+
+ objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+ struct osd_request *or = NULL;
+ struct _objio_per_comp *per_dev = &ios->per_dev[0];
+ unsigned dev = 0;
+ struct pnfs_osd_object_cred *cred =
+ &ios->objio_seg->layout->olo_comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ int ret;
+
+ or = osd_start_request(ios->objio_seg->ods[dev], GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+ ios->numdevs++;
+
+ osd_req_read(or, &obj, ios->ol_state.offset, per_dev->bio, ios->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s: obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, obj.id, _LLU(ios->ol_state.offset), ios->length);
+ ios->done = _read_done;
+ return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
{
- return -EIO;
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ ret = _io_rw_pagelist(ios);
+ if (unlikely(ret))
+ return ret;
+
+ return _read_exec(ios);
}

/*
* write
*/
+static ssize_t _write_done(struct objio_state *ios)
+{
+ ssize_t status;
+ int ret = _io_check(ios, true);
+
+ _io_free(ios);
+
+ if (likely(!ret)) {
+ /* FIXME: should be based on the OSD's persistence model
+ * See OSD2r05 Section 4.13 Data persistence model */
+ ios->ol_state.committed = NFS_FILE_SYNC;
+ status = ios->length;
+ } else {
+ status = ret;
+ }
+
+ objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+ return status;
+}
+
+static int _write_exec(struct objio_state *ios)
+{
+ int i, ret;
+ struct bio *master_bio = ios->per_dev[0].bio;
+
+ for (i = 0; i < ios->objio_seg->num_comps; i++) {
+ struct osd_request *or = NULL;
+ struct pnfs_osd_object_cred *cred =
+ &ios->objio_seg->layout->olo_comps[i];
+ struct osd_obj_id obj = {cred->oc_object_id.oid_partition_id,
+ cred->oc_object_id.oid_object_id};
+ struct _objio_per_comp *per_dev = &ios->per_dev[i];
+ struct bio *bio;
+
+ or = osd_start_request(ios->objio_seg->ods[i], GFP_KERNEL);
+ if (unlikely(!or)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ per_dev->or = or;
+ ios->numdevs++;
+
+ if (i != 0) {
+ bio = bio_kmalloc(GFP_KERNEL, master_bio->bi_max_vecs);
+ if (unlikely(!bio)) {
+ dprintk("Faild to allocate BIO size=%u\n",
+ master_bio->bi_max_vecs);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ __bio_clone(bio, master_bio);
+ bio->bi_bdev = NULL;
+ bio->bi_next = NULL;
+ per_dev->bio = bio;
+ } else {
+ bio = master_bio;
+ bio->bi_rw |= REQ_WRITE;
+ }
+
+ osd_req_write(or, &obj, ios->ol_state.offset, bio, ios->length);
+
+ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+ if (ret) {
+ dprintk("%s: Faild to osd_finalize_request() => %d\n",
+ __func__, ret);
+ goto err;
+ }
+
+ dprintk("%s: [%d] obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, i, obj.id, _LLU(ios->ol_state.offset),
+ ios->length);
+ }
+
+ ios->done = _write_done;
+ return _io_exec(ios); /* In sync mode exec returns the io->status */
+
+err:
+ _io_free(ios);
+ return ret;
+}
+
ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
{
- return -EIO;
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+ int ret;
+
+ /* TODO: ios->stable = stable; */
+ ret = _io_rw_pagelist(ios);
+ if (unlikely(ret))
+ return ret;
+
+ return _write_exec(ios);
}

/*
--
1.7.3.4


2011-04-20 17:28:27

by Benny Halevy

[permalink] [raw]
Subject: [RFC 17/27] exofs: pnfs-tree: Remove pnfs-osd private definitions

From: Boaz Harrosh <[email protected]>

Now that pnfs-osd has hit mainline we can remove exofs's
private header. (And the FIXME comment)

Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/exofs/exofs.h | 6 +-----
fs/exofs/pnfs.h | 45 ---------------------------------------------
2 files changed, 1 insertions(+), 50 deletions(-)
delete mode 100644 fs/exofs/pnfs.h

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c965806..e103dbd 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -36,13 +36,9 @@
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
+#include <linux/pnfs_osd_xdr.h>
#include "common.h"

-/* FIXME: Remove once pnfs hits mainline
- * #include <linux/exportfs/pnfs_osd_xdr.h>
- */
-#include "pnfs.h"
-
#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)

#ifdef CONFIG_EXOFS_DEBUG
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
deleted file mode 100644
index c52e988..0000000
--- a/fs/exofs/pnfs.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <[email protected]>
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify it under the
- * terms of the GNU General Public License version 2 as published by the Free
- * Software Foundation.
- *
- */
-
-/* FIXME: Remove this file once pnfs hits mainline */
-
-#ifndef __EXOFS_PNFS_H__
-#define __EXOFS_PNFS_H__
-
-#if ! defined(__PNFS_OSD_XDR_H__)
-
-enum pnfs_iomode {
- IOMODE_READ = 1,
- IOMODE_RW = 2,
- IOMODE_ANY = 3,
-};
-
-/* Layout Structure */
-enum pnfs_osd_raid_algorithm4 {
- PNFS_OSD_RAID_0 = 1,
- PNFS_OSD_RAID_4 = 2,
- PNFS_OSD_RAID_5 = 3,
- PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
-};
-
-struct pnfs_osd_data_map {
- u32 odm_num_comps;
- u64 odm_stripe_unit;
- u32 odm_group_width;
- u32 odm_group_depth;
- u32 odm_mirror_cnt;
- u32 odm_raid_algorithm;
-};
-
-#endif /* ! defined(__PNFS_OSD_XDR_H__) */
-
-#endif /* __EXOFS_PNFS_H__ */
--
1.7.3.4


2011-04-20 17:29:39

by Benny Halevy

[permalink] [raw]
Subject: [RFC 27/27] pnfs-obj: objio_osd: groups support

From: Boaz Harrosh <[email protected]>

* _calc_stripe_info() changes to accommodate grouping
calculations. It now returns additional information.

* old _prepare_pages() becomes _prepare_one_group()
which stores pages belonging to one device group.

* Iterates on all groups calling _prepare_one_group().

* Enable mounting of groups data_maps (group_width != 0)

TODO:
Support for partial layouts will come in the next patch

[Support partial layouts]
Signed-off-by: Boaz Harrosh <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 135 +++++++++++++++++++++++++++++++++---------
1 files changed, 106 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5c141d0..47e8695 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -140,6 +140,8 @@ struct objio_segment {
unsigned mirrors_p1;
unsigned stripe_unit;
unsigned group_width; /* Data stripe_units without integrity comps */
+ u64 group_depth;
+ unsigned group_count;

unsigned num_comps;
/* variable length */
@@ -258,12 +260,9 @@ static int _verify_data_map(struct pnfs_osd_layout *layout)
{
struct pnfs_osd_data_map *data_map = &layout->olo_map;
u64 stripe_length;
+ u32 group_width;

-/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
- if (data_map->odm_group_width || data_map->odm_group_depth) {
- printk(KERN_ERR "Group width/depth not supported\n");
- return -ENOTSUPP;
- }
+/* FIXME: Only raid0 for now. if not go through MDS */
if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
printk(KERN_ERR "Only RAID_0 for now\n");
return -ENOTSUPP;
@@ -274,8 +273,13 @@ static int _verify_data_map(struct pnfs_osd_layout *layout)
return -EINVAL;
}

- stripe_length = data_map->odm_stripe_unit * (data_map->odm_num_comps /
- (data_map->odm_mirror_cnt + 1));
+ if (data_map->odm_group_width)
+ group_width = data_map->odm_group_width;
+ else
+ group_width = data_map->odm_num_comps /
+ (data_map->odm_mirror_cnt + 1);
+
+ stripe_length = (u64)data_map->odm_stripe_unit * group_width;
if (stripe_length >= (1ULL << 32)) {
printk(KERN_ERR "Total Stripe length(0x%llx)"
" >= 32bit is not supported\n", _LLU(stripe_length));
@@ -317,8 +321,18 @@ int objio_alloc_lseg(void **outp,

objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
- objio_seg->group_width = layout->olo_map.odm_num_comps /
- objio_seg->mirrors_p1;
+ if (layout->olo_map.odm_group_width) {
+ objio_seg->group_width = layout->olo_map.odm_group_width;
+ objio_seg->group_depth = layout->olo_map.odm_group_depth;
+ objio_seg->group_count = layout->olo_map.odm_num_comps /
+ objio_seg->mirrors_p1 /
+ objio_seg->group_width;
+ } else {
+ objio_seg->group_width = layout->olo_map.odm_num_comps /
+ objio_seg->mirrors_p1;
+ objio_seg->group_depth = -1;
+ objio_seg->group_count = 1;
+ }

*outp = objio_seg;
return 0;
@@ -489,6 +503,9 @@ struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)

struct _striping_info {
u64 obj_offset;
+ u64 group_length;
+ u64 total_group_length;
+ u64 Major;
unsigned dev;
unsigned unit_off;
};
@@ -498,15 +515,34 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
{
u32 stripe_unit = ios->objio_seg->stripe_unit;
u32 group_width = ios->objio_seg->group_width;
+ u64 group_depth = ios->objio_seg->group_depth;
u32 U = stripe_unit * group_width;

- u32 LmodU;
- u64 N = div_u64_rem(file_offset, U, &LmodU);
+ u64 T = U * group_depth;
+ u64 S = T * ios->objio_seg->group_count;
+ u64 M = div64_u64(file_offset, S);
+
+ /*
+ G = (L - (M * S)) / T
+ H = (L - (M * S)) % T
+ */
+ u64 LmodU = file_offset - M * S;
+ u32 G = div64_u64(LmodU, T);
+ u64 H = LmodU - G * T;
+
+ u32 N = div_u64(H, U);
+
+ div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+ si->obj_offset = si->unit_off + (N * stripe_unit) +
+ (M * group_depth * stripe_unit);

- si->unit_off = LmodU % stripe_unit;
- si->obj_offset = N * stripe_unit + si->unit_off;
- si->dev = LmodU / stripe_unit;
+ /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
si->dev *= ios->objio_seg->mirrors_p1;
+
+ si->group_length = T - H;
+ si->total_group_length = T;
+ si->Major = M;
}

static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
@@ -553,15 +589,18 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
return 0;
}

-static int _prepare_pages(struct objio_state *ios, struct _striping_info *si)
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+ struct _striping_info *si, unsigned first_comp,
+ unsigned *last_pg)
{
- u64 length = ios->ol_state.count;
unsigned stripe_unit = ios->objio_seg->stripe_unit;
unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
+ unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
unsigned dev = si->dev;
- unsigned comp = 0;
- unsigned stripes = 0;
- unsigned cur_pg = 0;
+ unsigned first_dev = dev - (dev % devs_in_group);
+ unsigned comp = first_comp + (dev - first_dev);
+ unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+ unsigned cur_pg = *last_pg;
int ret = 0;

while (length) {
@@ -585,10 +624,11 @@ static int _prepare_pages(struct objio_state *ios, struct _striping_info *si)
cur_len = stripe_unit;
}

- stripes++;
+ if (max_comp < comp)
+ max_comp = comp;

dev += mirrors_p1;
- dev %= ios->ol_state.num_comps;
+ dev = (dev % devs_in_group) + first_dev;
} else {
cur_len = stripe_unit;
}
@@ -601,25 +641,58 @@ static int _prepare_pages(struct objio_state *ios, struct _striping_info *si)
goto out;

comp += mirrors_p1;
- comp %= ios->ol_state.num_comps;
+ comp = (comp % devs_in_group) + first_comp;

length -= cur_len;
ios->length += cur_len;
}
out:
- if (!ios->length)
- return ret;
-
- ios->numdevs = stripes * mirrors_p1;
- return 0;
+ ios->numdevs = max_comp + mirrors_p1;
+ *last_pg = cur_pg;
+ return ret;
}

static int _io_rw_pagelist(struct objio_state *ios)
{
+ u64 length = ios->ol_state.count;
struct _striping_info si;
+ unsigned devs_in_group = ios->objio_seg->group_width *
+ ios->objio_seg->mirrors_p1;
+ unsigned first_comp = 0;
+ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
+ unsigned last_pg = 0;
+ int ret = 0;

- _calc_stripe_info(ios, ios->ol_state.count, &si);
- return _prepare_pages(ios, &si);
+ _calc_stripe_info(ios, ios->ol_state.offset, &si);
+ while (length) {
+ if (length < si.group_length)
+ si.group_length = length;
+
+ ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
+ &last_pg);
+ if (unlikely(ret))
+ goto out;
+
+ length -= si.group_length;
+
+ si.group_length = si.total_group_length;
+ si.unit_off = 0;
+ ++si.Major;
+ si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
+ ios->objio_seg->group_depth;
+
+ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+ si.dev %= num_comps;
+
+ first_comp += devs_in_group;
+ first_comp %= num_comps;
+ }
+
+out:
+ if (!ios->length)
+ return ret;
+
+ return 0;
}

static ssize_t _sync_done(struct objio_state *ios)
@@ -741,6 +814,8 @@ static ssize_t _read_exec(struct objio_state *ios)
int ret;

for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
ret = _read_mirrors(ios, i);
if (unlikely(ret))
goto err;
@@ -861,6 +936,8 @@ static ssize_t _write_exec(struct objio_state *ios)
int ret;

for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ if (!ios->per_dev[i].length)
+ continue;
ret = _write_mirrors(ios, i);
if (unlikely(ret))
goto err;
--
1.7.3.4


2011-04-20 17:29:32

by Benny Halevy

[permalink] [raw]
Subject: [RFC 26/27] pnfs-obj: objio_osd: RAID0 support

From: Boaz Harrosh <[email protected]>

Support for striping over mirrors with a received stripe_unit.
There are, however, a few constraints and unsupported cases:
1. Stripe Unit must be a multiple of PAGE_SIZE
2. Stripe length (stripe_unit * number_of_stripes) cannot be
bigger than 32 bits.
3. Group width/depth is not yet supported

[pnfs-obj: RAID0 micro optimization and cleanups]
[pnfs-obj: objio_osd: Prepare for groups]
[Support partial layouts]
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 293 ++++++++++++++++++++++++++++++++----------
1 files changed, 227 insertions(+), 66 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 179dfbd..5c141d0 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -137,6 +137,10 @@ out:
struct objio_segment {
struct pnfs_osd_layout *layout;

+ unsigned mirrors_p1;
+ unsigned stripe_unit;
+ unsigned group_width; /* Data stripe_units without integrity comps */
+
unsigned num_comps;
/* variable length */
struct osd_dev *ods[1];
@@ -161,6 +165,9 @@ struct objio_state {
struct _objio_per_comp {
struct bio *bio;
struct osd_request *or;
+ unsigned long length;
+ u64 offset;
+ unsigned dev;
} per_dev[];
};

@@ -250,29 +257,35 @@ out:
static int _verify_data_map(struct pnfs_osd_layout *layout)
{
struct pnfs_osd_data_map *data_map = &layout->olo_map;
+ u64 stripe_length;

-/* FIXME: Only Mirror arangment for now. if not so, do not mount */
+/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
if (data_map->odm_group_width || data_map->odm_group_depth) {
printk(KERN_ERR "Group width/depth not supported\n");
return -ENOTSUPP;
}
- if (data_map->odm_num_comps != layout->olo_num_comps) {
- printk(KERN_ERR "odm_num_comps(%u) != olo_num_comps(%u)\n",
- data_map->odm_num_comps, layout->olo_num_comps);
- return -ENOTSUPP;
- }
if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
printk(KERN_ERR "Only RAID_0 for now\n");
return -ENOTSUPP;
}
- if (data_map->odm_num_comps != data_map->odm_mirror_cnt + 1) {
- printk(KERN_ERR "Mirror only!, num_comps=%u mirrors=%u\n",
+ if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
+ printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
data_map->odm_num_comps, data_map->odm_mirror_cnt);
+ return -EINVAL;
+ }
+
+ stripe_length = data_map->odm_stripe_unit * (data_map->odm_num_comps /
+ (data_map->odm_mirror_cnt + 1));
+ if (stripe_length >= (1ULL << 32)) {
+ printk(KERN_ERR "Total Stripe length(0x%llx)"
+ " >= 32bit is not supported\n", _LLU(stripe_length));
return -ENOTSUPP;
}

- if (data_map->odm_stripe_unit != PAGE_SIZE) {
- printk(KERN_ERR "Stripe Unit != PAGE_SIZE not supported\n");
+ if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+ printk(KERN_ERR "Stripe Unit(0x%llx)"
+ " must be Multples of PAGE_SIZE(0x%lx)\n",
+ _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
return -ENOTSUPP;
}

@@ -302,6 +315,11 @@ int objio_alloc_lseg(void **outp,
if (err)
goto free_seg;

+ objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
+ objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
+ objio_seg->group_width = layout->olo_map.odm_num_comps /
+ objio_seg->mirrors_p1;
+
*outp = objio_seg;
return 0;

@@ -418,13 +436,15 @@ static int _io_check(struct objio_state *ios, bool is_write)
_clear_bio(ios->per_dev[i].bio);
dprintk("%s: start read offset passed end of file "
"offset=0x%llx, length=0x%lx\n", __func__,
- _LLU(ios->ol_state.offset), ios->length);
+ _LLU(ios->per_dev[i].offset),
+ ios->per_dev[i].length);

continue; /* we recovered */
}
- objlayout_io_set_result(&ios->ol_state, i,
+ objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
osd_pri_2_pnfs_err(osi.osd_err_pri),
- ios->ol_state.offset, ios->length,
+ ios->per_dev[i].offset,
+ ios->per_dev[i].length,
is_write);

if (osi.osd_err_pri >= oep) {
@@ -458,47 +478,150 @@ static void _io_free(struct objio_state *ios)
}
}

-static int _io_rw_pagelist(struct objio_state *ios)
+struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
{
- u64 length = ios->ol_state.count;
- unsigned pgbase = ios->ol_state.pgbase;
- unsigned nr_pages = ios->ol_state.nr_pages;
- struct page **pages = ios->ol_state.pages;
- struct bio *master_bio;
- unsigned bio_size = min_t(unsigned, nr_pages, BIO_MAX_PAGES_KMALLOC);
-
- master_bio = bio_kmalloc(GFP_KERNEL, bio_size);
- if (unlikely(!master_bio)) {
- dprintk("%s: Faild to alloc bio pages=%d\n",
- __func__, bio_size);
- return -ENOMEM;
+ unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
+ unsigned max_dev = min_dev + ios->ol_state.num_comps;
+
+ BUG_ON(dev < min_dev || max_dev <= dev);
+ return ios->objio_seg->ods[dev - min_dev];
+}
+
+struct _striping_info {
+ u64 obj_offset;
+ unsigned dev;
+ unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+ struct _striping_info *si)
+{
+ u32 stripe_unit = ios->objio_seg->stripe_unit;
+ u32 group_width = ios->objio_seg->group_width;
+ u32 U = stripe_unit * group_width;
+
+ u32 LmodU;
+ u64 N = div_u64_rem(file_offset, U, &LmodU);
+
+ si->unit_off = LmodU % stripe_unit;
+ si->obj_offset = N * stripe_unit + si->unit_off;
+ si->dev = LmodU / stripe_unit;
+ si->dev *= ios->objio_seg->mirrors_p1;
+}
+
+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
+ unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
+{
+ unsigned pg = *cur_pg;
+ struct request_queue *q =
+ osd_request_queue(_io_od(ios, per_dev->dev));
+
+ per_dev->length += cur_len;
+
+ if (per_dev->bio == NULL) {
+ unsigned stripes = ios->ol_state.num_comps /
+ ios->objio_seg->mirrors_p1;
+ unsigned pages_in_stripe = stripes *
+ (ios->objio_seg->stripe_unit / PAGE_SIZE);
+ unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+ stripes;
+
+ per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+ if (unlikely(!per_dev->bio)) {
+ dprintk("Faild to allocate BIO size=%u\n", bio_size);
+ return -ENOMEM;
+ }
}

- ios->per_dev[0].bio = master_bio;
+ while (cur_len > 0) {
+ unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+ unsigned added_len;
+
+ BUG_ON(ios->ol_state.nr_pages <= pg);
+ cur_len -= pglen;
+
+ added_len = bio_add_pc_page(q, per_dev->bio,
+ ios->ol_state.pages[pg], pglen, pgbase);
+ if (unlikely(pglen != added_len))
+ return -ENOMEM;
+ pgbase = 0;
+ ++pg;
+ }
+ BUG_ON(cur_len);
+
+ *cur_pg = pg;
+ return 0;
+}
+
+static int _prepare_pages(struct objio_state *ios, struct _striping_info *si)
+{
+ u64 length = ios->ol_state.count;
+ unsigned stripe_unit = ios->objio_seg->stripe_unit;
+ unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
+ unsigned dev = si->dev;
+ unsigned comp = 0;
+ unsigned stripes = 0;
+ unsigned cur_pg = 0;
+ int ret = 0;

while (length) {
- unsigned cur_len, added_len;
+ struct _objio_per_comp *per_dev = &ios->per_dev[comp];
+ unsigned cur_len, page_off = 0;
+
+ if (!per_dev->length) {
+ per_dev->dev = dev;
+ if (dev < si->dev) {
+ per_dev->offset = si->obj_offset + stripe_unit -
+ si->unit_off;
+ cur_len = stripe_unit;
+ } else if (dev == si->dev) {
+ per_dev->offset = si->obj_offset;
+ cur_len = stripe_unit - si->unit_off;
+ page_off = si->unit_off & ~PAGE_MASK;
+ BUG_ON(page_off &&
+ (page_off != ios->ol_state.pgbase));
+ } else { /* dev > si->dev */
+ per_dev->offset = si->obj_offset - si->unit_off;
+ cur_len = stripe_unit;
+ }

- cur_len = min_t(u64, length, PAGE_SIZE - pgbase);
+ stripes++;

- added_len = bio_add_pc_page(
- osd_request_queue(ios->objio_seg->ods[0]),
- master_bio, *pages, cur_len, pgbase);
- if (unlikely(cur_len != added_len))
- break;
+ dev += mirrors_p1;
+ dev %= ios->ol_state.num_comps;
+ } else {
+ cur_len = stripe_unit;
+ }
+ if (cur_len >= length)
+ cur_len = length;
+
+ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+ cur_len);
+ if (unlikely(ret))
+ goto out;
+
+ comp += mirrors_p1;
+ comp %= ios->ol_state.num_comps;

- pgbase = 0;
- ++pages;
length -= cur_len;
ios->length += cur_len;
}
+out:
+ if (!ios->length)
+ return ret;

- /* this should never happen */
- WARN_ON(!ios->length);
-
+ ios->numdevs = stripes * mirrors_p1;
return 0;
}

+static int _io_rw_pagelist(struct objio_state *ios)
+{
+ struct _striping_info si;
+
+ _calc_stripe_info(ios, ios->ol_state.count, &si);
+ return _prepare_pages(ios, &si);
+}
+
static ssize_t _sync_done(struct objio_state *ios)
{
struct completion *waiting = ios->private;
@@ -575,11 +698,11 @@ static ssize_t _read_done(struct objio_state *ios)
return status;
}

-static ssize_t _read_exec(struct objio_state *ios)
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
{
struct osd_request *or = NULL;
- struct _objio_per_comp *per_dev = &ios->per_dev[0];
- unsigned dev = 0;
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+ unsigned dev = per_dev->dev;
struct pnfs_osd_object_cred *cred =
&ios->objio_seg->layout->olo_comps[dev];
struct osd_obj_id obj = {
@@ -588,15 +711,14 @@ static ssize_t _read_exec(struct objio_state *ios)
};
int ret;

- or = osd_start_request(ios->objio_seg->ods[dev], GFP_KERNEL);
+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
if (unlikely(!or)) {
ret = -ENOMEM;
goto err;
}
per_dev->or = or;
- ios->numdevs++;

- osd_req_read(or, &obj, ios->ol_state.offset, per_dev->bio, ios->length);
+ osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);

ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
if (ret) {
@@ -605,8 +727,25 @@ static ssize_t _read_exec(struct objio_state *ios)
goto err;
}

- dprintk("%s: obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, obj.id, _LLU(ios->ol_state.offset), ios->length);
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+
+err:
+ return ret;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ ret = _read_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
+ }
+
ios->done = _read_done;
return _io_exec(ios); /* In sync mode exec returns the io status */

@@ -651,47 +790,54 @@ static ssize_t _write_done(struct objio_state *ios)
return status;
}

-static int _write_exec(struct objio_state *ios)
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
{
- int i, ret;
- struct bio *master_bio = ios->per_dev[0].bio;
+ struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+ unsigned dev = ios->per_dev[cur_comp].dev;
+ unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
+ int ret;

- for (i = 0; i < ios->objio_seg->num_comps; i++) {
+ for (; cur_comp < last_comp; ++cur_comp, ++dev) {
struct osd_request *or = NULL;
struct pnfs_osd_object_cred *cred =
- &ios->objio_seg->layout->olo_comps[i];
- struct osd_obj_id obj = {cred->oc_object_id.oid_partition_id,
- cred->oc_object_id.oid_object_id};
- struct _objio_per_comp *per_dev = &ios->per_dev[i];
+ &ios->objio_seg->layout->olo_comps[dev];
+ struct osd_obj_id obj = {
+ .partition = cred->oc_object_id.oid_partition_id,
+ .id = cred->oc_object_id.oid_object_id,
+ };
+ struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
struct bio *bio;

- or = osd_start_request(ios->objio_seg->ods[i], GFP_KERNEL);
+ or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
if (unlikely(!or)) {
ret = -ENOMEM;
goto err;
}
per_dev->or = or;
- ios->numdevs++;

- if (i != 0) {
- bio = bio_kmalloc(GFP_KERNEL, master_bio->bi_max_vecs);
+ if (per_dev != master_dev) {
+ bio = bio_kmalloc(GFP_KERNEL,
+ master_dev->bio->bi_max_vecs);
if (unlikely(!bio)) {
dprintk("Faild to allocate BIO size=%u\n",
- master_bio->bi_max_vecs);
+ master_dev->bio->bi_max_vecs);
ret = -ENOMEM;
goto err;
}

- __bio_clone(bio, master_bio);
+ __bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
per_dev->bio = bio;
+ per_dev->dev = dev;
+ per_dev->length = master_dev->length;
+ per_dev->offset = master_dev->offset;
} else {
- bio = master_bio;
+ bio = master_dev->bio;
bio->bi_rw |= REQ_WRITE;
}

- osd_req_write(or, &obj, ios->ol_state.offset, bio, ios->length);
+ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);

ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
if (ret) {
@@ -700,9 +846,24 @@ static int _write_exec(struct objio_state *ios)
goto err;
}

- dprintk("%s: [%d] obj=0x%llx start=0x%llx length=0x%lx\n",
- __func__, i, obj.id, _LLU(ios->ol_state.offset),
- ios->length);
+ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+ per_dev->length);
+ }
+
+err:
+ return ret;
+}
+
+static ssize_t _write_exec(struct objio_state *ios)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+ ret = _write_mirrors(ios, i);
+ if (unlikely(ret))
+ goto err;
}

ios->done = _write_done;
--
1.7.3.4


2011-04-20 17:26:28

by Benny Halevy

[permalink] [raw]
Subject: [RFC 02/27] pnfs: direct i/o

From: Andy Adamson <[email protected]>

Signed-off-by: Dean Hildebrand <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/direct.c | 160 +++++++++++++++++++++++++++++++-----------------------
1 files changed, 92 insertions(+), 68 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8eea253..55dffb7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -272,6 +272,38 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
.rpc_release = nfs_direct_read_release,
};

+static long nfs_direct_read_execute(struct nfs_read_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct inode *inode = data->inode;
+ struct rpc_task *task;
+
+ nfs_fattr_init(&data->fattr);
+ msg->rpc_argp = &data->args;
+ msg->rpc_resp = &data->res;
+
+ task_setup_data->task = &data->task;
+ task_setup_data->callback_data = data;
+ NFS_PROTO(inode)->read_setup(data, msg);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct read call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ return 0;
+}
+
/*
* For each rsize'd chunk of the user's buffer, dispatch an NFS READ
* operation. If nfs_readdata_alloc() or get_user_pages() fails,
@@ -288,7 +320,6 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
unsigned long user_addr = (unsigned long)iov->iov_base;
size_t count = iov->iov_len;
size_t rsize = NFS_SERVER(inode)->rsize;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = ctx->cred,
};
@@ -349,26 +380,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->res.fattr = &data->fattr;
data->res.eof = 0;
data->res.count = bytes;
- nfs_fattr_init(&data->fattr);
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;

- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
+ if (nfs_direct_read_execute(data, &task_setup_data, &msg))
break;
- rpc_put_task(task);
-
- dprintk("NFS: %5u initiated direct read call "
- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- bytes,
- (unsigned long long)data->args.offset);

started += bytes;
user_addr += bytes;
@@ -461,12 +475,15 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
}

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static long nfs_direct_write_execute(struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg);
+
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
struct list_head *p;
struct nfs_write_data *data;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = dreq->ctx->cred,
};
@@ -500,25 +517,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
* Reuse data->task; data->args should not have changed
* since the original request was sent.
*/
- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- /*
- * We're called via an RPC callback, so BKL is already held.
- */
- task = rpc_run_task(&task_setup_data);
- if (!IS_ERR(task))
- rpc_put_task(task);
-
- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ nfs_direct_write_execute(data, &task_setup_data, &msg);
}

if (put_dreq(dreq))
@@ -561,10 +560,31 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
.rpc_release = nfs_direct_commit_release,
};

+static long nfs_direct_commit_execute(struct nfs_direct_req *dreq,
+ struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct rpc_task *task;
+
+ NFS_PROTO(data->inode)->commit_setup(data, msg);
+
+ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+ dreq->commit_data = NULL;
+
+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+ return 0;
+}
+
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
struct nfs_write_data *data = dreq->commit_data;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &data->args,
.rpc_resp = &data->res,
@@ -593,16 +613,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);

- NFS_PROTO(data->inode)->commit_setup(data, &msg);
-
- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */
- dreq->commit_data = NULL;
-
- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
-
- task = rpc_run_task(&task_setup_data);
- if (!IS_ERR(task))
- rpc_put_task(task);
+ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg);
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
@@ -703,6 +714,36 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
.rpc_release = nfs_direct_write_release,
};

+static long nfs_direct_write_execute(struct nfs_write_data *data,
+ struct rpc_task_setup *task_setup_data,
+ struct rpc_message *msg)
+{
+ struct inode *inode = data->inode;
+ struct rpc_task *task;
+
+ task_setup_data->task = &data->task;
+ task_setup_data->callback_data = data;
+ msg->rpc_argp = &data->args;
+ msg->rpc_resp = &data->res;
+ NFS_PROTO(inode)->write_setup(data, msg);
+
+ task = rpc_run_task(task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ rpc_put_task(task);
+
+ dprintk("NFS: %5u initiated direct write call "
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ inode->i_sb->s_id,
+ (long long)NFS_FILEID(inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ return 0;
+}
+
/*
* For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
* operation. If nfs_writedata_alloc() or get_user_pages() fails,
@@ -718,7 +759,6 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
struct inode *inode = ctx->path.dentry->d_inode;
unsigned long user_addr = (unsigned long)iov->iov_base;
size_t count = iov->iov_len;
- struct rpc_task *task;
struct rpc_message msg = {
.rpc_cred = ctx->cred,
};
@@ -785,24 +825,8 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->res.verf = &data->verf;
nfs_fattr_init(&data->fattr);

- task_setup_data.task = &data->task;
- task_setup_data.callback_data = data;
- msg.rpc_argp = &data->args;
- msg.rpc_resp = &data->res;
- NFS_PROTO(inode)->write_setup(data, &msg);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
+ if (nfs_direct_write_execute(data, &task_setup_data, &msg))
break;
- rpc_put_task(task);
-
- dprintk("NFS: %5u initiated direct write call "
- "(req %s/%Ld, %zu bytes @ offset %Lu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- bytes,
- (unsigned long long)data->args.offset);

started += bytes;
user_addr += bytes;
--
1.7.3.4


2011-04-22 08:05:19

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 4/6] SQUASHME: remove return_type field from nfs4_layoutreturn_args

Currently, we only send layoutreturns for a single file.

squash into "pnfs: layoutreturn"

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 24 ++++++++----------------
fs/nfs/nfs4xdr.c | 32 ++++++++++++++------------------
fs/nfs/pnfs.c | 1 -
include/linux/nfs_xdr.h | 1 -
4 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4f637e9..059a74c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5697,23 +5697,20 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
if (!nfs4_sequence_done(task, &lrp->res.seq_res))
return;

- if (lrp->args.return_type == RETURN_FILE)
- server = NFS_SERVER(lrp->args.inode);
- else
- server = NULL;
+ server = NFS_SERVER(lrp->args.inode);
if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
nfs_restart_rpc(task, lrp->clp);
return;
}
- if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
+ if (task->tk_status == 0) {
struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;

- spin_lock(&lo->plh_inode->i_lock);
- if (lrp->res.lrs_present)
+ if (lrp->res.lrs_present) {
+ spin_lock(&lo->plh_inode->i_lock);
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- else
+ spin_unlock(&lo->plh_inode->i_lock);
+ } else
BUG_ON(!list_empty(&lo->plh_segs));
- spin_unlock(&lo->plh_inode->i_lock);
}
dprintk("<-- %s\n", __func__);
}
@@ -5722,13 +5719,8 @@ static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;

- dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
- if (lrp->args.return_type == RETURN_FILE) {
- struct inode *ino = lrp->args.inode;
- struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
-
- put_layout_hdr(lo);
- }
+ dprintk("--> %s\n", __func__);
+ put_layout_hdr(NFS_I(lrp->args.inode)->layout);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a21bbbe..8c0e589 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1963,24 +1963,20 @@ encode_layoutreturn(struct xdr_stream *xdr,
*p++ = cpu_to_be32(args->reclaim);
*p++ = cpu_to_be32(args->layout_type);
*p++ = cpu_to_be32(args->range.iomode);
- *p = cpu_to_be32(args->return_type);
- if (args->return_type == RETURN_FILE) {
- p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
- p = xdr_encode_hyper(p, args->range.offset);
- p = xdr_encode_hyper(p, args->range.length);
- spin_lock(&args->inode->i_lock);
- memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data,
- NFS4_STATEID_SIZE);
- spin_unlock(&args->inode->i_lock);
- p = xdr_encode_opaque_fixed(p, &stateid.data,
- NFS4_STATEID_SIZE);
- if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
- NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
- NFS_I(args->inode)->layout, xdr, args);
- } else {
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(0);
- }
+ *p = cpu_to_be32(RETURN_FILE);
+ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
+ spin_lock(&args->inode->i_lock);
+ memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data, NFS4_STATEID_SIZE);
+ spin_unlock(&args->inode->i_lock);
+ p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+ if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+ NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+ NFS_I(args->inode)->layout, xdr, args);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0);
}
hdr->nops++;
hdr->replen += decode_layoutreturn_maxsz;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1ec5bb8..b2066d2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -682,7 +682,6 @@ return_layout(struct inode *ino, struct pnfs_layout_range *range)
}
lrp->args.reclaim = 0;
lrp->args.layout_type = server->pnfs_curr_ld->id;
- lrp->args.return_type = RETURN_FILE;
lrp->args.range = *range;
lrp->args.inode = ino;
lrp->clp = server->nfs_client;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0f9dc74..6506432 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -272,7 +272,6 @@ struct nfs4_layoutcommit_data {
struct nfs4_layoutreturn_args {
__u32 reclaim;
__u32 layout_type;
- __u32 return_type;
struct pnfs_layout_range range;
struct inode *inode;
struct nfs4_sequence_args seq_args;
--
1.7.3.4


2011-04-22 08:27:03

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 06/27] pnfs: encode_layoutreturn

On 2011-04-20 23:16, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
>> From: Andy Adamson <[email protected]>
>>
>> Signed-off-by: Andy Adamson <[email protected]>
>> [fixup layout header pointer for encode_layoutreturn]
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/nfs4xdr.c | 9 +++++++--
>> fs/nfs/pnfs.h | 4 ++++
>> 2 files changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index 53ea3e5..6b64dd8 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -1927,8 +1927,13 @@ encode_layoutreturn(struct xdr_stream *xdr,
>> spin_unlock(&args->inode->i_lock);
>> p = xdr_encode_opaque_fixed(p, &stateid.data,
>> NFS4_STATEID_SIZE);
>> - p = reserve_space(xdr, 4);
>> - *p = cpu_to_be32(0);
>> + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
>> + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
>> + NFS_I(args->inode)->layout, xdr, args);
>> + } else {
>> + p = reserve_space(xdr, 4);
>> + *p = cpu_to_be32(0);
>> + }
>> }
>> hdr->nops++;
>> hdr->replen += decode_layoutreturn_maxsz;
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index c315109..51dcbc1 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -98,6 +98,10 @@ struct pnfs_layoutdriver_type {
>>
>> /* device notification methods */
>> void (*delete_deviceid)(struct nfs4_deviceid *);
>> +
>> + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
>> + struct xdr_stream *xdr,
>> + const struct nfs4_layoutreturn_args *args);
>
> Ugh, no... The only user of this is the OSD layout type, and it just
> uses it to pass error information. Why can't we just add that particular
> case to the layoutreturn XDR.
>

We can do that and add the vector with the blocks layout driver,
though in my opinion using the vector, even for the single use
case, is cleaner than special-casing the xdr code for the obj layout...

Benny

>> };
>>
>> struct pnfs_layout_hdr {
>


2011-04-20 20:43:58

by Trond Myklebust

[permalink] [raw]
Subject: Re: [RFC 12/27] pnfs: alloc and free layout_hdr layoutdriver methods

On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:

Why is this needed?

> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/pnfs.c | 21 ++++++++++++++++++---
> fs/nfs/pnfs.h | 3 +++
> 2 files changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index afc64b3..2254362 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -188,13 +188,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
> atomic_inc(&lo->plh_refcount);
> }
>
> +static struct pnfs_layout_hdr *
> +pnfs_alloc_layout_hdr(struct inode *ino)
> +{
> + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
> + return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino) :
> + kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);

BTW: GFP_KERNEL is a bug here. It should be GFP_NOFS or else we can
recurse back into the filesystem through the page reclaim code.

> +}
> +
> +static void
> +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
> +{
> + struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
> + return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
> +}
> +
> static void
> destroy_layout_hdr(struct pnfs_layout_hdr *lo)
> {
> dprintk("%s: freeing layout cache %p\n", __func__, lo);
> BUG_ON(!list_empty(&lo->plh_layouts));
> NFS_I(lo->plh_inode)->layout = NULL;
> - kfree(lo);
> + pnfs_free_layout_hdr(lo);
> }
>
> static void
> @@ -857,7 +872,7 @@ alloc_init_layout_hdr(struct inode *ino)
> {
> struct pnfs_layout_hdr *lo;
>
> - lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
> + lo = pnfs_alloc_layout_hdr(ino);
> if (!lo)
> return NULL;
> atomic_set(&lo->plh_refcount, 1);
> @@ -890,7 +905,7 @@ pnfs_find_alloc_layout(struct inode *ino)
> if (likely(nfsi->layout == NULL)) /* Won the race? */
> nfsi->layout = new;
> else
> - kfree(new);
> + pnfs_free_layout_hdr(new);
> return nfsi->layout;
> }
>
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index bb266ba..35662ac 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -83,6 +83,9 @@ struct pnfs_layoutdriver_type {
> int (*set_layoutdriver) (struct nfs_server *);
> int (*unset_layoutdriver) (struct nfs_server *);
>
> + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode);
> + void (*free_layout_hdr) (struct pnfs_layout_hdr *);
> +
> struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
> void (*free_lseg) (struct pnfs_layout_segment *lseg);
>



2011-04-20 17:27:12

by Benny Halevy

[permalink] [raw]
Subject: [RFC 07/27] pnfs: encode_layoutcommit

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4xdr.c | 16 +++++++++++++---
fs/nfs/pnfs.h | 4 ++++
2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6b64dd8..4f7bef9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1877,6 +1877,7 @@ encode_layoutget(struct xdr_stream *xdr,

static int
encode_layoutcommit(struct xdr_stream *xdr,
+ struct inode *inode,
const struct nfs4_layoutcommit_args *args,
struct compound_hdr *hdr)
{
@@ -1885,7 +1886,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
NFS_SERVER(args->inode)->pnfs_curr_ld->id);

- p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
/* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */
@@ -1896,7 +1897,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten);
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
- *p++ = cpu_to_be32(0); /* no file layout payload */
+
+ if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+ NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+ NFS_I(inode)->layout, xdr, args);
+ else {
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(0); /* no layout-type payload */
+ }

hdr->nops++;
hdr->replen += decode_layoutcommit_maxsz;
@@ -2759,6 +2767,8 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
struct xdr_stream *xdr,
struct nfs4_layoutcommit_args *args)
{
+ struct nfs4_layoutcommit_data *data =
+ container_of(args, struct nfs4_layoutcommit_data, args);
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2766,7 +2776,7 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, NFS_FH(args->inode), &hdr);
- encode_layoutcommit(xdr, args, &hdr);
+ encode_layoutcommit(xdr, data->args.inode, args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 51dcbc1..011885e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -99,6 +99,10 @@ struct pnfs_layoutdriver_type {
/* device notification methods */
void (*delete_deviceid)(struct nfs4_deviceid *);

+ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args);
+
void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
--
1.7.3.4


2011-04-22 09:03:37

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 09/27] pnfs: support for non-rpc layout drivers

On 2011-04-20 23:34, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
>> Non-rpc layout driver such as for objects and blocks
>> implement their own I/O path and error handling logic.
>> Therefore bypass NFS-based error handling for these layout drivers.
>>
>> Signed-off-by: Benny Halevy <[email protected]>
>> ---
>> fs/nfs/internal.h | 2 +
>> fs/nfs/nfs4filelayout.c | 1 +
>> fs/nfs/nfs4proc.c | 14 +++++++++++-
>> fs/nfs/pnfs.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
>> fs/nfs/pnfs.h | 7 +++++-
>> include/linux/nfs_xdr.h | 2 +
>> 6 files changed, 71 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index ce118ce..1914d2f 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -310,6 +310,8 @@ extern int nfs_migrate_page(struct address_space *,
>> #endif
>>
>> /* nfs4proc.c */
>> +extern void __nfs4_read_done_cb(struct nfs_read_data *);
>> +extern void __nfs4_write_done_cb(struct nfs_write_data *);
>> extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
>> extern int nfs4_init_client(struct nfs_client *clp,
>> const struct rpc_timeout *timeparms,
>> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
>> index 2feab7f..e67a0d4 100644
>> --- a/fs/nfs/nfs4filelayout.c
>> +++ b/fs/nfs/nfs4filelayout.c
>> @@ -859,6 +859,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
>> .id = LAYOUT_NFSV4_1_FILES,
>> .name = "LAYOUT_NFSV4_1_FILES",
>> .owner = THIS_MODULE,
>> + .flags = PNFS_USE_RPC_CODE,
>
> This isn't being used anywhere, so why do I need it in this patch?
>

Sorry, it's just leftovers from the previous version.
I'll get rid of it.

>> .alloc_lseg = filelayout_alloc_lseg,
>> .free_lseg = filelayout_free_lseg,
>> .pg_test = filelayout_pg_test,
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index d0eb50b..cc2cdcd 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -3149,6 +3149,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
>> return err;
>> }
>>
>> +void __nfs4_read_done_cb(struct nfs_read_data *data)
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^ why the wrapper?

To be called from nfs4_read_done_cb and from pnfs_read_done.
I can just call nfs_invalidate_atime from there but since
this is common logic I think the code, small as it is,
should be kept common.

By the way, what about filelayout_read_done_cb()?
Shouldn't it invalidate the inode's atime too?

>> +{
>> + nfs_invalidate_atime(data->inode);
>> +}
>> +
>> static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
>> {
>> struct nfs_server *server = NFS_SERVER(data->inode);
>> @@ -3158,7 +3163,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
>> return -EAGAIN;
>> }
>>
>> - nfs_invalidate_atime(data->inode);
>> + __nfs4_read_done_cb(data);
>> if (task->tk_status > 0)
>> renew_lease(server, data->timestamp);
>> return 0;
>> @@ -3198,6 +3203,11 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
>> }
>> EXPORT_SYMBOL_GPL(nfs4_reset_read);
>>
>> +void __nfs4_write_done_cb(struct nfs_write_data *data)
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Again, why the wrapper?
>
>> +{
>> + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);

Sorry, my bad; the pnfs done path just needs pnfs_set_layoutcommit().

>> +}
>> +
>> static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
>> {
>> struct inode *inode = data->inode;
>> @@ -3208,7 +3218,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
>> }
>> if (task->tk_status >= 0) {
>> renew_lease(NFS_SERVER(inode), data->timestamp);
>> - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
>> + __nfs4_write_done_cb(data);
>> }
>> return 0;
>> }
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index a5050d2..18ae397 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1130,6 +1130,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
>> pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
>> }
>>
>> +/*
>> + * Called by non rpc-based layout drivers
>> + */
>> +int
>> +pnfs_write_done(struct nfs_write_data *data)
> ^^^^^^^^^^^^^^^^^^ If this is not generic to all pnfs layout drivers,
> then why the apparently generic name?
>

Makes sense, how about pnfs_ld_write_done?

> Why isn't this being introduced together with a driver that actually
> uses the functionality? There is no way to review it outside of that
> context.
>

OK, will do in next version of the patch series.

Benny

>> +{
>> + int status;
>> +
>> + put_lseg(data->lseg);
>> + data->lseg = NULL;
>> + if (!data->pnfs_error) {
>> + __nfs4_write_done_cb(data);
>> + data->mds_ops->rpc_call_done(NULL, data);
>> + data->mds_ops->rpc_release(data);
>> + return 0;
>> + }
>> +
>> + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>> + data->pnfs_error);
>> + status = nfs_initiate_write(data, NFS_CLIENT(data->inode), data->mds_ops, NFS_FILE_SYNC);
>> + return status ? : -EAGAIN;
>> +}
>> +EXPORT_SYMBOL_GPL(pnfs_write_done);
>> +
>> enum pnfs_try_status
>> pnfs_try_to_write_data(struct nfs_write_data *wdata,
>> const struct rpc_call_ops *call_ops, int how)
>> @@ -1155,6 +1179,30 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>> }
>>
>> /*
>> + * Called by non rpc-based layout drivers
>> + */
>> +int
>> +pnfs_read_done(struct nfs_read_data *data)
>> +{
>> + int status;
>> +
>> + put_lseg(data->lseg);
>> + data->lseg = NULL;
>> + if (!data->pnfs_error) {
>> + __nfs4_read_done_cb(data);
>> + data->mds_ops->rpc_call_done(NULL, data);
>> + data->mds_ops->rpc_release(data);
>> + return 0;
>> + }
>> +
>> + dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>> + data->pnfs_error);
>> + status = nfs_initiate_read(data, NFS_CLIENT(data->inode), data->mds_ops);
>> + return status ? : -EAGAIN;
>> +}
>> +EXPORT_SYMBOL_GPL(pnfs_read_done);
>> +
>> +/*
>> * Call the appropriate parallel I/O subsystem read function.
>> */
>> enum pnfs_try_status
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 9f8e970..18b84ce 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -65,8 +65,11 @@ enum {
>> };
>>
>> enum layoutdriver_policy_flags {
>> + /* Should the full nfs rpc cleanup code be used after io */
>> + PNFS_USE_RPC_CODE = 1 << 0,
>> +
>> /* Should the pNFS client commit and return the layout upon a setattr */
>> - PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
>> + PNFS_LAYOUTRET_ON_SETATTR = 1 << 1,
>> };
>>
>> /* Per-layout driver specific registration structure */
>> @@ -182,6 +185,8 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
>> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
>> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
>> int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
>> +int pnfs_write_done(struct nfs_write_data *);
>> +int pnfs_read_done(struct nfs_read_data *);
>>
>> static inline int lo_fail_bit(u32 iomode)
>> {
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index 01eb1ae..41f896a 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -1108,6 +1108,7 @@ struct nfs_read_data {
>> const struct rpc_call_ops *mds_ops;
>> int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data);
>> __u64 mds_offset;
>> + int pnfs_error;
>> struct page *page_array[NFS_PAGEVEC_SIZE];
>> };
>>
>> @@ -1133,6 +1134,7 @@ struct nfs_write_data {
>> unsigned long timestamp; /* For lease renewal */
>> #endif
>> __u64 mds_offset; /* Filelayout dense stripe */
>> + int pnfs_error;
>> struct page *page_array[NFS_PAGEVEC_SIZE];
>> };
>>
>


2011-04-20 20:16:04

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 06/27] pnfs: encode_layoutreturn

On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Andy Adamson <[email protected]>
> [fixup layout header pointer for encode_layoutreturn]
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4xdr.c | 9 +++++++--
> fs/nfs/pnfs.h | 4 ++++
> 2 files changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 53ea3e5..6b64dd8 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -1927,8 +1927,13 @@ encode_layoutreturn(struct xdr_stream *xdr,
> spin_unlock(&args->inode->i_lock);
> p = xdr_encode_opaque_fixed(p, &stateid.data,
> NFS4_STATEID_SIZE);
> - p = reserve_space(xdr, 4);
> - *p = cpu_to_be32(0);
> + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
> + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
> + NFS_I(args->inode)->layout, xdr, args);
> + } else {
> + p = reserve_space(xdr, 4);
> + *p = cpu_to_be32(0);
> + }
> }
> hdr->nops++;
> hdr->replen += decode_layoutreturn_maxsz;
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index c315109..51dcbc1 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -98,6 +98,10 @@ struct pnfs_layoutdriver_type {
>
> /* device notification methods */
> void (*delete_deviceid)(struct nfs4_deviceid *);
> +
> + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
> + struct xdr_stream *xdr,
> + const struct nfs4_layoutreturn_args *args);

Ugh, no... The only user of this is the OSD layout type, and it just
uses it to pass error information. Why can't we just add that particular
case to the layoutreturn XDR?

> };
>
> struct pnfs_layout_hdr {

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 20:18:27

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 07/27] pnfs: encode_layoutcommit

On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4xdr.c | 16 +++++++++++++---
> fs/nfs/pnfs.h | 4 ++++
> 2 files changed, 17 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 6b64dd8..4f7bef9 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -1877,6 +1877,7 @@ encode_layoutget(struct xdr_stream *xdr,
>
> static int
> encode_layoutcommit(struct xdr_stream *xdr,
> + struct inode *inode,
> const struct nfs4_layoutcommit_args *args,
> struct compound_hdr *hdr)
> {
> @@ -1885,7 +1886,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
> dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
> NFS_SERVER(args->inode)->pnfs_curr_ld->id);
>
> - p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
> + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
> *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
> /* Only whole file layouts */
> p = xdr_encode_hyper(p, 0); /* offset */
> @@ -1896,7 +1897,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
> p = xdr_encode_hyper(p, args->lastbytewritten);
> *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
> *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
> - *p++ = cpu_to_be32(0); /* no file layout payload */
> +
> + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
> + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
> + NFS_I(inode)->layout, xdr, args);
> + else {
> + p = reserve_space(xdr, 4);
> + *p = cpu_to_be32(0); /* no layout-type payload */
> + }
>
> hdr->nops++;
> hdr->replen += decode_layoutcommit_maxsz;
> @@ -2759,6 +2767,8 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
> struct xdr_stream *xdr,
> struct nfs4_layoutcommit_args *args)
> {
> + struct nfs4_layoutcommit_data *data =
> + container_of(args, struct nfs4_layoutcommit_data, args);
> struct compound_hdr hdr = {
> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
> };
> @@ -2766,7 +2776,7 @@ static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
> encode_compound_hdr(xdr, req, &hdr);
> encode_sequence(xdr, &args->seq_args, &hdr);
> encode_putfh(xdr, NFS_FH(args->inode), &hdr);
> - encode_layoutcommit(xdr, args, &hdr);
> + encode_layoutcommit(xdr, data->args.inode, args, &hdr);
> encode_getfattr(xdr, args->bitmask, &hdr);
> encode_nops(&hdr);
> }
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 51dcbc1..011885e 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -99,6 +99,10 @@ struct pnfs_layoutdriver_type {
> /* device notification methods */
> void (*delete_deviceid)(struct nfs4_deviceid *);
>
> + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> + struct xdr_stream *xdr,
> + const struct nfs4_layoutcommit_args *args);

This too is way too ugly. Can't the layout payload be pre-encoded by the
layout driver?


--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-22 06:22:37

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 01/27] pnfs: CB_NOTIFY_DEVICEID

On 2011-04-20 22:41, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
>> From: Marc Eshel <[email protected]>
>> +struct cb_devicenotifyargs {
>> + struct sockaddr *addr;
>
> No sockaddr_size parameter?
>

Actually, it can be safely removed altogether.

>> + int ndevs;
>> + struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES];
>> +};
>
> Why can't we make this dynamic at this time?
>

Will do.
How about the following patch?
(while at it I fixed the error codes in decode_devicenotify_args)

---
fs/nfs/callback.h | 5 +----
fs/nfs/callback_proc.c | 1 +
fs/nfs/callback_xdr.c | 30 +++++++++++++++---------------
3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 892128f..b257383 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -175,12 +175,9 @@ struct cb_devicenotifyitem {
uint32_t cbd_immediate;
};

-/* XXX: Should be dynamic up to max compound size */
-#define NFS4_DEV_NOTIFY_MAXENTRIES 10
struct cb_devicenotifyargs {
- struct sockaddr *addr;
int ndevs;
- struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES];
+ struct cb_devicenotifyitem *devs;
};

extern __be32 nfs4_callback_devicenotify(
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 96f35f2..964c416 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -286,6 +286,7 @@ __be32 nfs4_callback_devicenotify(struct
cb_devicenotifyargs *args,
}

out:
+ kfree(args->devs);
dprintk("%s: exit with status = %u\n",
__func__, res);
return cpu_to_be32(res);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 5ec2c12..c6c86a7 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -296,23 +296,20 @@ __be32 decode_devicenotify_args(struct svc_rqst
*rqstp,
int n, i;
args->ndevs = 0;

- args->addr = svc_addr(rqstp);
-
/* Num of device notifications */
p = read_buf(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_RESOURCE);
+ status = htonl(NFS4ERR_BADXDR);
goto out;
}
n = ntohl(*p++);
if (n <= 0)
goto out;

- /* XXX: need to possibly return error in this case */
- if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
- dprintk("%s: Processing (%d) notifications out of (%d)\n",
- __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
- n = NFS4_DEV_NOTIFY_MAXENTRIES;
+ args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+ if (!args->devs) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out;
}

/* Decode each dev notification */
@@ -321,20 +318,20 @@ __be32 decode_devicenotify_args(struct svc_rqst
*rqstp,

p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_RESOURCE);
- goto out;
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
}

tmp = ntohl(*p++); /* bitmap size */
if (tmp != 1) {
status = htonl(NFS4ERR_INVAL);
- goto out;
+ goto err;
}
dev->cbd_notify_type = ntohl(*p++);
if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
status = htonl(NFS4ERR_INVAL);
- goto out;
+ goto err;
}

tmp = ntohl(*p++); /* opaque size */
@@ -343,7 +340,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
(tmp != NFS4_DEVICEID4_SIZE + 4))) {
status = htonl(NFS4ERR_INVAL);
- goto out;
+ goto err;
}
dev->cbd_layout_type = ntohl(*p++);
memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
@@ -352,8 +349,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
p = read_buf(xdr, sizeof(uint32_t));
if (unlikely(p == NULL)) {
- status = htonl(NFS4ERR_DELAY);
- goto out;
+ status = htonl(NFS4ERR_BADXDR);
+ goto err;
}
dev->cbd_immediate = ntohl(*p++);
} else {
@@ -370,6 +367,9 @@ out:
dprintk("%s: status %d ndevs %d\n",
__func__, ntohl(status), args->ndevs);
return status;
+err:
+ kfree(args->devs);
+ goto out;
}

static __be32 decode_sessionid(struct xdr_stream *xdr,
--
1.7.3.4




2011-04-20 17:28:49

by Benny Halevy

[permalink] [raw]
Subject: [RFC 20/27] pnfs-obj: objio_osd device information retrieval and caching

From: Boaz Harrosh <[email protected]>

When a new layout is received in objio_alloc_lseg, all referenced
device_ids are retrieved. The device information is queried from the MDS
and then the osd_device is looked up in the osd-initiator library. The
devices are cached in a per-mount-point list for later use. At unmount
all devices are "put" back to the library.

Add the objlayout_get_deviceinfo() and objlayout_put_deviceinfo() middleware
API for retrieving device information given a device_id.

TODO: The device cache can get big. Cap its size. Keep an LRU and start
to return devices which were not used when the list gets too big, or
when allocation of new entries fails.

[Some extra debug-prints]
Signed-off-by: Boaz Harrosh <[email protected]>
[convert APIs pnfs-post-submit]
[apply types rename]
[convert to new pnfs-submit changes]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 176 +++++++++++++++++++++++++++++++++++++++++-
fs/nfs/objlayout/objlayout.c | 67 ++++++++++++++++
fs/nfs/objlayout/objlayout.h | 4 +
3 files changed, 246 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 4b88c0a..a24bf82 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -62,8 +62,84 @@ struct objio_mount_type {
spinlock_t dev_list_lock;
};

+struct _dev_ent {
+ struct list_head list;
+ struct nfs4_deviceid d_id;
+ struct osd_dev *od;
+};
+
+static void _dev_list_remove_all(struct objio_mount_type *omt)
+{
+ spin_lock(&omt->dev_list_lock);
+
+ while (!list_empty(&omt->dev_list)) {
+ struct _dev_ent *de = list_entry(omt->dev_list.next,
+ struct _dev_ent, list);
+
+ list_del_init(&de->list);
+ osduld_put_device(de->od);
+ kfree(de);
+ }
+
+ spin_unlock(&omt->dev_list_lock);
+}
+
+static struct osd_dev *___dev_list_find(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id)
+{
+ struct list_head *le;
+
+ list_for_each(le, &omt->dev_list) {
+ struct _dev_ent *de = list_entry(le, struct _dev_ent, list);
+
+ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id)))
+ return de->od;
+ }
+
+ return NULL;
+}
+
+static struct osd_dev *_dev_list_find(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id)
+{
+ struct osd_dev *od;
+
+ spin_lock(&omt->dev_list_lock);
+ od = ___dev_list_find(omt, d_id);
+ spin_unlock(&omt->dev_list_lock);
+ return od;
+}
+
+static int _dev_list_add(struct objio_mount_type *omt,
+ struct nfs4_deviceid *d_id, struct osd_dev *od)
+{
+ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL);
+
+ if (!de)
+ return -ENOMEM;
+
+ spin_lock(&omt->dev_list_lock);
+
+ if (___dev_list_find(omt, d_id)) {
+ kfree(de);
+ goto out;
+ }
+
+ de->d_id = *d_id;
+ de->od = od;
+ list_add(&de->list, &omt->dev_list);
+
+out:
+ spin_unlock(&omt->dev_list_lock);
+ return 0;
+}
+
struct objio_segment {
struct pnfs_osd_layout *layout;
+
+ unsigned num_comps;
+ /* variable length */
+ struct osd_dev *ods[1];
};

struct objio_state {
@@ -73,21 +149,116 @@ struct objio_state {
struct objio_segment *objio_seg;
};

+/* Send and wait for a get_device_info of devices in the layout,
+ then look them up with the osd_initiator library */
+static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg, unsigned comp)
+{
+ struct pnfs_osd_layout *layout = objio_seg->layout;
+ struct pnfs_osd_deviceaddr *deviceaddr;
+ struct nfs4_deviceid *d_id;
+ struct osd_dev *od;
+ struct osd_dev_info odi;
+ struct objio_mount_type *omt = NFS_SERVER(pnfslay->plh_inode)->pnfs_ld_data;
+ int err;
+
+ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id;
+
+ od = _dev_list_find(omt, d_id);
+ if (od)
+ return od;
+
+ err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr);
+ if (unlikely(err)) {
+ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err);
+ return ERR_PTR(err);
+ }
+
+ odi.systemid_len = deviceaddr->oda_systemid.len;
+ if (odi.systemid_len > sizeof(odi.systemid)) {
+ err = -EINVAL;
+ goto out;
+ } else if (odi.systemid_len)
+ memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+ odi.systemid_len);
+ odi.osdname_len = deviceaddr->oda_osdname.len;
+ odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
+
+ if (!odi.osdname_len && !odi.systemid_len) {
+ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+ __func__);
+ err = -ENODEV;
+ goto out;
+ }
+
+ od = osduld_info_lookup(&odi);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+ goto out;
+ }
+
+ _dev_list_add(omt, d_id, od);
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ objlayout_put_deviceinfo(deviceaddr);
+ return err ? ERR_PTR(err) : od;
+}
+
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+ struct objio_segment *objio_seg)
+{
+ struct pnfs_osd_layout *layout = objio_seg->layout;
+ unsigned i, num_comps = layout->olo_num_comps;
+ int err;
+
+ /* lookup all devices */
+ for (i = 0; i < num_comps; i++) {
+ struct osd_dev *od;
+
+ od = _device_lookup(pnfslay, objio_seg, i);
+ if (unlikely(IS_ERR(od))) {
+ err = PTR_ERR(od);
+ goto out;
+ }
+ objio_seg->ods[i] = od;
+ }
+ objio_seg->num_comps = num_comps;
+ err = 0;
+
+out:
+ dprintk("%s: return=%d\n", __func__, err);
+ return err;
+}
+
int objio_alloc_lseg(void **outp,
struct pnfs_layout_hdr *pnfslay,
struct pnfs_layout_segment *lseg,
struct pnfs_osd_layout *layout)
{
struct objio_segment *objio_seg;
+ int err;

- objio_seg = kzalloc(sizeof(*objio_seg), GFP_KERNEL);
+ objio_seg = kzalloc(sizeof(*objio_seg) +
+ (layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
+ GFP_KERNEL);
if (!objio_seg)
return -ENOMEM;

objio_seg->layout = layout;
+ err = objio_devices_lookup(pnfslay, objio_seg);
+ if (err)
+ goto free_seg;

*outp = objio_seg;
return 0;
+
+free_seg:
+ dprintk("%s: Error: return %d\n", __func__, err);
+ kfree(objio_seg);
+ *outp = NULL;
+ return err;
}

void objio_free_lseg(void *p)
@@ -171,11 +342,14 @@ void *objio_init_mt(void)
if (!omt)
return ERR_PTR(-ENOMEM);

+ INIT_LIST_HEAD(&omt->dev_list);
+ spin_lock_init(&omt->dev_list_lock);
return omt;
}

void objio_fini_mt(void *mountid)
{
+ _dev_list_remove_all(mountid);
kfree(mountid);
}

diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index ae14a24..7c4c744 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -401,6 +401,73 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
return PNFS_ATTEMPTED;
}

+struct objlayout_deviceinfo {
+ struct page *page;
+ struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+
+/* Initialize and call nfs_getdeviceinfo, then decode and return a
+ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
+ * should be called.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
+{
+ struct objlayout_deviceinfo *odi;
+ struct pnfs_device pd;
+ struct super_block *sb;
+ struct page *page, **pages;
+ size_t sz;
+ u32 *p;
+ int err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ pages = &page;
+ pd.pages = pages;
+
+ memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+ pd.layout_type = LAYOUT_OSD2_OBJECTS;
+ pd.pages = &page;
+ pd.pgbase = 0;
+ pd.pglen = PAGE_SIZE;
+ pd.mincount = 0;
+
+ sb = pnfslay->plh_inode->i_sb;
+ err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+ if (err)
+ goto err_out;
+
+ p = page_address(page);
+ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
+ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
+ if (!odi) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+ odi->page = page;
+ *deviceaddr = &odi->da;
+ return 0;
+
+err_out:
+ __free_page(page);
+ return err;
+}
+
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+ struct objlayout_deviceinfo *odi = container_of(deviceaddr,
+ struct objlayout_deviceinfo,
+ da);
+
+ __free_page(odi->page);
+ kfree(odi);
+}
+
/*
* Perform the objio specific init_mt method.
* Set the layout driver private data pointer for later use.
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 24b36d4..7a63d34 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -120,6 +120,10 @@ extern void objlayout_read_done(struct objlayout_io_state *state,
extern void objlayout_write_done(struct objlayout_io_state *state,
ssize_t status, bool sync);

+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
/*
* exported generic objects function vectors
*/
--
1.7.3.4


2011-04-20 17:27:44

by Benny Halevy

[permalink] [raw]
Subject: [RFC 11/27] pnfs: per mount layout driver private data

Signed-off-by: Benny Halevy <[email protected]>
---
include/linux/nfs_fs_sb.h | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 216cea5..c5b3fd0 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -142,6 +142,9 @@ struct nfs_server {
filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
struct rpc_wait_queue roc_rpcwaitq;
+ void *pnfs_ld_data; /* Per-mount data */
+ unsigned int ds_rsize; /* Data server read size */
+ unsigned int ds_wsize; /* Data server write size */

/* the following fields are protected by nfs_client->cl_lock */
struct rb_root state_owners;
--
1.7.3.4


2011-04-22 06:52:57

by Benny Halevy

[permalink] [raw]
Subject: Re: [RFC 03/27] pnfs: layoutreturn

On 2011-04-20 22:53, Trond Myklebust wrote:
> On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
>> @@ -1424,9 +1424,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
>> */
>> void nfs4_evict_inode(struct inode *inode)
>> {
>> - pnfs_destroy_layout(NFS_I(inode));
>> + pnfs_return_layout(inode, NULL, true);
>
> Why does this want to come before the call to truncate_inode_pages()?

Actually, I don't see any good reason. :-/

> Is there any reason not to put pnfs_return_layout() and
> pnfs_destroy_layout into a single helper here?
>

Looks like overkill to me for this one call site.

>> truncate_inode_pages(&inode->i_data, 0);
>> end_writeback(inode);
>> + pnfs_destroy_layout(NFS_I(inode));
>> /* If we are holding a delegation, return it! */
>> nfs_inode_return_delegation_noreclaim(inode);
>> /* First call standard NFS clear_inode() code */
>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
>> index 9bf41ea..b03defb 100644
>> --- a/fs/nfs/nfs4proc.c
>> +++ b/fs/nfs/nfs4proc.c
>> @@ -5662,6 +5662,103 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
>> return status;
>> }
>>
>> +static void
>> +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
>> +{
>> + struct nfs4_layoutreturn *lrp = calldata;
>> +
>> + dprintk("--> %s\n", __func__);
>> + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
>> + &lrp->res.seq_res, 0, task))
>> + return;
>> + rpc_call_start(task);
>> +}
>> +
>> +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
>> +{
>> + struct nfs4_layoutreturn *lrp = calldata;
>> + struct nfs_server *server;
>> +
>> + dprintk("--> %s\n", __func__);
>> +
>> + if (!nfs4_sequence_done(task, &lrp->res.seq_res))
>> + return;
>> +
>> + if (lrp->args.return_type == RETURN_FILE)
>> + server = NFS_SERVER(lrp->args.inode);
>> + else
>> + server = NULL;
>> + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
>> + nfs_restart_rpc(task, lrp->clp);
>> + return;
>> + }
>> + if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
>> + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
>> +
>> + spin_lock(&lo->plh_inode->i_lock);
>> + if (lrp->res.lrs_present)
>> + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
>> + else
>> + BUG_ON(!list_empty(&lo->plh_segs));
>> + spin_unlock(&lo->plh_inode->i_lock);
>> + }
>> + dprintk("<-- %s\n", __func__);
>> +}
>> +
>> +static void nfs4_layoutreturn_release(void *calldata)
>> +{
>> + struct nfs4_layoutreturn *lrp = calldata;
>> +
>> + dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
>> + if (lrp->args.return_type == RETURN_FILE) {
>> + struct inode *ino = lrp->args.inode;
>> + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
>> +
>> + put_layout_hdr(lo);
>> + }
>> + kfree(calldata);
>> + dprintk("<-- %s\n", __func__);
>> +}
>> +
>> +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
>> + .rpc_call_prepare = nfs4_layoutreturn_prepare,
>> + .rpc_call_done = nfs4_layoutreturn_done,
>> + .rpc_release = nfs4_layoutreturn_release,
>> +};
>> +
>> +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
>
> Why the 'issync' parameter?
>

Hmm, you're right. Currently all call sites use it as sync.

>> +{
>> + struct rpc_task *task;
>> + struct rpc_message msg = {
>> + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
>> + .rpc_argp = &lrp->args,
>> + .rpc_resp = &lrp->res,
>> + };
>> + struct rpc_task_setup task_setup_data = {
>> + .rpc_client = lrp->clp->cl_rpcclient,
>> + .rpc_message = &msg,
>> + .callback_ops = &nfs4_layoutreturn_call_ops,
>> + .callback_data = lrp,
>> + .flags = RPC_TASK_ASYNC,
>> + };
>> + int status = 0;
>> +
>> + dprintk("--> %s\n", __func__);
>> + task = rpc_run_task(&task_setup_data);
>> + if (IS_ERR(task))
>> + return PTR_ERR(task);
>> + if (!issync)
>> + goto out;
>> + status = nfs4_wait_for_completion_rpc_task(task);
>> + if (status != 0)
>> + goto out;
>> + status = task->tk_status;
>> +out:
>> + dprintk("<-- %s\n", __func__);
>> + rpc_put_task(task);
>> + return status;
>> +}
>> +
>> static int
>> _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
>> {
>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
>> index dddfb57..53ea3e5 100644
>> --- a/fs/nfs/nfs4xdr.c
>> +++ b/fs/nfs/nfs4xdr.c
>> @@ -338,7 +338,12 @@ static int nfs4_stat_to_errno(int);
>> 1 /* layoutupdate4 layout type */ + \
>> 1 /* NULL filelayout layoutupdate4 payload */)
>> #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
>> -
>> +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
>> + encode_stateid_maxsz + \
>> + 1 /* FIXME: opaque lrf_body always empty at
>> + *the moment */)
>> +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
>> + 1 + decode_stateid_maxsz)
>> #else /* CONFIG_NFS_V4_1 */
>> #define encode_sequence_maxsz 0
>> #define decode_sequence_maxsz 0
>> @@ -760,7 +765,14 @@ static int nfs4_stat_to_errno(int);
>> decode_putfh_maxsz + \
>> decode_layoutcommit_maxsz + \
>> decode_getattr_maxsz)
>> -
>> +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
>> + encode_sequence_maxsz + \
>> + encode_putfh_maxsz + \
>> + encode_layoutreturn_maxsz)
>> +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
>> + decode_sequence_maxsz + \
>> + decode_putfh_maxsz + \
>> + decode_layoutreturn_maxsz)
>>
>> const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
>> compound_encode_hdr_maxsz +
>> @@ -1890,6 +1902,37 @@ encode_layoutcommit(struct xdr_stream *xdr,
>> hdr->replen += decode_layoutcommit_maxsz;
>> return 0;
>> }
>> +
>> +static void
>> +encode_layoutreturn(struct xdr_stream *xdr,
>> + const struct nfs4_layoutreturn_args *args,
>> + struct compound_hdr *hdr)
>> +{
>> + nfs4_stateid stateid;
>> + __be32 *p;
>> +
>> + p = reserve_space(xdr, 20);
>> + *p++ = cpu_to_be32(OP_LAYOUTRETURN);
>> + *p++ = cpu_to_be32(args->reclaim);
>> + *p++ = cpu_to_be32(args->layout_type);
>> + *p++ = cpu_to_be32(args->range.iomode);
>> + *p = cpu_to_be32(args->return_type);
>> + if (args->return_type == RETURN_FILE) {
>> + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
>> + p = xdr_encode_hyper(p, args->range.offset);
>> + p = xdr_encode_hyper(p, args->range.length);
>> + spin_lock(&args->inode->i_lock);
>> + memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data,
>> + NFS4_STATEID_SIZE);
>> + spin_unlock(&args->inode->i_lock);
>> + p = xdr_encode_opaque_fixed(p, &stateid.data,
>> + NFS4_STATEID_SIZE);
>> + p = reserve_space(xdr, 4);
>> + *p = cpu_to_be32(0);
>> + }
>> + hdr->nops++;
>> + hdr->replen += decode_layoutreturn_maxsz;
>> +}
>> #endif /* CONFIG_NFS_V4_1 */
>>
>> /*
>> @@ -2707,9 +2750,9 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
>> /*
>> * Encode LAYOUTCOMMIT request
>> */
>> -static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
>> - struct xdr_stream *xdr,
>> - struct nfs4_layoutcommit_args *args)
>> +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
>> + struct xdr_stream *xdr,
>> + struct nfs4_layoutcommit_args *args)
>> {
>> struct compound_hdr hdr = {
>> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
>> @@ -2721,7 +2764,24 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
>> encode_layoutcommit(xdr, args, &hdr);
>> encode_getfattr(xdr, args->bitmask, &hdr);
>> encode_nops(&hdr);
>> - return 0;
>> +}
>> +
>> +/*
>> + * Encode LAYOUTRETURN request
>> + */
>> +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
>> + struct xdr_stream *xdr,
>> + struct nfs4_layoutreturn_args *args)
>> +{
>> + struct compound_hdr hdr = {
>> + .minorversion = nfs4_xdr_minorversion(&args->seq_args),
>> + };
>> +
>> + encode_compound_hdr(xdr, req, &hdr);
>> + encode_sequence(xdr, &args->seq_args, &hdr);
>> + encode_putfh(xdr, NFS_FH(args->inode), &hdr);
>> + encode_layoutreturn(xdr, args, &hdr);
>> + encode_nops(&hdr);
>> }
>> #endif /* CONFIG_NFS_V4_1 */
>>
>> @@ -5202,6 +5262,27 @@ out_overflow:
>> return -EIO;
>> }
>>
>> +static int decode_layoutreturn(struct xdr_stream *xdr,
>> + struct nfs4_layoutreturn_res *res)
>> +{
>> + __be32 *p;
>> + int status;
>> +
>> + status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
>> + if (status)
>> + return status;
>> + p = xdr_inline_decode(xdr, 4);
>> + if (unlikely(!p))
>> + goto out_overflow;
>> + res->lrs_present = be32_to_cpup(p);
>> + if (res->lrs_present)
>> + status = decode_stateid(xdr, &res->stateid);
>> + return status;
>> +out_overflow:
>> + print_overflow_msg(__func__, xdr);
>> + return -EIO;
>> +}
>> +
>> static int decode_layoutcommit(struct xdr_stream *xdr,
>> struct rpc_rqst *req,
>> struct nfs4_layoutcommit_res *res)
>> @@ -6319,6 +6400,30 @@ out:
>> }
>>
>> /*
>> + * Decode LAYOUTRETURN response
>> + */
>> +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
>> + struct xdr_stream *xdr,
>> + struct nfs4_layoutreturn_res *res)
>> +{
>> + struct compound_hdr hdr;
>> + int status;
>> +
>> + status = decode_compound_hdr(xdr, &hdr);
>> + if (status)
>> + goto out;
>> + status = decode_sequence(xdr, &res->seq_res, rqstp);
>> + if (status)
>> + goto out;
>> + status = decode_putfh(xdr);
>> + if (status)
>> + goto out;
>> + status = decode_layoutreturn(xdr, res);
>> +out:
>> + return status;
>> +}
>> +
>> +/*
>> * Decode LAYOUTCOMMIT response
>> */
>> static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
>> @@ -6544,6 +6649,7 @@ struct rpc_procinfo nfs4_procedures[] = {
>> PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
>> PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
>> PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
>> + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
>> #endif /* CONFIG_NFS_V4_1 */
>> };
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index d9ab972..89e7725 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -321,6 +321,36 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>> return invalid - removed;
>> }
>>
>> +/* Returns false if there was nothing to do, true otherwise */
>> +static bool
>> +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
>> + struct pnfs_layout_range *range)
>> +{
>> + struct pnfs_layout_segment *lseg, *next;
>> + bool rv = false;
>> +
>> + dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
>> + __func__, lo, range->offset, range->length, range->iomode);
>> + assert_spin_locked(&lo->plh_inode->i_lock);
>
> Not needed in the case of nfs4_evict_inode.
>

True. I can remove the assert,
but the lock had better be taken if called while the inode is alive.

>> + if (list_empty(&lo->plh_segs)) {
>> + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
>> + put_layout_hdr_locked(lo);
>> + return 0;
>> + }
>> + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
>> + if (should_free_lseg(lseg->pls_range.iomode, range->iomode)) {
>> + dprintk("%s: freeing lseg %p iomode %d "
>> + "offset %llu length %llu\n", __func__,
>> + lseg, lseg->pls_range.iomode,
>> + lseg->pls_range.offset,
>> + lseg->pls_range.length);
>> + mark_lseg_invalid(lseg, tmp_list);
>> + rv = true;
>> + }
>> + dprintk("%s:Return %d\n", __func__, rv);
>> + return rv;
>> +}
>> +
>> /* note free_me must contain lsegs from a single layout_hdr */
>> void
>> pnfs_free_lseg_list(struct list_head *free_me)
>> @@ -539,6 +569,72 @@ out_err_free:
>> return NULL;
>> }
>>
>> +static int
>> +return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
>> +{
>> + struct nfs4_layoutreturn *lrp;
>> + struct nfs_server *server = NFS_SERVER(ino);
>> + int status = -ENOMEM;
>> +
>> + dprintk("--> %s\n", __func__);
>> +
>> + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
>> + if (lrp == NULL) {
>> + put_layout_hdr(NFS_I(ino)->layout);
>> + goto out;
>> + }
>> + lrp->args.reclaim = 0;
>> + lrp->args.layout_type = server->pnfs_curr_ld->id;
>> + lrp->args.return_type = RETURN_FILE;
>> + lrp->args.range = *range;
>> + lrp->args.inode = ino;
>> + lrp->clp = server->nfs_client;
>> +
>> + status = nfs4_proc_layoutreturn(lrp, wait);
>> +out:
>> + dprintk("<-- %s status: %d\n", __func__, status);
>> + return status;
>> +}
>> +
>> +/* Initiates a LAYOUTRETURN(FILE) */
>> +int
>> +_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
>
> What are the 'range' and 'wait' parameters for? We don't have any users
> other than nfs4_evict_inode.
>

OK, I'll introduce these with their usage.

>> +{
>> + struct pnfs_layout_hdr *lo = NULL;
>> + struct nfs_inode *nfsi = NFS_I(ino);
>> + struct pnfs_layout_range arg;
>> + LIST_HEAD(tmp_list);
>> + int status = 0;
>> +
>> + dprintk("--> %s\n", __func__);
>> +
>> + arg.iomode = range ? range->iomode : IOMODE_ANY;
>> + arg.offset = 0;
>> + arg.length = NFS4_MAX_UINT64;
>> +
>> + spin_lock(&ino->i_lock);
>> + lo = nfsi->layout;
>> + if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
>> + spin_unlock(&ino->i_lock);
>> + dprintk("%s: no layout segments to return\n", __func__);
>> + goto out;
>> + }
>> + /* Reference matched in nfs4_layoutreturn_release */
>> + get_layout_hdr(lo);
>> + spin_unlock(&ino->i_lock);
>> + pnfs_free_lseg_list(&tmp_list);
>> +
>> + /* Return layout even if layoutcommit fails */
>> + status = pnfs_layoutcommit_inode(ino, wait);
>
> Why is this needed? Again, by the time we get to nfs4_evict_inode, the
> inode is guaranteed to be clean.
>

OK, let me just add a WARN_ON if we get here and layout commit is
required. Does that work for you?

>> + if (status)
>> + dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
>> + __func__, status);
>> + status = return_layout(ino, &arg, wait);
>> +out:
>> + dprintk("<-- %s status: %d\n", __func__, status);
>> + return status;
>> +}
>> +
>> bool pnfs_roc(struct inode *ino)
>> {
>> struct pnfs_layout_hdr *lo;
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 4cb0a0d..a308f3c 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -123,6 +123,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>> extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
>> struct pnfs_device *dev);
>> extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
>> +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
>>
>> /* pnfs.c */
>> void get_layout_hdr(struct pnfs_layout_hdr *lo);
>> @@ -158,6 +159,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
>> bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
>> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
>> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
>> +int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
>>
>> static inline int lo_fail_bit(u32 iomode)
>> {
>> @@ -226,6 +228,19 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
>> put_lseg(req->wb_commit_lseg);
>> }
>>
>> +static inline int pnfs_return_layout(struct inode *ino,
>> + struct pnfs_layout_range *range,
>> + bool wait)
>> +{
>> + struct nfs_inode *nfsi = NFS_I(ino);
>> + struct nfs_server *nfss = NFS_SERVER(ino);
>> +
>> + if (pnfs_enabled_sb(nfss) && nfsi->layout)
>> + return _pnfs_return_layout(ino, range, wait);
>> +
>> + return 0;
>> +}
>> +
>> #else /* CONFIG_NFS_V4_1 */
>>
>> static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
>> @@ -267,6 +282,13 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
>> return PNFS_NOT_ATTEMPTED;
>> }
>>
>> +static inline int pnfs_return_layout(struct inode *ino,
>> + struct pnfs_layout_range *range,
>> + bool wait)
>> +{
>> + return 0;
>> +}
>> +
>> static inline bool
>> pnfs_roc(struct inode *ino)
>> {
>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>> index 178fafe..9376eaf 100644
>> --- a/include/linux/nfs4.h
>> +++ b/include/linux/nfs4.h
>> @@ -562,6 +562,7 @@ enum {
>> NFSPROC4_CLNT_LAYOUTGET,
>> NFSPROC4_CLNT_GETDEVICEINFO,
>> NFSPROC4_CLNT_LAYOUTCOMMIT,
>> + NFSPROC4_CLNT_LAYOUTRETURN,
>> };
>>
>> /* nfs41 types */
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index 78b101e..455ddfb 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -266,6 +266,29 @@ struct nfs4_layoutcommit_data {
>> struct nfs4_layoutcommit_res res;
>> };
>>
>> +struct nfs4_layoutreturn_args {
>> + __u32 reclaim;
>> + __u32 layout_type;
>> + __u32 return_type;
>
> Why do we need a 'return_type'? As far as I can see, this will always be
> RETURN_FILE.
>

Yeah, we can remove this for now.

Benny

>> + struct pnfs_layout_range range;
>> + struct inode *inode;
>> + struct nfs4_sequence_args seq_args;
>> +};
>> +
>> +struct nfs4_layoutreturn_res {
>> + struct nfs4_sequence_res seq_res;
>> + u32 lrs_present;
>> + nfs4_stateid stateid;
>> +};
>> +
>> +struct nfs4_layoutreturn {
>> + struct nfs4_layoutreturn_args args;
>> + struct nfs4_layoutreturn_res res;
>> + struct rpc_cred *cred;
>> + struct nfs_client *clp;
>> + int rpc_status;
>> +};
>> +
>> /*
>> * Arguments to the open call.
>> */
>


2011-04-20 17:27:58

by Benny Halevy

[permalink] [raw]
Subject: [RFC 13/27] pnfs: client stats

From: J. Bruce Fields <[email protected]>

A pNFS client auto-negotiates a lot of features (minorversion level,
pNFS layout type, etc.). This is convenient, but makes certain kinds of
failures hard for a user to detect.

For example, if the client falls back on 4.0, or falls back to MDS IO
because the user didn't connect to the right iscsi disks before
mounting, the only symptoms may be reduced performance, which may not be
noticed till long after the actual failure, and may be difficult for a
user to diagnose.

However, such "failures" may also be perfectly normal in some cases, so
we don't want to spam the system logs with them.

One approach would be to put some more information into
/proc/self/mountstats.

Signed-off-by: J. Bruce Fields <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfs: add commit client stats]
[fixup data types for "ret" variables in pnfs_try_to* inline funcs.]
Signed-off-by: Benny Halevy <[email protected]>
[fix definition of show_pnfs for !CONFIG_PNFS]
Signed-off-by: Benny Halevy <[email protected]>
[nfs41: Fix show_sessions in the not CONFIG_NFS_V4_1 case]
There is a build error when CONFIG_NFS_V4 is set but
CONFIG_NFS_V4_1 is *not* set. show_sessions() prototype
was unbalanced between the two cases.
Signed-off-by: Boaz Harrosh <[email protected]>
[pnfs: super.c remove CONFIG_PNFS]
Signed-off-by: Andy Adamson <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/super.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2b8e9a5..50f3987 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "pnfs.h"

#define NFSDBG_FACILITY NFSDBG_VFS

@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)

return 0;
}
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+ if (nfs4_has_session(server->nfs_client))
+ seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+ seq_printf(m, ",pnfs=");
+ if (server->pnfs_curr_ld)
+ seq_printf(m, "%s", server->pnfs_curr_ld->name);
+ else
+ seq_printf(m, "not configured");
+}
+#else /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */

static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+ show_sessions(m, nfss);
+ show_pnfs(m, nfss);
}
#endif

--
1.7.3.4


2011-04-22 08:04:46

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 2/6] SQUASHME: remove assert_spin_locked from pnfs_clear_lseg_list

Currently it is called only from the evict_inode path,
where no lock is needed.

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/pnfs.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0237b2c..bacde63 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -410,7 +410,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return invalid - removed;
}

-/* Returns false if there was nothing to do, true otherwise */
+/*
+ * Returns false if there was nothing to do, true otherwise.
+ *
+ * Must be called under the i_lock (unless from the nfs4_evict_inode path)
+ */
static bool
pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
struct pnfs_layout_range *range)
@@ -420,7 +424,6 @@ pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,

dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
__func__, lo, range->offset, range->length, range->iomode);
- assert_spin_locked(&lo->plh_inode->i_lock);
if (list_empty(&lo->plh_segs)) {
if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
put_layout_hdr_locked(lo);
--
1.7.3.4


2011-04-22 08:04:25

by Benny Halevy

[permalink] [raw]
Subject: [PATCH 1/6] SQUASHME: call pnfs_return_layout right before pnfs_destroy_layout

squash into "pnfs: layoutreturn"

Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/inode.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 73a2529..9a48d1c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1424,9 +1424,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
void nfs4_evict_inode(struct inode *inode)
{
- pnfs_return_layout(inode, NULL, true);
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
+ pnfs_return_layout(inode, NULL);
pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
--
1.7.3.4


2011-04-20 17:29:24

by Benny Halevy

[permalink] [raw]
Subject: [RFC 25/27] pnfs-obj: objlayout_encode_layoutcommit implementation

From: Boaz Harrosh <[email protected]>

* Define API for io-engines to report delta_space_used in IOs
* Encode the osd-layout specific information of the layoutcommit
XDR buffer.

Signed-off-by: Boaz Harrosh <[email protected]>
[check for OBJ_DSU_INVALID in objlayout_add_delta_space_used under lock]
[use new alloc/free_layout API]
[apply types rename]
[convert to new pnfs-submit changes]
[fixup encode_layoutcommit arguments]
[fixup layoutcommit methods args]
[use pnfs_layout_hdr and layout_segment field prefix]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/objio_osd.c | 1 +
fs/nfs/objlayout/objlayout.c | 30 ++++++++++++++++++++++++++++++
fs/nfs/objlayout/objlayout.h | 25 +++++++++++++++++++++++++
3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 027ba38..179dfbd 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -755,6 +755,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.write_pagelist = objlayout_write_pagelist,

.encode_layoutreturn = objlayout_encode_layoutreturn,
+ .encode_layoutcommit = objlayout_encode_layoutcommit,
};

void *objio_init_mt(void)
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 322ffa3..2cf139c 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -247,6 +247,7 @@ objlayout_iodone(struct objlayout_io_state *state)
struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout);

spin_lock(&objlay->lock);
+ objlay->delta_space_valid = OBJ_DSU_INVALID;
list_add(&objlay->err_list, &state->err_list);
spin_unlock(&objlay->lock);
}
@@ -455,6 +456,35 @@ objlayout_write_pagelist(struct nfs_write_data *wdata,
return PNFS_ATTEMPTED;
}

+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *args)
+{
+ struct objlayout *objlay = OBJLAYOUT(pnfslay);
+ struct pnfs_osd_layoutupdate lou;
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+
+ spin_lock(&objlay->lock);
+ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+ lou.dsu_delta = objlay->delta_space_used;
+ objlay->delta_space_used = 0;
+ objlay->delta_space_valid = OBJ_DSU_INIT;
+ lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+ spin_unlock(&objlay->lock);
+
+ start = xdr_reserve_space(xdr, 4);
+
+ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+
+ dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+ lou.dsu_delta, lou.olu_ioerr_flag);
+}
+
static int
err_prio(u32 oer_errno)
{
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 65f8d44..fb0bf93 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -62,6 +62,14 @@ struct objlayout_segment {
struct objlayout {
struct pnfs_layout_hdr pnfs_layout;

+ /* for layout_commit */
+ enum osd_delta_space_valid_enum {
+ OBJ_DSU_INIT = 0,
+ OBJ_DSU_VALID,
+ OBJ_DSU_INVALID,
+ } delta_space_valid;
+ s64 delta_space_used; /* consumed by write ops */
+
/* for layout_return */
spinlock_t lock;
struct list_head err_list;
@@ -129,6 +137,23 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state,
unsigned index, int osd_error,
u64 offset, u64 length, bool is_write);

+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+ struct objlayout *objlay = OBJLAYOUT(state->objlseg->lseg.pls_layout);
+
+ /* If one of the I/Os errored out and the delta_space_used was
+ * invalid, we render the complete report as invalid. The protocol
+ * mandates that the DSU be accurate or not reported.
+ */
+ spin_lock(&objlay->lock);
+ if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+ objlay->delta_space_valid = OBJ_DSU_VALID;
+ objlay->delta_space_used += space_used;
+ }
+ spin_unlock(&objlay->lock);
+}
+
extern void objlayout_read_done(struct objlayout_io_state *state,
ssize_t status, bool sync);
extern void objlayout_write_done(struct objlayout_io_state *state,
--
1.7.3.4


2011-04-20 17:28:42

by Benny Halevy

[permalink] [raw]
Subject: [RFC 19/27] pnfs-obj: objlayout driver skeleton

* Add the objlayout middleware and initial driver skeleton.
* Establish API with IO engine.
* Add an empty objio_osd IO engine that does nothing and just requests
IO through the MDS.

At this stage the layoutdriver is loadable and registers with the pnfs
client to provide osd layouts. But it does not actually do anything.

[Some extra debug-prints]
[move objlayout to the PNFS_LD dprint channel]
[bug in last conversion to embedded pnfs_layout_segment]
Signed-off-by: Boaz Harrosh <[email protected]>
[convert APIs pnfs-post-submit]
[get rid of threshold policy ops]
[get rid of PNFS_LAYOUTGET_ON_OPEN]
[use new alloc/free_layout API]
[use new commit api]
[use new read_pagelist api]
[use new write_pagelist api]
[apply types rename]
[convert to new pnfs-submit changes]
[deprecate get_stripesize]
[use {set,clear}_layoutdriver]
[use pnfs_layout_hdr and layout_segment field prefix]
[deprecate get_blocksize, commit methods]
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/objlayout/Kbuild | 2 +-
fs/nfs/objlayout/objio_osd.c | 210 ++++++++++++++++++++
fs/nfs/objlayout/objlayout.c | 435 ++++++++++++++++++++++++++++++++++++++++++
fs/nfs/objlayout/objlayout.h | 155 +++++++++++++++
include/linux/nfs4.h | 2 +
5 files changed, 803 insertions(+), 1 deletions(-)
create mode 100644 fs/nfs/objlayout/objio_osd.c
create mode 100644 fs/nfs/objlayout/objlayout.c
create mode 100644 fs/nfs/objlayout/objlayout.h

diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
index c326738..a45339c 100644
--- a/fs/nfs/objlayout/Kbuild
+++ b/fs/nfs/objlayout/Kbuild
@@ -1,5 +1,5 @@
#
# Makefile for the pNFS Objects Layout Driver kernel module
#
-objlayoutdriver-y := pnfs_osd_xdr_cli.o
+objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 0000000..4b88c0a
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,210 @@
+/*
+ * objio_osd.c
+ *
+ * pNFS Objects layout implementation over open-osd initiator library
+ *
+ * Copyright (C) 2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <[email protected]>
+ * Boaz Harrosh <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <scsi/scsi_device.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+#include <scsi/osd_sense.h>
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define _LLU(x) ((unsigned long long)x)
+
+enum { BIO_MAX_PAGES_KMALLOC =
+ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+
+/* A per mountpoint struct currently for device cache */
+struct objio_mount_type {
+ struct list_head dev_list;
+ spinlock_t dev_list_lock;
+};
+
+struct objio_segment {
+ struct pnfs_osd_layout *layout;
+};
+
+struct objio_state {
+ /* Generic layer */
+ struct objlayout_io_state ol_state;
+
+ struct objio_segment *objio_seg;
+};
+
+int objio_alloc_lseg(void **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_osd_layout *layout)
+{
+ struct objio_segment *objio_seg;
+
+ objio_seg = kzalloc(sizeof(*objio_seg), GFP_KERNEL);
+ if (!objio_seg)
+ return -ENOMEM;
+
+ objio_seg->layout = layout;
+
+ *outp = objio_seg;
+ return 0;
+}
+
+void objio_free_lseg(void *p)
+{
+ struct objio_segment *objio_seg = p;
+
+ kfree(objio_seg);
+}
+
+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
+{
+ struct objio_segment *objio_seg = seg;
+ struct objio_state *ios;
+
+ ios = kzalloc(sizeof(*ios), GFP_KERNEL);
+ if (unlikely(!ios))
+ return -ENOMEM;
+
+ ios->objio_seg = objio_seg;
+
+ *outp = &ios->ol_state;
+ return 0;
+}
+
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+ struct objio_state *ios = container_of(ol_state, struct objio_state,
+ ol_state);
+
+ kfree(ios);
+}
+
+/*
+ * read
+ */
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+ return -EIO;
+}
+
+/*
+ * write
+ */
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+ return -EIO;
+}
+
+/*
+ * Policy Operations
+ */
+
+/*
+ * Don't gather across stripes, but rather gather (coalesce) up to
+ * the stripe size.
+ *
+ * FIXME: change interface to use merge_align, merge_count
+ */
+static struct pnfs_layoutdriver_type objlayout_type = {
+ .id = LAYOUT_OSD2_OBJECTS,
+ .name = "LAYOUT_OSD2_OBJECTS",
+ .flags = PNFS_LAYOUTRET_ON_SETATTR,
+
+ .set_layoutdriver = objlayout_set_layoutdriver,
+ .unset_layoutdriver = objlayout_unset_layoutdriver,
+
+ .alloc_layout_hdr = objlayout_alloc_layout_hdr,
+ .free_layout_hdr = objlayout_free_layout_hdr,
+
+ .alloc_lseg = objlayout_alloc_lseg,
+ .free_lseg = objlayout_free_lseg,
+
+ .read_pagelist = objlayout_read_pagelist,
+ .write_pagelist = objlayout_write_pagelist,
+};
+
+void *objio_init_mt(void)
+{
+ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
+
+ if (!omt)
+ return ERR_PTR(-ENOMEM);
+
+ return omt;
+}
+
+void objio_fini_mt(void *mountid)
+{
+ kfree(mountid);
+}
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <[email protected]>");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+ int ret = pnfs_register_layoutdriver(&objlayout_type);
+
+ if (ret)
+ printk(KERN_INFO
+ "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+ __func__, ret);
+ else
+ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+ __func__);
+ return ret;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+ pnfs_unregister_layoutdriver(&objlayout_type);
+ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+ __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 0000000..ae14a24
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,435 @@
+/*
+ * objlayout.c
+ *
+ * pNFS layout driver for Panasas OSDs
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <[email protected]>
+ * Boaz Harrosh <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "objlayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct pnfs_client_operations *pnfs_client_ops;
+
+/*
+ * Create an objlayout layout structure for the given inode and return it.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode)
+{
+ struct objlayout *objlay;
+
+ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL);
+ dprintk("%s: Return %p\n", __func__, objlay);
+ return &objlay->pnfs_layout;
+}
+
+/*
+ * Free an objlayout layout structure
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct objlayout *objlay = OBJLAYOUT(lo);
+
+ dprintk("%s: objlay %p\n", __func__, objlay);
+
+ kfree(objlay);
+}
+
+/*
+ * Unmarshall layout and store it in pnfslay.
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+ struct nfs4_layoutget_res *lgr)
+{
+ int status = -ENOMEM;
+ struct xdr_stream stream;
+ struct xdr_buf buf = {
+ .pages = lgr->layoutp->pages,
+ .page_len = lgr->layoutp->len,
+ .buflen = lgr->layoutp->len,
+ .len = lgr->layoutp->len,
+ };
+ struct page *scratch;
+ __be32 *p;
+ struct objlayout_segment *objlseg = NULL;
+ struct pnfs_osd_layout *pnfs_osd_layout;
+
+ dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (!scratch)
+ goto err_nofree;
+
+ xdr_init_decode(&stream, &buf, NULL);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ /* FIXME: presuming whole layout fits in the first page,
+ need to use the xdr decoding helpers rather than the READXX macros */
+ p = xdr_inline_decode(&stream, pnfs_osd_data_map_xdr_sz() << 2);
+ if (unlikely(!p))
+ goto err;
+
+ objlseg = kzalloc(sizeof(*objlseg) +
+ pnfs_osd_layout_incore_sz(p), GFP_KERNEL);
+ if (!objlseg)
+ goto err;
+
+ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
+ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, p);
+
+ objlseg->lseg.pls_range = lgr->range;
+ status = objio_alloc_lseg(&objlseg->internal, pnfslay, &objlseg->lseg,
+ pnfs_osd_layout);
+ if (status)
+ goto err;
+
+ __free_page(scratch);
+
+ dprintk("%s: Return %p\n", __func__, &objlseg->lseg);
+ return &objlseg->lseg;
+
+err:
+ kfree(objlseg);
+ __free_page(scratch);
+err_nofree:
+ return ERR_PTR(status);
+}
+
+/*
+ * Free a layout segment
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct objlayout_segment *objlseg;
+
+ dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+
+ if (unlikely(!lseg))
+ return;
+
+ objlseg = container_of(lseg, struct objlayout_segment, lseg);
+ objio_free_lseg(objlseg->internal);
+ kfree(objlseg);
+}
+
+/*
+ * I/O Operations
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ BUG_ON(!len);
+ end = start + len;
+ return end > start ? end - 1 : NFS4_MAX_UINT64;
+}
+
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+ struct page **pages,
+ unsigned pgbase,
+ loff_t offset,
+ size_t count,
+ struct pnfs_layout_segment *lseg,
+ void *rpcdata)
+{
+ struct objlayout_segment *objlseg =
+ container_of(lseg, struct objlayout_segment, lseg);
+ struct objlayout_io_state *state;
+ u64 lseg_end_offset;
+
+ dprintk("%s: allocating io_state\n", __func__);
+ if (objio_alloc_io_state(objlseg->internal, &state))
+ return NULL;
+
+ BUG_ON(offset < lseg->pls_range.offset);
+ lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length);
+ BUG_ON(offset >= lseg_end_offset);
+ if (offset + count > lseg_end_offset) {
+ count = lseg->pls_range.length - (offset - lseg->pls_range.offset);
+ dprintk("%s: truncated count %Zd\n", __func__, count);
+ }
+
+ if (pgbase > PAGE_SIZE) {
+ pages += pgbase >> PAGE_SHIFT;
+ pgbase &= ~PAGE_MASK;
+ }
+
+ state->objlseg = objlseg;
+ state->rpcdata = rpcdata;
+ state->pages = pages;
+ state->pgbase = pgbase;
+ state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ state->offset = offset;
+ state->count = count;
+ state->sync = 0;
+
+ return state;
+}
+
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+ dprintk("%s: freeing io_state\n", __func__);
+ if (unlikely(!state))
+ return;
+
+ objio_free_io_state(state);
+}
+
+/*
+ * I/O done common code
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+ dprintk("%s: state %p status\n", __func__, state);
+
+ objlayout_free_io_state(state);
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+
+ pnfs_read_done(rdata);
+}
+
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+ int eof = state->eof;
+ struct nfs_read_data *rdata;
+
+ state->status = status;
+ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
+ rdata = state->rpcdata;
+ rdata->task.tk_status = status;
+ if (status >= 0) {
+ rdata->res.count = status;
+ rdata->res.eof = eof;
+ }
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_read_done(rdata);
+ else {
+ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+ schedule_work(&rdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async reads.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+ loff_t offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct objlayout_io_state *state;
+ ssize_t status = 0;
+ loff_t eof;
+
+ dprintk("%s: Begin inode %p offset %llu count %d\n",
+ __func__, rdata->inode, offset, (int)count);
+
+ eof = i_size_read(rdata->inode);
+ if (unlikely(offset + count > eof)) {
+ if (offset >= eof) {
+ status = 0;
+ rdata->res.count = 0;
+ rdata->res.eof = 1;
+ goto out;
+ }
+ count = eof - offset;
+ }
+
+ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+ rdata->args.pages, rdata->args.pgbase,
+ offset, count,
+ rdata->lseg, rdata);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->eof = state->offset + state->count >= eof;
+
+ status = objio_read_pagelist(state);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ rdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
+ * This is because the osd completion is called with ints-off from
+ * the block layer
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+
+ pnfs_write_done(wdata);
+}
+
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+ bool sync)
+{
+ struct nfs_write_data *wdata;
+
+ dprintk("%s: Begin\n", __func__);
+ wdata = state->rpcdata;
+ state->status = status;
+ wdata->task.tk_status = status;
+ if (status >= 0) {
+ wdata->res.count = status;
+ wdata->verf.committed = state->committed;
+ dprintk("%s: Return status %d committed %d\n",
+ __func__, wdata->task.tk_status,
+ wdata->verf.committed);
+ } else
+ dprintk("%s: Return status %d\n",
+ __func__, wdata->task.tk_status);
+ objlayout_iodone(state);
+ /* must not use state after this point */
+
+ if (sync)
+ pnfs_write_done(wdata);
+ else {
+ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+ schedule_work(&wdata->task.u.tk_work);
+ }
+}
+
+/*
+ * Perform sync or async writes.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+ int how)
+{
+ struct objlayout_io_state *state;
+ ssize_t status;
+
+ dprintk("%s: Begin inode %p offset %llu count %u\n",
+ __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+
+ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+ wdata->args.pages,
+ wdata->args.pgbase,
+ wdata->args.offset,
+ wdata->args.count,
+ wdata->lseg, wdata);
+ if (unlikely(!state)) {
+ status = -ENOMEM;
+ goto out;
+ }
+
+ state->sync = how & FLUSH_SYNC;
+
+ status = objio_write_pagelist(state, how & FLUSH_STABLE);
+ out:
+ dprintk("%s: Return status %Zd\n", __func__, status);
+ wdata->pnfs_error = status;
+ return PNFS_ATTEMPTED;
+}
+
+/*
+ * Perform the objio specific init_mt method.
+ * Set the layout driver private data pointer for later use.
+ */
+int
+objlayout_set_layoutdriver(struct nfs_server *server)
+{
+ void *data;
+
+ data = objio_init_mt();
+ if (IS_ERR(data)) {
+ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
+ __func__, PTR_ERR(data));
+ return PTR_ERR(data);
+ }
+ server->pnfs_ld_data = data;
+
+ dprintk("%s: Return data=%p\n", __func__, data);
+ return 0;
+}
+
+/*
+ * Perform the objio specific fini_mt method to release the
+ * layoutdriver private data.
+ */
+int
+objlayout_unset_layoutdriver(struct nfs_server *server)
+{
+ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
+ objio_fini_mt(server->pnfs_ld_data);
+ return 0;
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 0000000..24b36d4
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,155 @@
+/*
+ * objlayout.h
+ *
+ * Data types and function declarations for interfacing with the
+ * pNFS standard object layout driver.
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy <[email protected]>
+ * Boaz Harrosh <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+
+/*
+ * in-core layout segment
+ */
+struct objlayout_segment {
+ struct pnfs_layout_segment lseg;
+ void *internal; /* for provider internal use */
+ u8 pnfs_osd_layout[];
+};
+
+/*
+ * per-inode layout
+ */
+struct objlayout {
+ struct pnfs_layout_hdr pnfs_layout;
+};
+
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct objlayout, pnfs_layout);
+}
+
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ */
+struct objlayout_io_state {
+ struct objlayout_segment *objlseg;
+
+ struct page **pages;
+ unsigned pgbase;
+ unsigned nr_pages;
+ unsigned long count;
+ loff_t offset;
+ bool sync;
+
+ void *rpcdata;
+ int status; /* res */
+ int eof; /* res */
+ int committed; /* res */
+};
+
+/*
+ * Raid engine I/O API
+ */
+extern void *objio_init_mt(void);
+extern void objio_fini_mt(void *mt);
+
+extern int objio_alloc_lseg(void **outp,
+ struct pnfs_layout_hdr *pnfslay,
+ struct pnfs_layout_segment *lseg,
+ struct pnfs_osd_layout *layout);
+extern void objio_free_lseg(void *p);
+
+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+ bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+ unsigned index, int osd_error,
+ u64 offset, u64 length, bool is_write);
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+ ssize_t status, bool sync);
+
+/*
+ * exported generic objects function vectors
+ */
+
+extern int objlayout_set_layoutdriver(struct nfs_server *);
+extern int objlayout_unset_layoutdriver(struct nfs_server *);
+
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+ struct pnfs_layout_hdr *,
+ struct nfs4_layoutget_res *);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+
+extern enum pnfs_try_status objlayout_read_pagelist(
+ struct nfs_read_data *);
+
+extern enum pnfs_try_status objlayout_write_pagelist(
+ struct nfs_write_data *,
+ int how);
+
+extern void objlayout_encode_layoutcommit(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutcommit_args *);
+
+extern void objlayout_encode_layoutreturn(
+ struct pnfs_layout_hdr *,
+ struct xdr_stream *,
+ const struct nfs4_layoutreturn_args *);
+
+#endif /* _OBJLAYOUT_H */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 9376eaf..54556ca6 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -585,6 +585,8 @@ enum pnfs_layouttype {
LAYOUT_NFSV4_1_FILES = 1,
LAYOUT_OSD2_OBJECTS = 2,
LAYOUT_BLOCK_VOLUME = 3,
+
+ NFS4_PNFS_PRIVATE_LAYOUT = 0x80000000
};

/* used for both layout return and recall */
--
1.7.3.4


2011-04-20 19:53:59

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 03/27] pnfs: layoutreturn

On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
> Signed-off-by: Alexandros Batsakis <[email protected]>
> Signed-off-by: Andy Adamson <[email protected]>
> Signed-off-by: Andy Adamson <[email protected]>
> Signed-off-by: Dean Hildebrand <[email protected]>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Marc Eshel <[email protected]>
> Signed-off-by: Zhang Jingwang <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/inode.c | 3 +-
> fs/nfs/nfs4proc.c | 97 ++++++++++++++++++++++++++++++++++++++
> fs/nfs/nfs4xdr.c | 118 ++++++++++++++++++++++++++++++++++++++++++++--
> fs/nfs/pnfs.c | 96 ++++++++++++++++++++++++++++++++++++++
> fs/nfs/pnfs.h | 22 +++++++++
> include/linux/nfs4.h | 1 +
> include/linux/nfs_xdr.h | 23 +++++++++
> 7 files changed, 353 insertions(+), 7 deletions(-)
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 57bb31a..73a2529 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -1424,9 +1424,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
> */
> void nfs4_evict_inode(struct inode *inode)
> {
> - pnfs_destroy_layout(NFS_I(inode));
> + pnfs_return_layout(inode, NULL, true);

Why does this want to come before the call to truncate_inode_pages()?
Is there any reason not to put pnfs_return_layout() and
pnfs_destroy_layout into a single helper here?

> truncate_inode_pages(&inode->i_data, 0);
> end_writeback(inode);
> + pnfs_destroy_layout(NFS_I(inode));
> /* If we are holding a delegation, return it! */
> nfs_inode_return_delegation_noreclaim(inode);
> /* First call standard NFS clear_inode() code */
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 9bf41ea..b03defb 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5662,6 +5662,103 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
> return status;
> }
>
> +static void
> +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
> +{
> + struct nfs4_layoutreturn *lrp = calldata;
> +
> + dprintk("--> %s\n", __func__);
> + if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
> + &lrp->res.seq_res, 0, task))
> + return;
> + rpc_call_start(task);
> +}
> +
> +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
> +{
> + struct nfs4_layoutreturn *lrp = calldata;
> + struct nfs_server *server;
> +
> + dprintk("--> %s\n", __func__);
> +
> + if (!nfs4_sequence_done(task, &lrp->res.seq_res))
> + return;
> +
> + if (lrp->args.return_type == RETURN_FILE)
> + server = NFS_SERVER(lrp->args.inode);
> + else
> + server = NULL;
> + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
> + nfs_restart_rpc(task, lrp->clp);
> + return;
> + }
> + if ((task->tk_status == 0) && (lrp->args.return_type == RETURN_FILE)) {
> + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
> +
> + spin_lock(&lo->plh_inode->i_lock);
> + if (lrp->res.lrs_present)
> + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
> + else
> + BUG_ON(!list_empty(&lo->plh_segs));
> + spin_unlock(&lo->plh_inode->i_lock);
> + }
> + dprintk("<-- %s\n", __func__);
> +}
> +
> +static void nfs4_layoutreturn_release(void *calldata)
> +{
> + struct nfs4_layoutreturn *lrp = calldata;
> +
> + dprintk("--> %s return_type %d\n", __func__, lrp->args.return_type);
> + if (lrp->args.return_type == RETURN_FILE) {
> + struct inode *ino = lrp->args.inode;
> + struct pnfs_layout_hdr *lo = NFS_I(ino)->layout;
> +
> + put_layout_hdr(lo);
> + }
> + kfree(calldata);
> + dprintk("<-- %s\n", __func__);
> +}
> +
> +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
> + .rpc_call_prepare = nfs4_layoutreturn_prepare,
> + .rpc_call_done = nfs4_layoutreturn_done,
> + .rpc_release = nfs4_layoutreturn_release,
> +};
> +
> +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)

Why the 'issync' parameter?

> +{
> + struct rpc_task *task;
> + struct rpc_message msg = {
> + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
> + .rpc_argp = &lrp->args,
> + .rpc_resp = &lrp->res,
> + };
> + struct rpc_task_setup task_setup_data = {
> + .rpc_client = lrp->clp->cl_rpcclient,
> + .rpc_message = &msg,
> + .callback_ops = &nfs4_layoutreturn_call_ops,
> + .callback_data = lrp,
> + .flags = RPC_TASK_ASYNC,
> + };
> + int status = 0;
> +
> + dprintk("--> %s\n", __func__);
> + task = rpc_run_task(&task_setup_data);
> + if (IS_ERR(task))
> + return PTR_ERR(task);
> + if (!issync)
> + goto out;
> + status = nfs4_wait_for_completion_rpc_task(task);
> + if (status != 0)
> + goto out;
> + status = task->tk_status;
> +out:
> + dprintk("<-- %s\n", __func__);
> + rpc_put_task(task);
> + return status;
> +}
> +
> static int
> _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
> {
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index dddfb57..53ea3e5 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -338,7 +338,12 @@ static int nfs4_stat_to_errno(int);
> 1 /* layoutupdate4 layout type */ + \
> 1 /* NULL filelayout layoutupdate4 payload */)
> #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
> -
> +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
> + encode_stateid_maxsz + \
> + 1 /* FIXME: opaque lrf_body always empty at
> + *the moment */)
> +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
> + 1 + decode_stateid_maxsz)
> #else /* CONFIG_NFS_V4_1 */
> #define encode_sequence_maxsz 0
> #define decode_sequence_maxsz 0
> @@ -760,7 +765,14 @@ static int nfs4_stat_to_errno(int);
> decode_putfh_maxsz + \
> decode_layoutcommit_maxsz + \
> decode_getattr_maxsz)
> -
> +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
> + encode_sequence_maxsz + \
> + encode_putfh_maxsz + \
> + encode_layoutreturn_maxsz)
> +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
> + decode_sequence_maxsz + \
> + decode_putfh_maxsz + \
> + decode_layoutreturn_maxsz)
>
> const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
> compound_encode_hdr_maxsz +
> @@ -1890,6 +1902,37 @@ encode_layoutcommit(struct xdr_stream *xdr,
> hdr->replen += decode_layoutcommit_maxsz;
> return 0;
> }
> +
> +static void
> +encode_layoutreturn(struct xdr_stream *xdr,
> + const struct nfs4_layoutreturn_args *args,
> + struct compound_hdr *hdr)
> +{
> + nfs4_stateid stateid;
> + __be32 *p;
> +
> + p = reserve_space(xdr, 20);
> + *p++ = cpu_to_be32(OP_LAYOUTRETURN);
> + *p++ = cpu_to_be32(args->reclaim);
> + *p++ = cpu_to_be32(args->layout_type);
> + *p++ = cpu_to_be32(args->range.iomode);
> + *p = cpu_to_be32(args->return_type);
> + if (args->return_type == RETURN_FILE) {
> + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
> + p = xdr_encode_hyper(p, args->range.offset);
> + p = xdr_encode_hyper(p, args->range.length);
> + spin_lock(&args->inode->i_lock);
> + memcpy(stateid.data, NFS_I(args->inode)->layout->plh_stateid.data,
> + NFS4_STATEID_SIZE);
> + spin_unlock(&args->inode->i_lock);
> + p = xdr_encode_opaque_fixed(p, &stateid.data,
> + NFS4_STATEID_SIZE);
> + p = reserve_space(xdr, 4);
> + *p = cpu_to_be32(0);
> + }
> + hdr->nops++;
> + hdr->replen += decode_layoutreturn_maxsz;
> +}
> #endif /* CONFIG_NFS_V4_1 */
>
> /*
> @@ -2707,9 +2750,9 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
> /*
> * Encode LAYOUTCOMMIT request
> */
> -static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
> - struct xdr_stream *xdr,
> - struct nfs4_layoutcommit_args *args)
> +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
> + struct xdr_stream *xdr,
> + struct nfs4_layoutcommit_args *args)
> {
> struct compound_hdr hdr = {
> .minorversion = nfs4_xdr_minorversion(&args->seq_args),
> @@ -2721,7 +2764,24 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
> encode_layoutcommit(xdr, args, &hdr);
> encode_getfattr(xdr, args->bitmask, &hdr);
> encode_nops(&hdr);
> - return 0;
> +}
> +
> +/*
> + * Encode LAYOUTRETURN request
> + */
> +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
> + struct xdr_stream *xdr,
> + struct nfs4_layoutreturn_args *args)
> +{
> + struct compound_hdr hdr = {
> + .minorversion = nfs4_xdr_minorversion(&args->seq_args),
> + };
> +
> + encode_compound_hdr(xdr, req, &hdr);
> + encode_sequence(xdr, &args->seq_args, &hdr);
> + encode_putfh(xdr, NFS_FH(args->inode), &hdr);
> + encode_layoutreturn(xdr, args, &hdr);
> + encode_nops(&hdr);
> }
> #endif /* CONFIG_NFS_V4_1 */
>
> @@ -5202,6 +5262,27 @@ out_overflow:
> return -EIO;
> }
>
> +static int decode_layoutreturn(struct xdr_stream *xdr,
> + struct nfs4_layoutreturn_res *res)
> +{
> + __be32 *p;
> + int status;
> +
> + status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
> + if (status)
> + return status;
> + p = xdr_inline_decode(xdr, 4);
> + if (unlikely(!p))
> + goto out_overflow;
> + res->lrs_present = be32_to_cpup(p);
> + if (res->lrs_present)
> + status = decode_stateid(xdr, &res->stateid);
> + return status;
> +out_overflow:
> + print_overflow_msg(__func__, xdr);
> + return -EIO;
> +}
> +
> static int decode_layoutcommit(struct xdr_stream *xdr,
> struct rpc_rqst *req,
> struct nfs4_layoutcommit_res *res)
> @@ -6319,6 +6400,30 @@ out:
> }
>
> /*
> + * Decode LAYOUTRETURN response
> + */
> +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
> + struct xdr_stream *xdr,
> + struct nfs4_layoutreturn_res *res)
> +{
> + struct compound_hdr hdr;
> + int status;
> +
> + status = decode_compound_hdr(xdr, &hdr);
> + if (status)
> + goto out;
> + status = decode_sequence(xdr, &res->seq_res, rqstp);
> + if (status)
> + goto out;
> + status = decode_putfh(xdr);
> + if (status)
> + goto out;
> + status = decode_layoutreturn(xdr, res);
> +out:
> + return status;
> +}
> +
> +/*
> * Decode LAYOUTCOMMIT response
> */
> static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
> @@ -6544,6 +6649,7 @@ struct rpc_procinfo nfs4_procedures[] = {
> PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
> PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
> PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
> + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
> #endif /* CONFIG_NFS_V4_1 */
> };
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index d9ab972..89e7725 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -321,6 +321,36 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
> return invalid - removed;
> }
>
> +/* Returns false if there was nothing to do, true otherwise */
> +static bool
> +pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list,
> + struct pnfs_layout_range *range)
> +{
> + struct pnfs_layout_segment *lseg, *next;
> + bool rv = false;
> +
> + dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n",
> + __func__, lo, range->offset, range->length, range->iomode);
> + assert_spin_locked(&lo->plh_inode->i_lock);

Not needed in the case of nfs4_evict_inode.

> + if (list_empty(&lo->plh_segs)) {
> + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
> + put_layout_hdr_locked(lo);
> + return 0;
> + }
> + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
> + if (should_free_lseg(lseg->pls_range.iomode, range->iomode)) {
> + dprintk("%s: freeing lseg %p iomode %d "
> + "offset %llu length %llu\n", __func__,
> + lseg, lseg->pls_range.iomode,
> + lseg->pls_range.offset,
> + lseg->pls_range.length);
> + mark_lseg_invalid(lseg, tmp_list);
> + rv = true;
> + }
> + dprintk("%s:Return %d\n", __func__, rv);
> + return rv;
> +}
> +
> /* note free_me must contain lsegs from a single layout_hdr */
> void
> pnfs_free_lseg_list(struct list_head *free_me)
> @@ -539,6 +569,72 @@ out_err_free:
> return NULL;
> }
>
> +static int
> +return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)
> +{
> + struct nfs4_layoutreturn *lrp;
> + struct nfs_server *server = NFS_SERVER(ino);
> + int status = -ENOMEM;
> +
> + dprintk("--> %s\n", __func__);
> +
> + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
> + if (lrp == NULL) {
> + put_layout_hdr(NFS_I(ino)->layout);
> + goto out;
> + }
> + lrp->args.reclaim = 0;
> + lrp->args.layout_type = server->pnfs_curr_ld->id;
> + lrp->args.return_type = RETURN_FILE;
> + lrp->args.range = *range;
> + lrp->args.inode = ino;
> + lrp->clp = server->nfs_client;
> +
> + status = nfs4_proc_layoutreturn(lrp, wait);
> +out:
> + dprintk("<-- %s status: %d\n", __func__, status);
> + return status;
> +}
> +
> +/* Initiates a LAYOUTRETURN(FILE) */
> +int
> +_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, bool wait)

What are the 'range' and 'wait' parameters for? We don't have any users
other than nfs4_evict_inode.

> +{
> + struct pnfs_layout_hdr *lo = NULL;
> + struct nfs_inode *nfsi = NFS_I(ino);
> + struct pnfs_layout_range arg;
> + LIST_HEAD(tmp_list);
> + int status = 0;
> +
> + dprintk("--> %s\n", __func__);
> +
> + arg.iomode = range ? range->iomode : IOMODE_ANY;
> + arg.offset = 0;
> + arg.length = NFS4_MAX_UINT64;
> +
> + spin_lock(&ino->i_lock);
> + lo = nfsi->layout;
> + if (!lo || !pnfs_clear_lseg_list(lo, &tmp_list, &arg)) {
> + spin_unlock(&ino->i_lock);
> + dprintk("%s: no layout segments to return\n", __func__);
> + goto out;
> + }
> + /* Reference matched in nfs4_layoutreturn_release */
> + get_layout_hdr(lo);
> + spin_unlock(&ino->i_lock);
> + pnfs_free_lseg_list(&tmp_list);
> +
> + /* Return layout even if layoutcommit fails */
> + status = pnfs_layoutcommit_inode(ino, wait);

Why is this needed? Again, by the time we get to nfs4_evict_inode, the
inode is guaranteed to be clean.

> + if (status)
> + dprintk("%s: layoutcommit failed, status=%d. Returning layout anyway\n",
> + __func__, status);
> + status = return_layout(ino, &arg, wait);
> +out:
> + dprintk("<-- %s status: %d\n", __func__, status);
> + return status;
> +}
> +
> bool pnfs_roc(struct inode *ino)
> {
> struct pnfs_layout_hdr *lo;
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 4cb0a0d..a308f3c 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -123,6 +123,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
> extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
> struct pnfs_device *dev);
> extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
> +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait);
>
> /* pnfs.c */
> void get_layout_hdr(struct pnfs_layout_hdr *lo);
> @@ -158,6 +159,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
> bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
> +int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, bool wait);
>
> static inline int lo_fail_bit(u32 iomode)
> {
> @@ -226,6 +228,19 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
> put_lseg(req->wb_commit_lseg);
> }
>
> +static inline int pnfs_return_layout(struct inode *ino,
> + struct pnfs_layout_range *range,
> + bool wait)
> +{
> + struct nfs_inode *nfsi = NFS_I(ino);
> + struct nfs_server *nfss = NFS_SERVER(ino);
> +
> + if (pnfs_enabled_sb(nfss) && nfsi->layout)
> + return _pnfs_return_layout(ino, range, wait);
> +
> + return 0;
> +}
> +
> #else /* CONFIG_NFS_V4_1 */
>
> static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
> @@ -267,6 +282,13 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
> return PNFS_NOT_ATTEMPTED;
> }
>
> +static inline int pnfs_return_layout(struct inode *ino,
> + struct pnfs_layout_range *range,
> + bool wait)
> +{
> + return 0;
> +}
> +
> static inline bool
> pnfs_roc(struct inode *ino)
> {
> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
> index 178fafe..9376eaf 100644
> --- a/include/linux/nfs4.h
> +++ b/include/linux/nfs4.h
> @@ -562,6 +562,7 @@ enum {
> NFSPROC4_CLNT_LAYOUTGET,
> NFSPROC4_CLNT_GETDEVICEINFO,
> NFSPROC4_CLNT_LAYOUTCOMMIT,
> + NFSPROC4_CLNT_LAYOUTRETURN,
> };
>
> /* nfs41 types */
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 78b101e..455ddfb 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -266,6 +266,29 @@ struct nfs4_layoutcommit_data {
> struct nfs4_layoutcommit_res res;
> };
>
> +struct nfs4_layoutreturn_args {
> + __u32 reclaim;
> + __u32 layout_type;
> + __u32 return_type;

Why do we need a 'return_type'? As far as I can see, this will always be
RETURN_FILE.

> + struct pnfs_layout_range range;
> + struct inode *inode;
> + struct nfs4_sequence_args seq_args;
> +};
> +
> +struct nfs4_layoutreturn_res {
> + struct nfs4_sequence_res seq_res;
> + u32 lrs_present;
> + nfs4_stateid stateid;
> +};
> +
> +struct nfs4_layoutreturn {
> + struct nfs4_layoutreturn_args args;
> + struct nfs4_layoutreturn_res res;
> + struct rpc_cred *cred;
> + struct nfs_client *clp;
> + int rpc_status;
> +};
> +
> /*
> * Arguments to the open call.
> */

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 19:41:30

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 01/27] pnfs: CB_NOTIFY_DEVICEID

On Wed, 2011-04-20 at 20:26 +0300, Benny Halevy wrote:
> From: Marc Eshel <[email protected]>
>
> Note: This functionality is incomplete as all layout segments referring to
> the 'to be removed device id' need to be reaped, and all in flight I/O drained.
>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/callback.h | 20 +++++++++
> fs/nfs/callback_proc.c | 50 +++++++++++++++++++++++
> fs/nfs/callback_xdr.c | 96 +++++++++++++++++++++++++++++++++++++++++++-
> fs/nfs/nfs4filelayout.c | 1 +
> fs/nfs/nfs4filelayout.h | 1 +
> fs/nfs/nfs4filelayoutdev.c | 38 +++++++++++++++++-
> fs/nfs/pnfs.h | 3 +
> 7 files changed, 207 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index 46d93ce..892128f 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -167,6 +167,26 @@ extern unsigned nfs4_callback_layoutrecall(
>
> extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
> extern void nfs4_cb_take_slot(struct nfs_client *clp);
> +
> +struct cb_devicenotifyitem {
> + uint32_t cbd_notify_type;
> + uint32_t cbd_layout_type;
> + struct nfs4_deviceid cbd_dev_id;
> + uint32_t cbd_immediate;
> +};
> +
> +/* XXX: Should be dynamic up to max compound size */
> +#define NFS4_DEV_NOTIFY_MAXENTRIES 10
> +struct cb_devicenotifyargs {
> + struct sockaddr *addr;

No sockaddr_size parameter?

> + int ndevs;
> + struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES];
> +};

Why can't we make this dynamic at this time?

> +
> +extern __be32 nfs4_callback_devicenotify(
> + struct cb_devicenotifyargs *args,
> + void *dummy, struct cb_process_state *cps);
> +
> #endif /* CONFIG_NFS_V4_1 */
> extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
> extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
> index 2f41dcce..99494f6 100644
> --- a/fs/nfs/callback_proc.c
> +++ b/fs/nfs/callback_proc.c
> @@ -241,6 +241,56 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
> do_callback_layoutrecall(clp, &args);
> }
>
> +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
> + void *dummy, struct cb_process_state *cps)
> +{
> + int i;
> + u32 res = 0;
> + struct nfs_client *clp = cps->clp;
> + struct nfs_server *server = NULL;
> +
> + dprintk("%s: -->\n", __func__);
> +
> + if (!clp) {
> + res = NFS4ERR_OP_NOT_IN_SESSION;
> + goto out;
> + }
> +
> + for (i = 0; i < args->ndevs; i++) {
> + struct cb_devicenotifyitem *dev = &args->devs[i];
> +
> + if (!server ||
> + server->pnfs_curr_ld->id != dev->cbd_layout_type) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
> + if (server->pnfs_curr_ld &&
> + server->pnfs_curr_ld->id == dev->cbd_layout_type) {
> + rcu_read_unlock();
> + goto found;
> + }
> + rcu_read_unlock();
> + dprintk("%s: layout type %u not found\n",
> + __func__, dev->cbd_layout_type);
> + continue;
> + }
> +
> + found:
> + if (!server->pnfs_curr_ld->delete_deviceid) {
> + res = NFS4ERR_NOTSUPP;
> + break;
> + }
> + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
> + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
> + "deleting instead\n", __func__);
> + server->pnfs_curr_ld->delete_deviceid(&dev->cbd_dev_id);
> + }
> +
> +out:
> + dprintk("%s: exit with status = %u\n",
> + __func__, res);
> + return cpu_to_be32(res);
> +}
> +
> int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
> {
> if (delegation == NULL)
> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
> index 00ecf62..5ec2c12 100644
> --- a/fs/nfs/callback_xdr.c
> +++ b/fs/nfs/callback_xdr.c
> @@ -25,6 +25,7 @@
>
> #if defined(CONFIG_NFS_V4_1)
> #define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
> +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
> #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
> 4 + 1 + 3)
> #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
> @@ -284,6 +285,93 @@ out:
> return status;
> }
>
> +static
> +__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
> + struct xdr_stream *xdr,
> + struct cb_devicenotifyargs *args)
> +{
> + __be32 *p;
> + __be32 status = 0;
> + u32 tmp;
> + int n, i;
> + args->ndevs = 0;
> +
> + args->addr = svc_addr(rqstp);
> +
> + /* Num of device notifications */
> + p = read_buf(xdr, sizeof(uint32_t));
> + if (unlikely(p == NULL)) {
> + status = htonl(NFS4ERR_RESOURCE);
> + goto out;
> + }
> + n = ntohl(*p++);
> + if (n <= 0)
> + goto out;
> +
> + /* XXX: need to possibly return error in this case */
> + if (n > NFS4_DEV_NOTIFY_MAXENTRIES) {
> + dprintk("%s: Processing (%d) notifications out of (%d)\n",
> + __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n);
> + n = NFS4_DEV_NOTIFY_MAXENTRIES;
> + }
> +
> + /* Decode each dev notification */
> + for (i = 0; i < n; i++) {
> + struct cb_devicenotifyitem *dev = &args->devs[i];
> +
> + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
> + if (unlikely(p == NULL)) {
> + status = htonl(NFS4ERR_RESOURCE);
> + goto out;
> + }
> +
> + tmp = ntohl(*p++); /* bitmap size */
> + if (tmp != 1) {
> + status = htonl(NFS4ERR_INVAL);
> + goto out;
> + }
> + dev->cbd_notify_type = ntohl(*p++);
> + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
> + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
> + status = htonl(NFS4ERR_INVAL);
> + goto out;
> + }
> +
> + tmp = ntohl(*p++); /* opaque size */
> + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
> + (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
> + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
> + (tmp != NFS4_DEVICEID4_SIZE + 4))) {
> + status = htonl(NFS4ERR_INVAL);
> + goto out;
> + }
> + dev->cbd_layout_type = ntohl(*p++);
> + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
> + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
> +
> + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
> + p = read_buf(xdr, sizeof(uint32_t));
> + if (unlikely(p == NULL)) {
> + status = htonl(NFS4ERR_DELAY);
> + goto out;
> + }
> + dev->cbd_immediate = ntohl(*p++);
> + } else {
> + dev->cbd_immediate = 0;
> + }
> +
> + args->ndevs++;
> +
> + dprintk("%s: type %d layout 0x%x immediate %d\n",
> + __func__, dev->cbd_notify_type, dev->cbd_layout_type,
> + dev->cbd_immediate);
> + }
> +out:
> + dprintk("%s: status %d ndevs %d\n",
> + __func__, ntohl(status), args->ndevs);
> + return status;
> +}
> +
> static __be32 decode_sessionid(struct xdr_stream *xdr,
> struct nfs4_sessionid *sid)
> {
> @@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
> case OP_CB_RECALL_ANY:
> case OP_CB_RECALL_SLOT:
> case OP_CB_LAYOUTRECALL:
> + case OP_CB_NOTIFY_DEVICEID:
> *op = &callback_ops[op_nr];
> break;
>
> - case OP_CB_NOTIFY_DEVICEID:
> case OP_CB_NOTIFY:
> case OP_CB_PUSH_DELEG:
> case OP_CB_RECALLABLE_OBJ_AVAIL:
> @@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
> (callback_decode_arg_t)decode_layoutrecall_args,
> .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
> },
> + [OP_CB_NOTIFY_DEVICEID] = {
> + .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
> + .decode_args =
> + (callback_decode_arg_t)decode_devicenotify_args,
> + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
> + },
> [OP_CB_SEQUENCE] = {
> .process_op = (callback_process_op_t)nfs4_callback_sequence,
> .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index e6e0c294..2feab7f 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -867,6 +867,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
> .commit_pagelist = filelayout_commit_pagelist,
> .read_pagelist = filelayout_read_pagelist,
> .write_pagelist = filelayout_write_pagelist,
> + .delete_deviceid = filelayout_delete_deviceid,
> };
>
> static int __init nfs4filelayout_init(void)
> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> index 7c44579..8be70ab 100644
> --- a/fs/nfs/nfs4filelayout.h
> +++ b/fs/nfs/nfs4filelayout.h
> @@ -105,5 +105,6 @@ nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
> extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
> struct nfs4_file_layout_dsaddr *
> get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
> +void filelayout_delete_deviceid(struct nfs4_deviceid *);
>
> #endif /* FS_NFS_NFS4FILELAYOUT_H */
> diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
> index de5350f..601aaea 100644
> --- a/fs/nfs/nfs4filelayoutdev.c
> +++ b/fs/nfs/nfs4filelayoutdev.c
> @@ -601,7 +601,7 @@ void
> nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
> {
> if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
> - hlist_del_rcu(&dsaddr->node);
> + hlist_del_init_rcu(&dsaddr->node);
> spin_unlock(&filelayout_deviceid_lock);
>
> synchronize_rcu();
> @@ -631,6 +631,42 @@ fail:
> return NULL;
> }
>
> +static struct nfs4_file_layout_dsaddr *
> +nfs4_fl_unhash_deviceid(struct nfs4_deviceid *id)
> +{
> + struct nfs4_file_layout_dsaddr *d;
> + struct hlist_node *n;
> + long hash = nfs4_fl_deviceid_hash(id);
> +
> + dprintk("%s: hash %ld\n", __func__, hash);
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node)
> + if (!memcmp(&d->deviceid, id, sizeof(*id)))
> + goto found;
> + rcu_read_unlock();
> + return NULL;
> +
> +found:
> + rcu_read_unlock();

Is there a reason why we should drop the rcu lock here...

> + spin_lock(&filelayout_deviceid_lock);
> + hlist_del_init_rcu(&d->node);
> + spin_unlock(&filelayout_deviceid_lock);

...instead of here? IOW: do we need to re-enable preemption only to
disable it again in the spin_lock() call?

Also, how are you preventing races? There is no test under the spin lock
for whether or not the device is still hashed before you call
hlist_del_init_rcu(), so how do you know that it is safe to put the
reference in filelayout_delete_deviceid()?

> + synchronize_rcu();
> +
> + return d;
> +}
> +
> +void
> +filelayout_delete_deviceid(struct nfs4_deviceid *id)
> +{
> + struct nfs4_file_layout_dsaddr *d;
> +
> + d = nfs4_fl_unhash_deviceid(id);
> + /* balance the initial ref taken in decode_and_add_device */
> + if (d && atomic_dec_and_test(&d->ref))
> + nfs4_fl_free_deviceid(d);
> +}
> +
> /*
> * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
> * Then: ((res + fsi) % dsaddr->stripe_count)
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index bc48272..4cb0a0d 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -89,6 +89,9 @@ struct pnfs_layoutdriver_type {
> */
> enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
> enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
> +
> + /* device notification methods */
> + void (*delete_deviceid)(struct nfs4_deviceid *);
> };
>
> struct pnfs_layout_hdr {

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com


2011-04-20 20:22:17

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [RFC 08/27] pnfs: {setup,cleanup}_layoutcommit

On Wed, 2011-04-20 at 20:27 +0300, Benny Halevy wrote:
> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Andy Adamson <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 2 ++
> fs/nfs/nfs4xdr.c | 1 +
> fs/nfs/pnfs.c | 21 +++++++++++++++++++++
> fs/nfs/pnfs.h | 8 ++++++++
> include/linux/nfs_xdr.h | 1 +
> 5 files changed, 33 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index b4df7a6..d0eb50b 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5807,6 +5807,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
> if (nfs4_setup_sequence(server, &data->args.seq_args,
> &data->res.seq_res, 1, task))
> return;
> + data->res.status = -1;
> rpc_call_start(task);
> }
>
> @@ -5841,6 +5842,7 @@ static void nfs4_layoutcommit_release(void *calldata)
> {
> struct nfs4_layoutcommit_data *data = calldata;
>
> + pnfs_cleanup_layoutcommit(data->args.inode, data);
> /* Matched by references in pnfs_set_layoutcommit */
> put_lseg(data->lseg);
> put_rpccred(data->cred);
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 4f7bef9..23e608f 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -5307,6 +5307,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
> int status;
>
> status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
> + res->status = status;
> if (status)
> return status;
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 0b4ad1f..a5050d2 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1181,6 +1181,19 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
> return trypnfs;
> }
>
> +void pnfs_cleanup_layoutcommit(struct inode *inode,
> + struct nfs4_layoutcommit_data *data)
> +{
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + /* TODO: Maybe we should avoid this by allowing the layout driver
> + * to directly xdr its layout on the wire.
> + */
> + if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
> + nfss->pnfs_curr_ld->cleanup_layoutcommit(
> + NFS_I(inode)->layout, data);
> +}
> +
> /*
> * Currently there is only one (whole file) write lseg.
> */
> @@ -1277,6 +1290,14 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
> data->args.lastbytewritten = end_pos - 1;
> data->res.server = NFS_SERVER(inode);
>
> + /* Call layout driver to set the arguments */
> + if (NFS_SERVER(inode)->pnfs_curr_ld->setup_layoutcommit) {
> + status = NFS_SERVER(inode)->pnfs_curr_ld->setup_layoutcommit(
> + NFS_I(inode)->layout, &data->args);
> + if (status)
> + goto out;
> + }
> +
> status = nfs4_proc_layoutcommit(data, sync);
> out:
> dprintk("<-- %s status %d\n", __func__, status);
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 011885e..9f8e970 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -99,10 +99,16 @@ struct pnfs_layoutdriver_type {
> /* device notification methods */
> void (*delete_deviceid)(struct nfs4_deviceid *);
>
> + int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> + struct nfs4_layoutcommit_args *args);
> +
> void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> struct xdr_stream *xdr,
> const struct nfs4_layoutcommit_args *args);
>
> + void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> + struct nfs4_layoutcommit_data *data);
> +

This is yet more callback ugliness. Please replace this, the
encode_layoutcommit callback, etc. with a single

int (*layoutcommit) (struct pnfs_layout_hdr)

that does everything from pre- and post-processing to the actual RPC
call for layoutcommit.


--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
http://www.netapp.com