2012-05-23 13:03:39

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 1/4] NFSv4.1 mdsthreshold attribute xdr

From: Andy Adamson <[email protected]>

We only support one layout type per file system, so only the first
threshold_item4 of an mdsthreshold4 is decoded.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/nfs4xdr.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/nfs4.h | 7 +++
include/linux/nfs_xdr.h | 10 ++++
3 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index db040e9..db199f8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -99,9 +99,12 @@ static int nfs4_stat_to_errno(int);
#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
#define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+/* We support only one layout type per file system */
+#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
/* This is based on getfattr, which uses the most attributes: */
#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
- 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz))
+ 3 + 3 + 3 + nfs4_owner_maxsz + \
+ nfs4_group_maxsz + decode_mdsthreshold_maxsz))
#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \
nfs4_fattr_value_maxsz)
#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -1170,6 +1173,16 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
bitmask[1] & nfs4_fattr_bitmap[1], hdr);
}

+static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
+ struct compound_hdr *hdr)
+{
+ encode_getattr_three(xdr,
+ bitmask[0] & nfs4_fattr_bitmap[0],
+ bitmask[1] & nfs4_fattr_bitmap[1],
+ bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD,
+ hdr);
+}
+
static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
encode_getattr_three(xdr,
@@ -2161,7 +2174,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_putfh(xdr, args->fh, &hdr);
encode_open(xdr, args, &hdr);
encode_getfh(xdr, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ encode_getfattr_open(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}

@@ -4183,6 +4196,110 @@ xdr_error:
return status;
}

+static int decode_threshold_hint(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ uint64_t *res,
+ uint32_t hint_bit)
+{
+ __be32 *p;
+
+ *res = 0;
+ if (likely(bitmap[0] & hint_bit)) {
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
+ }
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_first_threshold_item4(struct xdr_stream *xdr,
+ struct nfs4_threshold *res)
+{
+ __be32 *p, *savep;
+ uint32_t bitmap[3] = {0,}, attrlen;
+ int status;
+
+ /* layout type */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ res->l_type = be32_to_cpup(p);
+
+ /* thi_hintset bitmap */
+ status = decode_attr_bitmap(xdr, bitmap);
+ if (status < 0)
+ goto xdr_error;
+
+ /* thi_hintlist length */
+ status = decode_attr_length(xdr, &attrlen, &savep);
+ if (status < 0)
+ goto xdr_error;
+ /* thi_hintlist */
+ status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz,
+ THRESHOLD_RD_IO);
+ if (status < 0)
+ goto xdr_error;
+ status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz,
+ THRESHOLD_WR_IO);
+ if (status < 0)
+ goto xdr_error;
+
+ status = verify_attr_len(xdr, savep, attrlen);
+ res->bm = bitmap[0];
+
+ dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz,
+ res->wr_io_sz);
+xdr_error:
+ dprintk("%s ret=%d!\n", __func__, status);
+ return status;
+}
+
+/*
+ * Thresholds on pNFS direct I/O vrs MDS I/O
+ */
+static int decode_attr_mdsthreshold(struct xdr_stream *xdr,
+ uint32_t *bitmap,
+ struct nfs4_threshold *res)
+{
+ __be32 *p;
+ int status = 0;
+ uint32_t num;
+
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ num = be32_to_cpup(p);
+ if (num == 0)
+ return 0;
+ if (num > 1)
+ printk(KERN_INFO "%s: Warning: Multiple pNFS layout "
+ "drivers per filesystem not supported\n",
+ __func__);
+
+ status = decode_first_threshold_item4(xdr, res);
+ }
+ return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
struct nfs_fattr *fattr, struct nfs_fh *fh,
struct nfs4_fs_locations *fs_loc,
@@ -4289,6 +4406,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;

+ status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
+ if (status < 0)
+ goto xdr_error;
+
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0987146..72b6bad 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -526,6 +526,13 @@ enum lock_type4 {
#define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23)
#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30)
#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
+#define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4)
+
+/* MDS threshold bitmap bits */
+#define THRESHOLD_RD (1UL << 0)
+#define THRESHOLD_WR (1UL << 1)
+#define THRESHOLD_RD_IO (1UL << 2)
+#define THRESHOLD_WR_IO (1UL << 3)

#define NFSPROC4_NULL 0
#define NFSPROC4_COMPOUND 1
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2e53a3f..5b8e42e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -35,6 +35,15 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
return a->major == b->major && a->minor == b->minor;
}

+struct nfs4_threshold {
+ __u32 bm;
+ __u32 l_type;
+ __u64 rd_sz;
+ __u64 wr_sz;
+ __u64 rd_io_sz;
+ __u64 wr_io_sz;
+};
+
struct nfs_fattr {
unsigned int valid; /* which fields are valid */
umode_t mode;
@@ -67,6 +76,7 @@ struct nfs_fattr {
unsigned long gencount;
struct nfs4_string *owner_name;
struct nfs4_string *group_name;
+ struct nfs4_threshold *mdsthreshold; /* pNFS threshold hints */
};

#define NFS_ATTR_FATTR_TYPE (1U << 0)
--
1.7.7.6



2012-05-23 13:03:40

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 3/4] NFSv4.1 add nfs_inode book keeping for mdsthreshold

From: Andy Adamson <[email protected]>

Keep track of the number of bytes read or written, including those queued
up to be flushed. For use by mdsthreshold i/o size hints.

No locking needed as this is used as hint information.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/file.c | 8 ++++++--
fs/nfs/inode.c | 2 ++
fs/nfs/pnfs.c | 3 +++
include/linux/nfs_fs.h | 3 +++
4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8eda8a6..c4cc096 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -203,8 +203,10 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_aio_read(iocb, iov, nr_segs, pos);
- if (result > 0)
+ if (result > 0) {
+ NFS_I(inode)->read_io += result;
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+ }
}
return result;
}
@@ -613,8 +615,10 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0)
result = err;
}
- if (result > 0)
+ if (result > 0) {
+ NFS_I(inode)->write_io += written;
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+ }
out:
return result;

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 889f7e5..a6f5fbb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -323,6 +323,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_gid = -2;
inode->i_blocks = 0;
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ nfsi->write_io = 0;
+ nfsi->read_io = 0;

nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cbcb6ae..6620606 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -395,6 +395,9 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
dprintk("%s:Begin lo %p\n", __func__, lo);

if (list_empty(&lo->plh_segs)) {
+ /* Reset MDS Threshold I/O counters */
+ NFS_I(lo->plh_inode)->write_io = 0;
+ NFS_I(lo->plh_inode)->read_io = 0;
if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
put_layout_hdr_locked(lo);
return 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ca4a707..c6954ac 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -201,6 +201,9 @@ struct nfs_inode {

/* pNFS layout information */
struct pnfs_layout_hdr *layout;
+ /* how many bytes have been written/read and how many bytes queued up */
+ __u64 write_io;
+ __u64 read_io;
#endif /* CONFIG_NFS_V4*/
#ifdef CONFIG_NFS_FSCACHE
struct fscache_cookie *fscache;
--
1.7.7.6


2012-05-23 13:03:40

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 2/4] NFSv4.1 cache mdsthreshold values on OPEN

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/inode.c | 2 ++
fs/nfs/nfs4proc.c | 38 +++++++++++++++++++++++++++++++++-----
fs/nfs/pnfs.c | 12 ++++++++++++
fs/nfs/pnfs.h | 21 +++++++++++++++++++++
include/linux/nfs_fs.h | 1 +
5 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9ad81ce..889f7e5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -641,6 +641,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f
nfs_init_lock_context(&ctx->lock_context);
ctx->lock_context.open_context = ctx;
INIT_LIST_HEAD(&ctx->list);
+ ctx->mdsthreshold = NULL;
return ctx;
}

@@ -669,6 +670,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
put_rpccred(ctx->cred);
dput(ctx->dentry);
nfs_sb_deactive(sb);
+ kfree(ctx->mdsthreshold);
kfree(ctx);
}

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78784e5..d84c633 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1781,7 +1781,14 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
/*
* Returns a referenced nfs4_state
*/
-static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir,
+ struct dentry *dentry,
+ fmode_t fmode,
+ int flags,
+ struct iattr *sattr,
+ struct rpc_cred *cred,
+ struct nfs4_state **res,
+ struct nfs4_threshold **ctx_th)
{
struct nfs4_state_owner *sp;
struct nfs4_state *state = NULL;
@@ -1806,6 +1813,11 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
if (opendata == NULL)
goto err_put_state_owner;

+ if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+ if (!opendata->f_attr.mdsthreshold)
+ goto err_opendata_put;
+ }
if (dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(dentry->d_inode, sp);

@@ -1831,11 +1843,19 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
nfs_setattr_update_inode(state->inode, sattr);
nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
}
+
+ if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
+ *ctx_th = opendata->f_attr.mdsthreshold;
+ else
+ kfree(opendata->f_attr.mdsthreshold);
+ opendata->f_attr.mdsthreshold = NULL;
+
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
*res = state;
return 0;
err_opendata_put:
+ kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
@@ -1845,14 +1865,21 @@ out_err:
}


-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir,
+ struct dentry *dentry,
+ fmode_t fmode,
+ int flags,
+ struct iattr *sattr,
+ struct rpc_cred *cred,
+ struct nfs4_threshold **ctx_th)
{
struct nfs4_exception exception = { };
struct nfs4_state *res;
int status;

do {
- status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res);
+ status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
+ &res, ctx_th);
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2176,7 +2203,8 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags
struct nfs4_state *state;

/* Protect against concurrent sillydeletes */
- state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred);
+ state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
+ ctx->cred, &ctx->mdsthreshold);
if (IS_ERR(state))
return ERR_CAST(state);
ctx->state = state;
@@ -2778,7 +2806,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
fmode = ctx->mode;
}
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, de, fmode, flags, sattr, cred);
+ state = nfs4_do_open(dir, de, fmode, flags, sattr, cred, NULL);
d_drop(dentry);
if (IS_ERR(state)) {
status = PTR_ERR(state);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5d09a36..cbcb6ae 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1630,3 +1630,15 @@ out_free:
kfree(data);
goto out;
}
+
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+ struct nfs4_threshold *thp;
+
+ thp = kzalloc(sizeof(*thp), GFP_NOFS);
+ if (!thp) {
+ dprintk("%s mdsthreshold allocation failed\n", __func__);
+ return NULL;
+ }
+ return thp;
+}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 7980756..29fd23c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -227,6 +227,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
const struct nfs_pgio_completion_ops *compl_ops);
int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
const struct nfs_pgio_completion_ops *compl_ops);
+struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);

/* nfs4_deviceid_flags */
enum {
@@ -360,6 +361,14 @@ static inline int pnfs_return_layout(struct inode *ino)
return 0;
}

+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+ struct nfs_server *nfss)
+{
+ return (dst && src && src->bm != 0 &&
+ nfss->pnfs_curr_ld->id == src->l_type);
+}
+
#ifdef NFS_DEBUG
void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
#else
@@ -485,6 +494,18 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
return 0;
}

+static inline bool
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
+ struct nfs_server *nfss)
+{
+ return false;
+}
+
+static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
+{
+ return NULL;
+}
+
#endif /* CONFIG_NFS_V4_1 */

#endif /* FS_NFS_PNFS_H */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6cc7dba..ca4a707 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -102,6 +102,7 @@ struct nfs_open_context {
int error;

struct list_head list;
+ struct nfs4_threshold *mdsthreshold;
};

struct nfs_open_dir_context {
--
1.7.7.6


2012-05-23 13:25:51

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 4/4] NFSv4.1 test the mdsthreshold hint parameters

On 05/23/2012 12:02 PM, [email protected] wrote:

> From: Andy Adamson <[email protected]>
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
> fs/nfs/pnfs.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 79 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 6620606..b8323aa 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -936,6 +936,81 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
> }
>
> /*
> + * Use mdsthreshold hints set at each OPEN to determine if I/O should go
> + * to the MDS or over pNFS
> + *
> + * The nfs_inode read_io and write_io fields are cumulative counters reset
> + * when there are no layout segments. Note that in pnfs_update_layout iomode
> + * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
> + * WRITE request.
> + *
> + * A return of true means use MDS I/O.
> + *
> + * From rfc 5661:
> + * If a file's size is smaller than the file size threshold, data accesses
> + * SHOULD be sent to the metadata server. If an I/O request has a length that
> + * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
> + * server. If both file size and I/O size are provided, the client SHOULD
> + * reach or exceed both thresholds before sending its read or write
> + * requests to the data server.
> + */
> +static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
> + struct inode *ino, int iomode)
> +{
> + struct nfs4_threshold *t = ctx->mdsthreshold;
> + struct nfs_inode *nfsi = NFS_I(ino);
> + loff_t fsize = i_size_read(ino);
> + bool size = false, size_set = false, io = false, io_set = false, ret = false;
> +
> + if (t == NULL)
> + return ret;
> +
> + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
> + __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
> +
> + switch (iomode) {
> + case IOMODE_READ:
> + if (t->bm & THRESHOLD_RD) {
> + dprintk("%s fsize %llu\n", __func__, fsize);
> + size_set = true;
> + if (fsize < t->rd_sz)
> + size = true;
> + }
> + if (t->bm & THRESHOLD_RD_IO) {
> + dprintk("%s nfsi->read_io %llu\n", __func__,
> + nfsi->read_io);
> + io_set = true;
> + if (nfsi->read_io < t->rd_io_sz)
> + io = true;
> + }
> + break;
> + case IOMODE_RW:
> + if (t->bm & THRESHOLD_WR) {
> + dprintk("%s fsize %llu\n", __func__, fsize);
> + size_set = true;
> + if (fsize < t->wr_sz)
> + size = true;
> + }
> + if (t->bm & THRESHOLD_WR_IO) {
> + dprintk("%s nfsi->write_io %llu\n", __func__,
> + nfsi->write_io);
> + io_set = true;
> + if (nfsi->write_io < t->wr_io_sz)
> + io = true;
> + }
> + break;
> + }
> + if (size_set && io_set) {
> + if (size && io)
> + ret = true;
> + } else if (size || io)
> + ret = true;
> +
> + dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
> + return ret;
> +}
> +
> +/*
> * Layout segment is retreived from the server if not cached.
> * The appropriate layout segment is referenced and returned to the caller.
> */
> @@ -962,6 +1037,10 @@ pnfs_update_layout(struct inode *ino,
>
> if (!pnfs_enabled_sb(NFS_SERVER(ino)))
> return NULL;
> +
> + if (pnfs_within_mdsthreshold(ctx, ino, iomode))
> + return NULL;
> +


Would we want to use these counters as the recommended layout_size in
read and write, instead of the current PAGE_SIZE?

Boaz

> spin_lock(&ino->i_lock);
> lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
> if (lo == NULL) {



2012-05-23 18:20:15

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH 3/4] NFSv4.1 add nfs_inode book keeping for mdsthreshold

On Wed, 2012-05-23 at 05:02 -0400, [email protected] wrote:
> From: Andy Adamson <[email protected]>
>
> Keep track of the number of bytes read or written, including those queued
> up to be flushed. For use by mdsthreshold i/o size hints.
>
> No locking needed as this is used as hint information.
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
>  fs/nfs/file.c          |    8 ++++++--
>  fs/nfs/inode.c         |    2 ++
>  fs/nfs/pnfs.c          |    3 +++
>  include/linux/nfs_fs.h |    3 +++
>  4 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 8eda8a6..c4cc096 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -203,8 +203,10 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
>  	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
>  	if (!result) {
>  		result = generic_file_aio_read(iocb, iov, nr_segs, pos);
> -		if (result > 0)
> +		if (result > 0) {
> +			NFS_I(inode)->read_io += result;

Should we perhaps rather do this from nfs_readpages(), nfs_readpage()
and nfs_direct_read()?

If we do it here in nfs_file_read, we miss mmaped reads, O_DIRECT reads,
as well as splice reads. We also count read cache hits where we don't
have to actually access the server.

>  			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
> +		}
>  	}
>  	return result;
>  }
> @@ -613,8 +615,10 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
>  		if (err < 0)
>  			result = err;
>  	}
> -	if (result > 0)
> +	if (result > 0) {
> +		NFS_I(inode)->write_io += written;

For the same reason, perhaps we should move this to
nfs_direct_write_schedule_iovec(), and nfs_write_end().

>  		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
> +	}
>  out:
>  	return result;
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 889f7e5..a6f5fbb 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -323,6 +323,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
>  		inode->i_gid = -2;
>  		inode->i_blocks = 0;
>  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
> +		nfsi->write_io = 0;
> +		nfsi->read_io = 0;
>
>  		nfsi->read_cache_jiffies = fattr->time_start;
>  		nfsi->attr_gencount = fattr->gencount;
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index cbcb6ae..6620606 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -395,6 +395,9 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>  	dprintk("%s:Begin lo %p\n", __func__, lo);
>
>  	if (list_empty(&lo->plh_segs)) {
> +		/* Reset MDS Threshold I/O counters */
> +		NFS_I(lo->plh_inode)->write_io = 0;
> +		NFS_I(lo->plh_inode)->read_io = 0;
>  		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
>  			put_layout_hdr_locked(lo);
>  		return 0;
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index ca4a707..c6954ac 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -201,6 +201,9 @@ struct nfs_inode {
>
>  	/* pNFS layout information */
>  	struct pnfs_layout_hdr *layout;
> +	/* how many bytes have been written/read and how many bytes queued up */
> +	__u64 write_io;
> +	__u64 read_io;
>  #endif /* CONFIG_NFS_V4*/

^^^^ This doesn't look as if it will compile without CONFIG_NFS_V4.

>  #ifdef CONFIG_NFS_FSCACHE
>  	struct fscache_cookie	*fscache;

-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
www.netapp.com

2012-05-23 13:03:40

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 4/4] NFSv4.1 test the mdsthreshold hint parameters

From: Andy Adamson <[email protected]>

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 79 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6620606..b8323aa 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -936,6 +936,81 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
}

/*
+ * Use mdsthreshold hints set at each OPEN to determine if I/O should go
+ * to the MDS or over pNFS
+ *
+ * The nfs_inode read_io and write_io fields are cumulative counters reset
+ * when there are no layout segments. Note that in pnfs_update_layout iomode
+ * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
+ * WRITE request.
+ *
+ * A return of true means use MDS I/O.
+ *
+ * From rfc 5661:
+ * If a file's size is smaller than the file size threshold, data accesses
+ * SHOULD be sent to the metadata server. If an I/O request has a length that
+ * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
+ * server. If both file size and I/O size are provided, the client SHOULD
+ * reach or exceed both thresholds before sending its read or write
+ * requests to the data server.
+ */
+static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
+ struct inode *ino, int iomode)
+{
+ struct nfs4_threshold *t = ctx->mdsthreshold;
+ struct nfs_inode *nfsi = NFS_I(ino);
+ loff_t fsize = i_size_read(ino);
+ bool size = false, size_set = false, io = false, io_set = false, ret = false;
+
+ if (t == NULL)
+ return ret;
+
+ dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
+ __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
+
+ switch (iomode) {
+ case IOMODE_READ:
+ if (t->bm & THRESHOLD_RD) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->rd_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_RD_IO) {
+ dprintk("%s nfsi->read_io %llu\n", __func__,
+ nfsi->read_io);
+ io_set = true;
+ if (nfsi->read_io < t->rd_io_sz)
+ io = true;
+ }
+ break;
+ case IOMODE_RW:
+ if (t->bm & THRESHOLD_WR) {
+ dprintk("%s fsize %llu\n", __func__, fsize);
+ size_set = true;
+ if (fsize < t->wr_sz)
+ size = true;
+ }
+ if (t->bm & THRESHOLD_WR_IO) {
+ dprintk("%s nfsi->write_io %llu\n", __func__,
+ nfsi->write_io);
+ io_set = true;
+ if (nfsi->write_io < t->wr_io_sz)
+ io = true;
+ }
+ break;
+ }
+ if (size_set && io_set) {
+ if (size && io)
+ ret = true;
+ } else if (size || io)
+ ret = true;
+
+ dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
+ return ret;
+}
+
+/*
* Layout segment is retreived from the server if not cached.
* The appropriate layout segment is referenced and returned to the caller.
*/
@@ -962,6 +1037,10 @@ pnfs_update_layout(struct inode *ino,

if (!pnfs_enabled_sb(NFS_SERVER(ino)))
return NULL;
+
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+ return NULL;
+
spin_lock(&ino->i_lock);
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
--
1.7.7.6