2016-12-08 20:58:59

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 0/5] async requests support for 9pfs

Hi all,

This patch series introduces async requests for read and write
operations. If the read, or the write, is an async operation to begin
with (aio), we can avoid waiting for the server response.

This is my first contribution to 9p, so feedback and suggestions are
welcome!


Stefano Stabellini (5):
9p: add iocb parameter to p9_client_read and p9_client_write
9p: store req details and callback in struct p9_req_t
9p: introduce p9_client_get_req
9p: introduce async read requests
9p: introduce async write requests

fs/9p/vfs_addr.c | 8 +-
fs/9p/vfs_dir.c | 2 +-
fs/9p/vfs_file.c | 4 +-
fs/9p/xattr.c | 4 +-
include/net/9p/client.h | 15 +++-
net/9p/client.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++--
6 files changed, 211 insertions(+), 17 deletions(-)


2016-12-08 20:59:16

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 4/5] 9p: introduce async read requests

If the read is an async operation, send a 9p request and return
EIOCBQUEUED. Do not wait for completion.

Complete the read operation from a callback instead.

Signed-off-by: Stefano Stabellini <[email protected]>
---
net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index eb589ef..f9f09db 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -28,6 +28,7 @@
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/idr.h>
#include <linux/mutex.h>
@@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
}
EXPORT_SYMBOL(p9_client_unlinkat);

+static void
+p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status)
+{
+ int err, count, n, i, total = 0;
+ char *dataptr, *to;
+
+ if (req->status == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ goto out;
+ }
+ err = p9_check_errors(clnt, req);
+ if (err)
+ goto out;
+
+ err = p9pdu_readf(req->rc, clnt->proto_version,
+ "D", &count, &dataptr);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto out;
+ }
+ if (!count) {
+ p9_debug(P9_DEBUG_ERROR, "count=%d\n", count);
+ err = 0;
+ goto out;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+ if (count > req->rsize)
+ count = req->rsize;
+
+ for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) {
+ to = kmap(req->pagevec[i]);
+ to += req->offset;
+ n = PAGE_SIZE - req->offset;
+ if (n > count)
+ n = count;
+ memcpy(to, dataptr, n);
+ kunmap(req->pagevec[i]);
+ req->offset = 0;
+ count -= n;
+ total += n;
+ }
+
+ err = total;
+ req->kiocb->ki_pos += total;
+
+out:
+ req->kiocb->ki_complete(req->kiocb, err, 0);
+
+ release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false);
+ kvfree(req->pagevec);
+ p9_free_req(clnt, req);
+}
+
int
p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
struct iov_iter *to, int *err)
{
struct p9_client *clnt = fid->clnt;
struct p9_req_t *req;
- int total = 0;
+ int total = 0, i;
*err = 0;

p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
@@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
0, 11, "dqd", fid->fid,
offset, rsize);
- } else {
+ /* sync request */
+ } else if(iocb == NULL || is_sync_kiocb(iocb)) {
non_zc = 1;
req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
rsize);
+ /* async request */
+ } else {
+ req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ break;
+ }
+ req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
+ (size_t)rsize, &req->offset);
+ req->kiocb = iocb;
+ for (i = 0; i < req->rsize; i += PAGE_SIZE)
+ page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
+ req->callback = p9_client_read_complete;
+
+ *err = clnt->trans_mod->request(clnt, req);
+ if (*err < 0) {
+ clnt->status = Disconnected;
+ release_pages(req->pagevec,
+ (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
+ true);
+ kvfree(req->pagevec);
+ p9_free_req(clnt, req);
+ break;
+ }
+
+ *err = -EIOCBQUEUED;
+ break;
}
if (IS_ERR(req)) {
*err = PTR_ERR(req);
--
1.9.1

2016-12-08 20:59:26

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 5/5] 9p: introduce async write requests

If the write is an async operation, send a 9p request and return
EIOCBQUEUED. Do not wait for completion.

Complete the write operation from a callback instead.

Signed-off-by: Stefano Stabellini <[email protected]>
---
net/9p/client.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index f9f09db..18942b7 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1719,6 +1719,61 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
}
EXPORT_SYMBOL(p9_client_read);

+static void
+p9_client_write_complete(struct p9_client *clnt, struct p9_req_t *req, int status)
+{
+ int err, count;
+ struct file *file;
+ struct inode *inode;
+ unsigned long pg_start, pg_end;
+ loff_t i_size;
+
+ if (req->status == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ goto out;
+ }
+ err = p9_check_errors(clnt, req);
+ if (err)
+ goto out;
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto out;
+ }
+ if (!count) {
+ p9_debug(P9_DEBUG_ERROR, "count=%d\n", count);
+ err = 0;
+ goto out;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+
+ if (count > req->rsize)
+ count = req->rsize;
+
+ err = count;
+ file = req->kiocb->ki_filp;
+ inode = file_inode(file);
+ pg_start = req->kiocb->ki_pos >> PAGE_SHIFT;
+ pg_end = (req->kiocb->ki_pos + count - 1) >> PAGE_SHIFT;
+
+ if (inode->i_mapping && inode->i_mapping->nrpages)
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
+ req->kiocb->ki_pos += count;
+ i_size = i_size_read(inode);
+ if (req->kiocb->ki_pos > i_size) {
+ inode_add_bytes(inode, req->kiocb->ki_pos - i_size);
+ i_size_write(inode, req->kiocb->ki_pos);
+ }
+out:
+ req->kiocb->ki_complete(req->kiocb, err, 0);
+
+ p9_free_req(clnt, req);
+}
+
int
p9_client_write(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
struct iov_iter *from, int *err)
@@ -1746,9 +1801,32 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
rsize, P9_ZC_HDR_SZ, "dqd",
fid->fid, offset, rsize);
- } else {
+ /* sync request */
+ } else if(iocb == NULL || is_sync_kiocb(iocb)) {
req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
offset, rsize, from);
+ /* async request */
+ } else {
+ req = p9_client_get_req(clnt, P9_TWRITE, "dqV", fid->fid,
+ offset, rsize, from);
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ break;
+ }
+ req->rsize = rsize;
+ req->kiocb = iocb;
+ req->callback = p9_client_write_complete;
+
+ *err = clnt->trans_mod->request(clnt, req);
+ if (*err < 0) {
+ clnt->status = Disconnected;
+ p9_free_req(clnt, req);
+ break;
+ }
+
+ iov_iter_advance(from, rsize);
+ *err = -EIOCBQUEUED;
+ break;
}
if (IS_ERR(req)) {
*err = PTR_ERR(req);
--
1.9.1

2016-12-08 20:59:14

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 2/5] 9p: store req details and callback in struct p9_req_t

Add a few fields to struct p9_req_t. Callback is the function which will
be called upon request completion. offset, rsize, pagevec and kiocb
store important information regarding the read or write request,
essential to complete the request.

Currently not utilized, but they will be used in a later patch.

Signed-off-by: Stefano Stabellini <[email protected]>
---
include/net/9p/client.h | 8 ++++++++
net/9p/client.c | 9 ++++++++-
2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index aef19c6..69fc2f0 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -110,6 +110,7 @@ enum p9_req_status_t {
*
*/

+struct p9_client;
struct p9_req_t {
int status;
int t_err;
@@ -118,6 +119,13 @@ struct p9_req_t {
struct p9_fcall *rc;
void *aux;

+ /* Used for async requests */
+ void (*callback)(struct p9_client *c, struct p9_req_t *req, int status);
+ size_t offset;
+ u64 rsize;
+ struct page **pagevec;
+ struct kiocb *kiocb;
+
struct list_head req_list;
};

diff --git a/net/9p/client.c b/net/9p/client.c
index b5ea9a3..bfe1715 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -405,6 +405,10 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
int tag = r->tc->tag;
p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);

+ r->offset = 0;
+ r->rsize = 0;
+ r->kiocb = NULL;
+ r->callback = NULL;
r->status = REQ_STATUS_IDLE;
if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
p9_idpool_put(tag, c->tagpool);
@@ -427,7 +431,10 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
smp_wmb();
req->status = status;

- wake_up(req->wq);
+ if (req->callback != NULL)
+ req->callback(c, req, status);
+ else
+ wake_up(req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
}
EXPORT_SYMBOL(p9_client_cb);
--
1.9.1

2016-12-08 20:59:13

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 1/5] 9p: add iocb parameter to p9_client_read and p9_client_write

The parameter can be NULL.
Currently not utilized, but it will be used in later patches.

Signed-off-by: Stefano Stabellini <[email protected]>
---
fs/9p/vfs_addr.c | 8 ++++----
fs/9p/vfs_dir.c | 2 +-
fs/9p/vfs_file.c | 4 ++--
fs/9p/xattr.c | 4 ++--
include/net/9p/client.h | 7 +++++--
net/9p/client.c | 6 ++++--
6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad7..99ba284 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -66,7 +66,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)

iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);

- retval = p9_client_read(fid, page_offset(page), &to, &err);
+ retval = p9_client_read(fid, NULL, page_offset(page), &to, &err);
if (err) {
v9fs_uncache_page(inode, page);
retval = err;
@@ -181,7 +181,7 @@ static int v9fs_vfs_writepage_locked(struct page *page)

set_page_writeback(page);

- p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
+ p9_client_write(v9inode->writeback_fid, NULL, page_offset(page), &from, &err);

end_page_writeback(page);
return err;
@@ -251,7 +251,7 @@ static int v9fs_launder_page(struct page *page)
ssize_t n;
int err = 0;
if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
+ n = p9_client_write(file->private_data, iocb, pos, iter, &err);
if (n) {
struct inode *inode = file_inode(file);
loff_t i_size = i_size_read(inode);
@@ -259,7 +259,7 @@ static int v9fs_launder_page(struct page *page)
inode_add_bytes(inode, pos + n - i_size);
}
} else {
- n = p9_client_read(file->private_data, pos, iter, &err);
+ n = p9_client_read(file->private_data, iocb, pos, iter, &err);
}
return n ? n : err;
}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b0405d6..68557e0 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -134,7 +134,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
struct iov_iter to;
int n;
iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
- n = p9_client_read(file->private_data, ctx->pos, &to,
+ n = p9_client_read(file->private_data, NULL, ctx->pos, &to,
&err);
if (err)
return err;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index d7b78d5..79e8c7d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -387,7 +387,7 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
iov_iter_count(to), iocb->ki_pos);

- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
+ ret = p9_client_read(fid, iocb, iocb->ki_pos, to, &err);
if (!ret)
return err;

@@ -416,7 +416,7 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
return retval;

origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
+ retval = p9_client_write(file->private_data, iocb, iocb->ki_pos, from, &err);
if (retval > 0) {
struct inode *inode = file_inode(file);
loff_t i_size;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f329eee..5e137c0 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -48,7 +48,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
retval = -ERANGE;
} else {
iov_iter_truncate(&to, attr_size);
- retval = p9_client_read(attr_fid, 0, &to, &err);
+ retval = p9_client_read(attr_fid, NULL, 0, &to, &err);
if (err)
retval = err;
}
@@ -125,7 +125,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
retval);
else
- p9_client_write(fid, 0, &from, &retval);
+ p9_client_write(fid, NULL, 0, &from, &retval);
p9_client_clunk(fid);
return retval;
}
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index c6b97e5..aef19c6 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -26,6 +26,7 @@
#ifndef NET_9P_CLIENT_H
#define NET_9P_CLIENT_H

+#include <linux/fs.h>
#include <linux/utsname.h>

/* Number of requests per row */
@@ -238,8 +239,10 @@ int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
int p9_client_fsync(struct p9_fid *fid, int datasync);
int p9_client_remove(struct p9_fid *fid);
int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags);
-int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err);
-int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err);
+int p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
+ struct iov_iter *to, int *err);
+int p9_client_write(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
+ struct iov_iter *from, int *err);
int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset);
int p9dirent_read(struct p9_client *clnt, char *buf, int len,
struct p9_dirent *dirent);
diff --git a/net/9p/client.c b/net/9p/client.c
index 3fc94a4..b5ea9a3 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1536,7 +1536,8 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
EXPORT_SYMBOL(p9_client_unlinkat);

int
-p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
+ struct iov_iter *to, int *err)
{
struct p9_client *clnt = fid->clnt;
struct p9_req_t *req;
@@ -1616,7 +1617,8 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
EXPORT_SYMBOL(p9_client_read);

int
-p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+p9_client_write(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
+ struct iov_iter *from, int *err)
{
struct p9_client *clnt = fid->clnt;
struct p9_req_t *req;
--
1.9.1

2016-12-08 20:59:53

by Stefano Stabellini

[permalink] [raw]
Subject: [PATCH 3/5] 9p: introduce p9_client_get_req

Introduce a simple helper function to only prepare a p9 client request,
without any waiting involved.

Currently not utilized, but it will be used by a later patch.

Signed-off-by: Stefano Stabellini <[email protected]>
---
net/9p/client.c | 12 ++++++++++++
1 file changed, 12 insertions(+)

diff --git a/net/9p/client.c b/net/9p/client.c
index bfe1715..eb589ef 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -719,6 +719,18 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
return ERR_PTR(err);
}

+static struct p9_req_t *
+p9_client_get_req(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+ va_list ap;
+ struct p9_req_t *req;
+
+ va_start(ap, fmt);
+ req = p9_client_prepare_req(c, type, c->msize, fmt, ap);
+ va_end(ap);
+ return req;
+}
+
/**
* p9_client_rpc - issue a request and wait for a response
* @c: client session
--
1.9.1

2016-12-09 07:27:35

by Dominique Martinet

[permalink] [raw]
Subject: Re: [V9fs-developer] [PATCH 4/5] 9p: introduce async read requests

Stefano Stabellini wrote on Thu, Dec 08, 2016:
> If the read is an async operation, send a 9p request and return
> EIOCBQUEUED. Do not wait for completion.
>
> Complete the read operation from a callback instead.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> ---
> net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 86 insertions(+), 2 deletions(-)
>
> diff --git a/net/9p/client.c b/net/9p/client.c
> index eb589ef..f9f09db 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -28,6 +28,7 @@
> #include <linux/module.h>
> #include <linux/errno.h>
> #include <linux/fs.h>
> +#include <linux/pagemap.h>
> #include <linux/poll.h>
> #include <linux/idr.h>
> #include <linux/mutex.h>
> @@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
> }
> EXPORT_SYMBOL(p9_client_unlinkat);
>
> +static void
> +p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status)
> +{
> + int err, count, n, i, total = 0;
> + char *dataptr, *to;
> +
> + if (req->status == REQ_STATUS_ERROR) {
> + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
> + err = req->t_err;
> + goto out;
> + }
> + err = p9_check_errors(clnt, req);
> + if (err)
> + goto out;
> +
> + err = p9pdu_readf(req->rc, clnt->proto_version,
> + "D", &count, &dataptr);
> + if (err) {
> + trace_9p_protocol_dump(clnt, req->rc);
> + goto out;
> + }
> + if (!count) {
> + p9_debug(P9_DEBUG_ERROR, "count=%d\n", count);
> + err = 0;
> + goto out;
> + }
> +
> + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
> + if (count > req->rsize)
> + count = req->rsize;
> +
> + for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) {
> + to = kmap(req->pagevec[i]);
> + to += req->offset;
> + n = PAGE_SIZE - req->offset;
> + if (n > count)
> + n = count;
> + memcpy(to, dataptr, n);
> + kunmap(req->pagevec[i]);
> + req->offset = 0;
> + count -= n;
> + total += n;
> + }
> +
> + err = total;
> + req->kiocb->ki_pos += total;
> +
> +out:
> + req->kiocb->ki_complete(req->kiocb, err, 0);
> +
> + release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false);
> + kvfree(req->pagevec);
> + p9_free_req(clnt, req);
> +}
> +
> int
> p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
> struct iov_iter *to, int *err)
> {
> struct p9_client *clnt = fid->clnt;
> struct p9_req_t *req;
> - int total = 0;
> + int total = 0, i;
> *err = 0;
>
> p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
> @@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
> req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
> 0, 11, "dqd", fid->fid,
> offset, rsize);
> - } else {
> + /* sync request */
> + } else if(iocb == NULL || is_sync_kiocb(iocb)) {
> non_zc = 1;
> req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
> rsize);
> + /* async request */
> + } else {

I'm not too familiar with iocb/how async IOs should work, but a logic
question just to make sure that has been thought out:
We prefer zc here to async, even if zc can be slow?

Ideally at some point zc and async aren't exclusive so we'll have async
zc and async normal, but for now I'd say async comes before zc - yes
there will be an extra copy in memory, but it will be done
asynchronously.
Was it intentional to prefer zc here?

> + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
> + if (IS_ERR(req)) {
> + *err = PTR_ERR(req);
> + break;
> + }
> + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
> + (size_t)rsize, &req->offset);
> + req->kiocb = iocb;
> + for (i = 0; i < req->rsize; i += PAGE_SIZE)
> + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
> + req->callback = p9_client_read_complete;
> +
> + *err = clnt->trans_mod->request(clnt, req);
> + if (*err < 0) {
> + clnt->status = Disconnected;
> + release_pages(req->pagevec,
> + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
> + true);
> + kvfree(req->pagevec);
> + p9_free_req(clnt, req);
> + break;
> + }
> +
> + *err = -EIOCBQUEUED;
> + break;

(Just a note to myself (or anyone who wants to do this) that a couple of
places only look at err if size written is 0... They should check if err
has been set)

> }
> if (IS_ERR(req)) {
> *err = PTR_ERR(req);

2016-12-09 07:28:12

by Dominique Martinet

[permalink] [raw]
Subject: Re: [V9fs-developer] [PATCH 2/5] 9p: store req details and callback in struct p9_req_t


Nice. I like the idea of async I/Os :)

Stefano Stabellini wrote on Thu, Dec 08, 2016:
> Add a few fields to struct p9_req_t. Callback is the function which will
> be called upon requestion completion. offset, rsize, pagevec and kiocb
> store important information regarding the read or write request,
> essential to complete the request.
>
> Currently not utilized, but they will be used in a later patch.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> ---
> include/net/9p/client.h | 8 ++++++++
> net/9p/client.c | 9 ++++++++-
> 2 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> index aef19c6..69fc2f0 100644
> --- a/include/net/9p/client.h
> +++ b/include/net/9p/client.h
> @@ -110,6 +110,7 @@ enum p9_req_status_t {
> *
> */
>
> +struct p9_client;
> struct p9_req_t {
> int status;
> int t_err;
> @@ -118,6 +119,13 @@ struct p9_req_t {
> struct p9_fcall *rc;
> void *aux;
>
> + /* Used for async requests */
> + void (*callback)(struct p9_client *c, struct p9_req_t *req, int status);
> + size_t offset;
> + u64 rsize;
> + struct page **pagevec;
> + struct kiocb *kiocb;
> +
> struct list_head req_list;
> };
>
> diff --git a/net/9p/client.c b/net/9p/client.c
> index b5ea9a3..bfe1715 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -405,6 +405,10 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
> int tag = r->tc->tag;
> p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
>
> + r->offset = 0;
> + r->rsize = 0;
> + r->kiocb = NULL;
> + r->callback = NULL;

Probably want to cleanup r->pagevec here too, even if that doesn't seem
to have any implication short-term (e.g. only looked at if callback is
not empty from what I've seen)

> r->status = REQ_STATUS_IDLE;
> if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
> p9_idpool_put(tag, c->tagpool);
> @@ -427,7 +431,10 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
> smp_wmb();
> req->status = status;
>
> - wake_up(req->wq);
> + if (req->callback != NULL)
> + req->callback(c, req, status);
> + else
> + wake_up(req->wq);
> p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
> }
> EXPORT_SYMBOL(p9_client_cb);

Mostly a warning here, but p9_client_cb is called from an interrupt
context in 9P/RDMA.
This has been working up till now because we only do a wake_up and
there's no waiting, but (looking at later patches),
p9_client_read_complete for example does allocations and possibly other
unsafe operations from an interrupt context.

I don't know if the way forward is to move p9_client_cb from that
context or to have the callback be scheduled in a work queue instead;
but we'll need to fix that later.

--
Dominique Martinet

2016-12-09 22:23:01

by Stefano Stabellini

[permalink] [raw]
Subject: Re: [V9fs-developer] [PATCH 4/5] 9p: introduce async read requests

On Fri, 9 Dec 2016, Dominique Martinet wrote:
> Stefano Stabellini wrote on Thu, Dec 08, 2016:
> > If the read is an async operation, send a 9p request and return
> > EIOCBQUEUED. Do not wait for completion.
> >
> > Complete the read operation from a callback instead.
> >
> > Signed-off-by: Stefano Stabellini <[email protected]>
> > ---
> > net/9p/client.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 86 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/9p/client.c b/net/9p/client.c
> > index eb589ef..f9f09db 100644
> > --- a/net/9p/client.c
> > +++ b/net/9p/client.c
> > @@ -28,6 +28,7 @@
> > #include <linux/module.h>
> > #include <linux/errno.h>
> > #include <linux/fs.h>
> > +#include <linux/pagemap.h>
> > #include <linux/poll.h>
> > #include <linux/idr.h>
> > #include <linux/mutex.h>
> > @@ -1554,13 +1555,68 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
> > }
> > EXPORT_SYMBOL(p9_client_unlinkat);
> >
> > +static void
> > +p9_client_read_complete(struct p9_client *clnt, struct p9_req_t *req, int status)
> > +{
> > + int err, count, n, i, total = 0;
> > + char *dataptr, *to;
> > +
> > + if (req->status == REQ_STATUS_ERROR) {
> > + p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
> > + err = req->t_err;
> > + goto out;
> > + }
> > + err = p9_check_errors(clnt, req);
> > + if (err)
> > + goto out;
> > +
> > + err = p9pdu_readf(req->rc, clnt->proto_version,
> > + "D", &count, &dataptr);
> > + if (err) {
> > + trace_9p_protocol_dump(clnt, req->rc);
> > + goto out;
> > + }
> > + if (!count) {
> > + p9_debug(P9_DEBUG_ERROR, "count=%d\n", count);
> > + err = 0;
> > + goto out;
> > + }
> > +
> > + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
> > + if (count > req->rsize)
> > + count = req->rsize;
> > +
> > + for (i = 0; i < ((req->rsize + PAGE_SIZE - 1) / PAGE_SIZE); i++) {
> > + to = kmap(req->pagevec[i]);
> > + to += req->offset;
> > + n = PAGE_SIZE - req->offset;
> > + if (n > count)
> > + n = count;
> > + memcpy(to, dataptr, n);
> > + kunmap(req->pagevec[i]);
> > + req->offset = 0;
> > + count -= n;
> > + total += n;
> > + }
> > +
> > + err = total;
> > + req->kiocb->ki_pos += total;
> > +
> > +out:
> > + req->kiocb->ki_complete(req->kiocb, err, 0);
> > +
> > + release_pages(req->pagevec, (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE, false);
> > + kvfree(req->pagevec);
> > + p9_free_req(clnt, req);
> > +}
> > +
> > int
> > p9_client_read(struct p9_fid *fid, struct kiocb *iocb, u64 offset,
> > struct iov_iter *to, int *err)
> > {
> > struct p9_client *clnt = fid->clnt;
> > struct p9_req_t *req;
> > - int total = 0;
> > + int total = 0, i;
> > *err = 0;
> >
> > p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
> > @@ -1587,10 +1643,38 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
> > req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
> > 0, 11, "dqd", fid->fid,
> > offset, rsize);
> > - } else {
> > + /* sync request */
> > + } else if(iocb == NULL || is_sync_kiocb(iocb)) {
> > non_zc = 1;
> > req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
> > rsize);
> > + /* async request */
> > + } else {
>
> I'm not too familiar with iocb/how async IOs should work, but a logic
> question just to make sure that has been thought out:
> We prefer zc here to async, even if zc can be slow?
>
> Ideally at some point zc and async aren't exclusive so we'll have async
> zc and async normal, but for now I'd say async comes before zc - yes
> there will be an extra copy in memory, but it will be done
> asynchronously.
> Was it intentional to prefer zc here?

I wasn't sure what to do about zc. The backends I am testing with don't
support zc, so I didn't feel confident in changing its behavior. I think
whether zc is faster than async+copy depends on the specific benchmark.
iodepth and blocksize parameters in fio, for example. With iodepth=1, zc
would be faster, the higher the iodepth, the faster async+copy would
become in comparison. At some point async+copy will be faster than zc,
but I am not sure where is the threshold, it would probably be storage
backend dependent too. Maybe around iodepth=3. This is a reasonable
guess but I haven't run any numbers to confirm it.

That said, I am happy to follow any strategy you suggest in regards to zc.


> > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
> > + if (IS_ERR(req)) {
> > + *err = PTR_ERR(req);
> > + break;
> > + }
> > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
> > + (size_t)rsize, &req->offset);
> > + req->kiocb = iocb;
> > + for (i = 0; i < req->rsize; i += PAGE_SIZE)
> > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
> > + req->callback = p9_client_read_complete;
> > +
> > + *err = clnt->trans_mod->request(clnt, req);
> > + if (*err < 0) {
> > + clnt->status = Disconnected;
> > + release_pages(req->pagevec,
> > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
> > + true);
> > + kvfree(req->pagevec);
> > + p9_free_req(clnt, req);
> > + break;
> > + }
> > +
> > + *err = -EIOCBQUEUED;
> > + break;
>
> (Just a note to myself (or anyone who wants to do this) that a couple of
> places only look at err if size written is 0... They should check if err
> has been set)
>
> > }
> > if (IS_ERR(req)) {
> > *err = PTR_ERR(req);
>

2016-12-09 23:24:39

by Stefano Stabellini

[permalink] [raw]
Subject: Re: [V9fs-developer] [PATCH 2/5] 9p: store req details and callback in struct p9_req_t

On Fri, 9 Dec 2016, Dominique Martinet wrote:
> Nice. I like the idea of async I/Os :)
>
> Stefano Stabellini wrote on Thu, Dec 08, 2016:
> > Add a few fields to struct p9_req_t. Callback is the function which will
> > be called upon requestion completion. offset, rsize, pagevec and kiocb
> > store important information regarding the read or write request,
> > essential to complete the request.
> >
> > Currently not utilized, but they will be used in a later patch.
> >
> > Signed-off-by: Stefano Stabellini <[email protected]>
> > ---
> > include/net/9p/client.h | 8 ++++++++
> > net/9p/client.c | 9 ++++++++-
> > 2 files changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> > index aef19c6..69fc2f0 100644
> > --- a/include/net/9p/client.h
> > +++ b/include/net/9p/client.h
> > @@ -110,6 +110,7 @@ enum p9_req_status_t {
> > *
> > */
> >
> > +struct p9_client;
> > struct p9_req_t {
> > int status;
> > int t_err;
> > @@ -118,6 +119,13 @@ struct p9_req_t {
> > struct p9_fcall *rc;
> > void *aux;
> >
> > + /* Used for async requests */
> > + void (*callback)(struct p9_client *c, struct p9_req_t *req, int status);
> > + size_t offset;
> > + u64 rsize;
> > + struct page **pagevec;
> > + struct kiocb *kiocb;
> > +
> > struct list_head req_list;
> > };
> >
> > diff --git a/net/9p/client.c b/net/9p/client.c
> > index b5ea9a3..bfe1715 100644
> > --- a/net/9p/client.c
> > +++ b/net/9p/client.c
> > @@ -405,6 +405,10 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
> > int tag = r->tc->tag;
> > p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
> >
> > + r->offset = 0;
> > + r->rsize = 0;
> > + r->kiocb = NULL;
> > + r->callback = NULL;
>
> Probably want to cleanup r->pagevec here too, even if that doesn't seem
> to have any implication short-term (e.g. only looked at if callback is
> not empty from what I've seen)

Thanks, I missed it.


> > r->status = REQ_STATUS_IDLE;
> > if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
> > p9_idpool_put(tag, c->tagpool);
> > @@ -427,7 +431,10 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
> > smp_wmb();
> > req->status = status;
> >
> > - wake_up(req->wq);
> > + if (req->callback != NULL)
> > + req->callback(c, req, status);
> > + else
> > + wake_up(req->wq);
> > p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
> > }
> > EXPORT_SYMBOL(p9_client_cb);
>
> Mostly a warning here, but p9_client_cb is called from an interrupt
> context in 9P/RDMA.
> This has been working up till now because we only do a wake_up and
> there's no waiting, but (looking at later patches),
> p9_client_read_complete for example does allocations and possibly other
> unsafe operations from an interrupt context.
>
> I don't know if the way forward is to move p9_client_cb from that
> context or to have the callback be scheduled in a work queue instead;
> but we'll need to fix that later.

Either would work. It might be simpler to have the callback run as a
work queue. I'll make the change. Maybe I'll use kiocb to figure out if
we have to schedule_work.

2016-12-10 01:51:16

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 4/5] 9p: introduce async read requests

On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote:


> + } else {
> + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
> + if (IS_ERR(req)) {
> + *err = PTR_ERR(req);
> + break;
> + }
> + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
> + (size_t)rsize, &req->offset);
> + req->kiocb = iocb;
> + for (i = 0; i < req->rsize; i += PAGE_SIZE)
> + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
> + req->callback = p9_client_read_complete;
> +
> + *err = clnt->trans_mod->request(clnt, req);
> + if (*err < 0) {
> + clnt->status = Disconnected;
> + release_pages(req->pagevec,
> + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
> + true);
> + kvfree(req->pagevec);
> + p9_free_req(clnt, req);
> + break;
> + }
> +
> + *err = -EIOCBQUEUED;

IDGI. AFAICS, your code will result in shitloads of short reads - every
time when you give it a multi-iovec array, only the first one will be
issued and the rest won't be even looked at. Sure, it is technically
legal, but I very much doubt that aio users will be happy with that.

What am I missing here?

2016-12-13 01:16:07

by Stefano Stabellini

[permalink] [raw]
Subject: Re: [PATCH 4/5] 9p: introduce async read requests

Hi Al,
thanks for looking at the patch.

On Sat, 10 Dec 2016, Al Viro wrote:
> On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote:
>
>
> > + } else {
> > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
> > + if (IS_ERR(req)) {
> > + *err = PTR_ERR(req);
> > + break;
> > + }
> > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
> > + (size_t)rsize, &req->offset);
> > + req->kiocb = iocb;
> > + for (i = 0; i < req->rsize; i += PAGE_SIZE)
> > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
> > + req->callback = p9_client_read_complete;
> > +
> > + *err = clnt->trans_mod->request(clnt, req);
> > + if (*err < 0) {
> > + clnt->status = Disconnected;
> > + release_pages(req->pagevec,
> > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
> > + true);
> > + kvfree(req->pagevec);
> > + p9_free_req(clnt, req);
> > + break;
> > + }
> > +
> > + *err = -EIOCBQUEUED;
>
> IDGI. AFAICS, your code will result in shitloads of short reads - every
> time when you give it a multi-iovec array, only the first one will be
> issued and the rest won't be even looked at. Sure, it is technically
> legal, but I very much doubt that aio users will be happy with that.
>
> What am I missing here?

There is a problem with short reads, but I don't think it is the one
you described, unless I made a mistake somewhere.

The code above will issue one request as big as possible (size is
limited by clnt->msize, which is the size of the channel). No matter how
many segs in the iov_iter, the request will cover the maximum amount of
bytes allowed by the channel, typically 4K but can be larger. So
multi-iovec arrays should be handled correctly up to msize. Please let
me know if I am wrong, I am not an expert on this. I tried, but couldn't
actually get any multi-iovec arrays in my tests.


That said, reading the code again, there are indeed two possible causes
of short reads. The first one are responses which have a smaller byte
count than requested. Currently this case is not handled, forwarding
the short read up to the user. But I wrote and tested successfully a
patch that issues another follow-up request from the completion
function. Example:
p9_client_read, issue request, size 4K
p9_client_read_complete, receive response, count 2K
p9_client_read_complete, issue request size 4K-2K = 2K
p9_client_read_complete, receive response size 2K
p9_client_read_complete, call ki_complete


The second possible cause of short reads are requests which are bigger
than msize. For example a 2MB read over a 4K channel. In this patch we
simply issue one request as big as msize, and eventually return to the
caller a smaller byte count. One option is to simply fall back to sync
IO in these cases. Another solution is to use the same technique
described above: we issue the first request as big as msize, then, from
the callback function, we issue a follow-up request, and again, and
again, until we fully complete the large read. This way, although we are
not issuing any simultaneous requests for a specific large read, at
least we can issue other aio requests in parallel for other reads or
writes. It should still be more performant than sync IO.

I am thinking of implementing this second option in the next version of
the series.

2016-12-13 14:29:37

by Latchesar Ionkov

[permalink] [raw]
Subject: Re: [PATCH 4/5] 9p: introduce async read requests

If the user asked for more than msize-iohdrsz (or rather iounit)
bytes, you should do your best to read as much as possible before you
return to user space. That means that if a read returns the maximum
possible count you HAVE to issue another read until you get a short
read.

What Al is hinting at is that you can issue multiple read requests
simultaneously, assuming that most of them will return data.

So the way I see your options are two, with an additional tweak. The
first option is to issue more read calls when the previous ones
complete (your second option). The second option is at the beginning
to issue all the read calls simultaneously. And the tweak is to do
something in between and issue up to a limit of simultaneous read
calls.

Thanks,
Lucho

On Mon, Dec 12, 2016 at 6:15 PM, Stefano Stabellini
<[email protected]> wrote:
> Hi Al,
> thanks for looking at the patch.
>
> On Sat, 10 Dec 2016, Al Viro wrote:
>> On Thu, Dec 08, 2016 at 12:59:05PM -0800, Stefano Stabellini wrote:
>>
>>
>> > + } else {
>> > + req = p9_client_get_req(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize);
>> > + if (IS_ERR(req)) {
>> > + *err = PTR_ERR(req);
>> > + break;
>> > + }
>> > + req->rsize = iov_iter_get_pages_alloc(to, &req->pagevec,
>> > + (size_t)rsize, &req->offset);
>> > + req->kiocb = iocb;
>> > + for (i = 0; i < req->rsize; i += PAGE_SIZE)
>> > + page_cache_get_speculative(req->pagevec[i/PAGE_SIZE]);
>> > + req->callback = p9_client_read_complete;
>> > +
>> > + *err = clnt->trans_mod->request(clnt, req);
>> > + if (*err < 0) {
>> > + clnt->status = Disconnected;
>> > + release_pages(req->pagevec,
>> > + (req->rsize + PAGE_SIZE - 1) / PAGE_SIZE,
>> > + true);
>> > + kvfree(req->pagevec);
>> > + p9_free_req(clnt, req);
>> > + break;
>> > + }
>> > +
>> > + *err = -EIOCBQUEUED;
>>
>> IDGI. AFAICS, your code will result in shitloads of short reads - every
>> time when you give it a multi-iovec array, only the first one will be
>> issued and the rest won't be even looked at. Sure, it is technically
>> legal, but I very much doubt that aio users will be happy with that.
>>
>> What am I missing here?
>
> There is a problem with short reads, but I don't think it is the one
> you described, unless I made a mistake somewhere.
>
> The code above will issue one request as big as possible (size is
> limited by clnt->msize, which is the size of the channel). No matter how
> many segs in the iov_iter, the request will cover the maximum amount of
> bytes allowed by the channel, typically 4K but can be larger. So
> multi-iovec arrays should be handled correctly up to msize. Please let
> me know if I am wrong, I am not an expert on this. I tried, but couldn't
> actually get any multi-iovec arrays in my tests.
>
>
> That said, reading the code again, there are indeed two possible causes
> of short reads. The first one are responses which have a smaller byte
> count than requested. Currently this case is not handled, forwarding
> the short read up to the user. But I wrote and tested successfully a
> patch that issues another follow-up request from the completion
> function. Example:
> p9_client_read, issue request, size 4K
> p9_client_read_complete, receive response, count 2K
> p9_client_read_complete, issue request size 4K-2K = 2K
> p9_client_read_complete, receive response size 2K
> p9_client_read_complete, call ki_complete
>
>
> The second possible cause of short reads are requests which are bigger
> than msize. For example a 2MB read over a 4K channel. In this patch we
> simply issue one request as big as msize, and eventually return to the
> caller a smaller byte count. One option is to simply fall back to sync
> IO in these cases. Another solution is to use the same technique
> described above: we issue the first request as big as msize, then, from
> the callback function, we issue a follow-up request, and again, and
> again, until we fully complete the large read. This way, although we are
> not issuing any simultaneous requests for a specific large read, at
> least we can issue other aio requests in parallel for other reads or
> writes. It should still be more performant than sync IO.
>
> I am thinking of implementing this second option in the next version of
> the series.