2002-10-15 19:13:51

by Christoph Hellwig

[permalink] [raw]
Subject: [RFC] iovec in ->aio_read/->aio_write

I've recently looked into implementing the aio read and write
methods for XFS. Although all of read/write readv/writev
and aio_read/aio_write end up calling the exactly same code
in filemap.c (for the generic filesystem I/O code). Filesystems
like XFS that need additional code before calling the generic
functionality have to duplicated it though.

I don't think it makes sense to keep all those interfaces. As
the read/write entry points are used by most drivers I suggest
starting with the other two, that have far less users. The
patch below (compiled but not booted!) changes the aio_read/
aio_write interface to take the same array of iovec like
readv/write and updates all users. Note that we don't support
vectored I/O for the aio interface yet, but it seems like a
logical addition to me.

Proposed next steps: Convert over all readv/writev users
to aio_read/aio_write and remove the methods. Implement
aio_read/aio_write in all filesystems using the generic
pagecache code and kill the "normal" generic_file_read
and generic_file_write.

Comments?

--- 1.22/fs/aio.c Sun Oct 13 17:39:40 2002
+++ edited/fs/aio.c Tue Oct 15 20:45:37 2002
@@ -987,6 +987,28 @@
return -EINVAL;
}

+ssize_t aio_read_single(struct kiocb *iocb, char *buf,
+ size_t count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+
+ iocb->ki_single.iov_base = buf;
+ iocb->ki_single.iov_len = count;
+
+ return file->f_op->aio_read(iocb, &iocb->ki_single, 1, iocb->ki_pos);
+}
+
+ssize_t aio_write_single(struct kiocb *iocb, const char *buf,
+ size_t count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+
+ iocb->ki_single.iov_base = (void *)buf;
+ iocb->ki_single.iov_len = count;
+
+ return file->f_op->aio_write(iocb, &iocb->ki_single, 1, iocb->ki_pos);
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -1048,7 +1070,7 @@
goto out_put_req;
ret = -EINVAL;
if (file->f_op->aio_read)
- ret = file->f_op->aio_read(req, buf,
+ ret = aio_read_single(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
case IOCB_CMD_PWRITE:
@@ -1060,7 +1082,7 @@
goto out_put_req;
ret = -EINVAL;
if (file->f_op->aio_write)
- ret = file->f_op->aio_write(req, buf,
+ ret = aio_write_single(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
case IOCB_CMD_FDSYNC:
--- 1.19/fs/read_write.c Thu Oct 10 23:36:26 2002
+++ edited/fs/read_write.c Tue Oct 15 20:44:24 2002
@@ -184,7 +184,7 @@

init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos);
+ ret = aio_read_single(&kiocb, buf, len, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -224,7 +224,7 @@

init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos);
+ ret = aio_write_single(&kiocb, buf, len, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -340,6 +340,37 @@
}
return seg;
}
+
+ssize_t do_sync_writev(struct file *filp, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ ret = filp->f_op->aio_write(&kiocb, iov, nr_segs, kiocb.ki_pos);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+ssize_t do_sync_readv(struct file *filp, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ ret = filp->f_op->aio_write(&kiocb, iov, nr_segs, kiocb.ki_pos);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+

static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec * vector,
--- 1.9/fs/ext3/file.c Wed Oct 9 20:32:29 2002
+++ edited/fs/ext3/file.c Tue Oct 15 20:46:54 2002
@@ -61,7 +61,8 @@
*/

static ssize_t
-ext3_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
+ext3_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode;
@@ -76,15 +77,15 @@
if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
mark_inode_dirty(inode);

- return generic_file_aio_write(iocb, buf, count, pos);
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}

struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ext3_file_aio_write,
.readv = generic_file_readv,
.writev = generic_file_writev,
.ioctl = ext3_ioctl,
--- 1.21/fs/nfs/file.c Tue Oct 8 23:37:02 2002
+++ edited/fs/nfs/file.c Tue Oct 15 20:55:56 2002
@@ -35,8 +35,10 @@
#define NFSDBG_FACILITY NFSDBG_FILE

static int nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_read(struct kiocb *, char *, size_t, loff_t);
-static ssize_t nfs_file_write(struct kiocb *, const char *, size_t, loff_t);
+static ssize_t nfs_file_aio_read(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+static ssize_t nfs_file_aio_write(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
static int nfs_file_flush(struct file *);
static int nfs_fsync(struct file *, struct dentry *dentry, int datasync);

@@ -44,8 +46,8 @@
.llseek = remote_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .aio_read = nfs_file_aio_read,
+ .aio_write = nfs_file_aio_write,
.mmap = nfs_file_mmap,
.open = nfs_open,
.flush = nfs_file_flush,
@@ -91,19 +93,20 @@
}

static ssize_t
-nfs_file_read(struct kiocb *iocb, char * buf, size_t count, loff_t pos)
+nfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;

- dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+ dfprintk(VFS, "nfs: read(%s/%s, %lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) count, (unsigned long) pos);
+ (unsigned long)pos);

result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (!result)
- result = generic_file_aio_read(iocb, buf, count, pos);
+ result = generic_file_aio_read(iocb, iov, nr_segs, pos);
return result;
}

@@ -211,15 +214,16 @@
* Write to a file (through the page cache).
*/
static ssize_t
-nfs_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
+nfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;

- dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n",
+ dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- inode->i_ino, (unsigned long) count, (unsigned long) pos);
+ inode->i_ino, (unsigned long) pos);

result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -228,11 +232,7 @@
if (result)
goto out;

- result = count;
- if (!count)
- goto out;
-
- result = generic_file_aio_write(iocb, buf, count, pos);
+ result = generic_file_aio_write(iocb, iov, nr_segs, pos);
out:
return result;

--- 1.6/include/linux/aio.h Thu Oct 3 22:19:27 2002
+++ edited/include/linux/aio.h Tue Oct 15 20:38:00 2002
@@ -2,6 +2,7 @@
#define __LINUX__AIO_H

#include <linux/list.h>
+#include <linux/uio.h>
#include <linux/workqueue.h>
#include <linux/aio_abi.h>

@@ -60,6 +61,7 @@
* for cancellation */

void *ki_user_obj; /* pointer to userland's iocb */
+ struct iovec ki_single; /* iovec for non-vectored I/O */
__u64 ki_user_data; /* user's data for completion */
loff_t ki_pos;

@@ -145,6 +147,10 @@
extern void FASTCALL(kick_iocb(struct kiocb *iocb));
extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2));
extern void FASTCALL(__put_ioctx(struct kioctx *ctx));
+extern ssize_t aio_read_single(struct kiocb *iocb, char *buf,
+ size_t count, loff_t pos);
+extern ssize_t aio_write_single(struct kiocb *iocb, const char *buf,
+ size_t count, loff_t pos);
struct mm_struct;
extern void FASTCALL(exit_aio(struct mm_struct *mm));

--- 1.170/include/linux/fs.h Fri Oct 11 10:49:46 2002
+++ edited/include/linux/fs.h Tue Oct 15 20:37:27 2002
@@ -744,9 +744,11 @@
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char *, size_t, loff_t *);
- ssize_t (*aio_read) (struct kiocb *, char *, size_t, loff_t);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
- ssize_t (*aio_write) (struct kiocb *, const char *, size_t, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
@@ -1242,8 +1244,10 @@
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
-extern ssize_t generic_file_aio_read(struct kiocb *, char *, size_t, loff_t);
-extern ssize_t generic_file_aio_write(struct kiocb *, const char *, size_t, loff_t);
+extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
extern ssize_t do_sync_read(struct file *filp, char *buf, size_t len, loff_t *ppos);
extern ssize_t do_sync_write(struct file *filp, const char *buf, size_t len, loff_t *ppos);
ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
--- 1.23/include/net/sock.h Fri Oct 11 01:14:45 2002
+++ edited/include/net/sock.h Tue Oct 15 20:06:13 2002
@@ -303,7 +303,6 @@
struct socket *sock;
struct sock *sk;
struct msghdr *msg, async_msg;
- struct iovec async_iov;
struct scm_cookie *scm, async_scm;
};

--- 1.147/mm/filemap.c Sun Oct 13 17:39:40 2002
+++ edited/mm/filemap.c Tue Oct 15 20:41:07 2002
@@ -884,12 +884,11 @@
}

ssize_t
-generic_file_aio_read(struct kiocb *iocb, char *buf, size_t count, loff_t pos)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-
BUG_ON(iocb->ki_pos != pos);
- return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
+ return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos);
}

ssize_t
@@ -1645,10 +1644,10 @@
return err;
}

-ssize_t generic_file_aio_write(struct kiocb *iocb, const char *buf,
- size_t count, loff_t pos)
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- return generic_file_write(iocb->ki_filp, buf, count, &iocb->ki_pos);
+ return generic_file_writev(iocb->ki_filp, iov, nr_segs, &iocb->ki_pos);
}

ssize_t generic_file_write(struct file *file, const char *buf,
--- 1.30/net/socket.c Sat Oct 12 09:37:17 2002
+++ edited/net/socket.c Tue Oct 15 21:03:19 2002
@@ -90,10 +90,10 @@
#include <linux/netfilter.h>

static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
-static ssize_t sock_aio_read(struct kiocb *iocb, char *buf,
- size_t size, loff_t pos);
-static ssize_t sock_aio_write(struct kiocb *iocb, const char *buf,
- size_t size, loff_t pos);
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
static int sock_mmap(struct file *file, struct vm_area_struct * vma);

static int sock_close(struct inode *inode, struct file *file);
@@ -586,31 +586,31 @@
* area ubuf...ubuf+size-1 is writable before asking the protocol.
*/

-static ssize_t sock_aio_read(struct kiocb *iocb, char *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct sock_iocb *x = kiocb_to_siocb(iocb);
struct socket *sock;
int flags;
+ size_t tot_len = 0;
+ int i;

if (pos != 0)
return -ESPIPE;
- if (size==0) /* Match SYS5 behaviour */
- return 0;

sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
+ x->async_msg.msg_iov = (struct iovec *)iov;
+ x->async_msg.msg_iovlen = nr_segs;
x->async_msg.msg_control = NULL;
x->async_msg.msg_controllen = 0;
- x->async_iov.iov_base = ubuf;
- x->async_iov.iov_len = size;
flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;

- return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags);
+ for (i = 0 ; i < nr_segs ; i++)
+ tot_len += iov[i].iov_len;
+ return __sock_recvmsg(iocb, sock, &x->async_msg, flags, tot_len);
}


@@ -619,32 +619,32 @@
* is readable by the user process.
*/

-static ssize_t sock_aio_write(struct kiocb *iocb, const char *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct sock_iocb *x = kiocb_to_siocb(iocb);
struct socket *sock;
+ size_t tot_len = 0;
+ int i;

if (pos != 0)
return -ESPIPE;
- if(size==0) /* Match SYS5 behaviour */
- return 0;

sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
+ x->async_msg.msg_iov = (struct iovec *)iov;
+ x->async_msg.msg_iovlen = nr_segs;
x->async_msg.msg_control = NULL;
x->async_msg.msg_controllen = 0;
x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
x->async_msg.msg_flags |= MSG_EOR;
- x->async_iov.iov_base = (void *)ubuf;
- x->async_iov.iov_len = size;

- return __sock_sendmsg(iocb, sock, &x->async_msg, size);
+ for (i = 0 ; i < nr_segs ; i++)
+ tot_len += iov[i].iov_len;
+ return __sock_sendmsg(iocb, sock, &x->async_msg, tot_len);
}

ssize_t sock_sendpage(struct file *file, struct page *page,


2002-10-15 19:28:42

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write

On Tue, Oct 15, 2002 at 10:33:15PM -0400, Christoph Hellwig wrote:
> Proposed next steps: Convert over all readv/writev users
> to aio_read/aio_write and remove the methods. Implement
> aio_read/aio_write in all filesystems using the generic
> pagecache code and kill the "normal" generic_file_read
> and generic_file_write.
>
> Comments?

Please not right now? ;-) At least introduce it as aio_readv/aio_writev
as currently the way we deal with iovecs uglifies a lot of code, with no
benefit for the vast majority of applications. As for killing
generic_file_read/write and using the aio counterparts, that is the plan
pending a bit more testing of things. I want to get there step by step
without breaking everything along the way.

-ben

2002-10-16 06:46:04

by Janet Morgan

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write

>
> On Tue, Oct 15, 2002 at 10:33:15PM -0400, Christoph Hellwig wrote:
> > Proposed next steps: Convert over all readv/writev users
> > to aio_read/aio_write and remove the methods. Implement
> > aio_read/aio_write in all filesystems using the generic
> > pagecache code and kill the "normal" generic_file_read
> > and generic_file_write.
> >
> > Comments?
>
> Please not right now? ;-) At least introduce it as aio_readv/aio_writev
> as currently the way we deal with iovecs uglifies a lot of code, with no
> benefit for the vast majority of applications. As for killing
> generic_file_read/write and using the aio counterparts, that is the plan
> pending a bit more testing of things. I want to get there step by step
> without breaking everything along the way.
>
> -ben

Here's a patch for aio readv/writev support. Basically it adds:

- two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
- a field to the iocb for the user vector
- aio_readv/writev methods to the file_operations structure
- routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.

I tested this using the aio dio patch that Badari submitted a while back.
I compared:
readv/writev io_submit for a vector of N iovecs
vs read/write io_submit for N iocbs.

My performance data is only preliminary at this point, but aio readv/writev
appears to outperform aio read/write -- twice as fast in some cases. The
results generally make sense to me: while there is only one io_submit in both
cases, aio readv/writev shortens codepath (one instead of N calls to the
underlying filesystem routine) and should normally result in fewer
bios/callbacks (at least for direct-io). As importantly, aio readv/writev
in my testing also reduces the number of (system) calls to io_getevents.

-Janet


fs/aio.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/aio_abi.h | 12 +++++
include/linux/fs.h | 2
3 files changed, 124 insertions(+)

--- linux-2.5.42/include/linux/aio_abi.h Fri Oct 11 21:21:33 2002
+++ aiov/include/linux/aio_abi.h Tue Oct 15 22:19:05 2002
@@ -41,6 +41,8 @@
* IOCB_CMD_POLL = 5,
*/
IOCB_CMD_NOOP = 6,
+ IOCB_CMD_PREADV = 7,
+ IOCB_CMD_PWRITEV = 8,
};

/* read() from /dev/aio returns these structures. */
@@ -65,6 +67,12 @@
* proper padding and aio_error abstraction
*/

+struct io_iocb_vector {
+ const struct iovec *vec;
+ __s32 nr;
+ __s64 offset;
+};
+
struct iocb {
/* these are internal to the kernel/libc. */
__u64 aio_data; /* data to be returned in event's data */
@@ -80,6 +88,10 @@
__u64 aio_nbytes;
__s64 aio_offset;

+ union {
+ struct io_iocb_vector v;
+ } u;
+
/* extra parameters */
__u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
__u64 aio_reserved3;
--- linux-2.5.42/include/linux/fs.h Fri Oct 11 21:21:35 2002
+++ aiov/include/linux/fs.h Tue Oct 15 22:14:19 2002
@@ -763,6 +763,8 @@
ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long , loff_t *);
+ ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t *);
};

struct inode_operations {
--- linux-2.5.42/fs/aio.c Fri Oct 11 21:22:07 2002
+++ aiov/fs/aio.c Tue Oct 15 22:37:29 2002
@@ -33,6 +33,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
+#include <linux/dnotify.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -992,6 +993,97 @@
return -EINVAL;
}

+#define iov(iocb) iocb->u.v.vec
+#define nr_segs(iocb) iocb->u.v.nr
+
+static ssize_t io_readv_writev(int type, struct kiocb *req,
+ const struct iovec * vector, unsigned long nr_segs, loff_t pos)
+{
+ size_t tot_len;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov=iovstack;
+ ssize_t ret;
+ int seg;
+ struct file *file = req->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ ret = 0;
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
+ goto out;
+
+ if (nr_segs > UIO_FASTIOV) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (!iov)
+ goto out;
+ }
+
+ if (copy_from_user(iov, vector, nr_segs*sizeof(*vector))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 or if
+ * the total length does not fit a ssize_t.
+ *
+ * Be careful here because iov_len is a size_t not a ssize_t.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0 ; seg < nr_segs; seg++) {
+ ssize_t tmp = tot_len;
+ ssize_t len = (ssize_t)iov[seg].iov_len;
+ void * base = iov[seg].iov_base;
+
+ if (len < 0) /* size_t not fitting an ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the ssize_t */
+ goto out;
+
+ if (unlikely(!access_ok(type == IOCB_CMD_PREAD ?
+ VERIFY_WRITE : VERIFY_READ, base, len))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ /* return zero if total length is zero */
+ if (tot_len == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = locks_verify_area((type == IOCB_CMD_PREADV
+ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE),
+ inode, file, file->f_pos, tot_len);
+ if (ret)
+ goto out;
+
+ if (type == IOCB_CMD_PREADV)
+ ret = file->f_op->aio_readv(req, iov, nr_segs, &pos);
+ else
+ ret = file->f_op->aio_writev(req, iov, nr_segs, &pos);
+
+out:
+ if (iov != iovstack)
+ kfree(iov);
+ if ((ret + (type == IOCB_CMD_PREADV)) > 0)
+ dnotify_parent(file->f_dentry,
+ (type == IOCB_CMD_PREADV) ? DN_MODIFY : DN_ACCESS);
+ return ret;
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -1068,6 +1160,24 @@
ret = file->f_op->aio_write(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
+ case IOCB_CMD_PREADV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_readv)
+ ret = io_readv_writev(IOCB_CMD_PREADV, req,
+ iov(iocb), nr_segs(iocb), iocb->aio_offset);
+ break;
+ case IOCB_CMD_PWRITEV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_writev)
+ ret = io_readv_writev(IOCB_CMD_PWRITEV, req,
+ iov(iocb), nr_segs(iocb), iocb->aio_offset);
+ break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
if (file->f_op->aio_fsync)

2002-10-16 13:44:51

by Shailabh Nagar

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write


Janet Morgan wrote:

> Here's a patch for aio readv/writev support. Basically it adds:
>
> - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> - a field to the iocb for the user vector
> - aio_readv/writev methods to the file_operations structure

I presume f_op->aio_readv could point to __generic_file_aio_read for most
filesystems.

Would f_op->aio_writev need a new wrapper function for 2.5.42 ?
f_op->aio_write eventually calls generic_file_write which uses a different inode
from generic_file_writev. So f_op->aio_writev might need to point to a function
like generic_file_writev but using the same inode as generic_file_write.


> - routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.
>
> I tested this using the aio dio patch that Badari submitted a while back.
> I compared:
> readv/writev io_submit for a vector of N iovecs
> vs read/write io_submit for N iocbs.
>
> My performance data is only preliminary at this point, but aio readv/writev
> appears to outperform aio read/write -- twice as fast in some cases. The
> results generally make sense to me: while there is only one io_submit in both
> cases, aio readv/writev shortens codepath (one instead of N calls to the
> underlying filesystem routine) and should normally result in fewer

Twice as fast looks good !

> bios/callbacks (at least for direct-io). As importantly, aio readv/writev
> in my testing also reduces the number of (system) calls to io_getevents.

It would be interesting to see the performance boost when <iov length> events
are retrieved at once, using the min_nr parameter of io_getevents.


--Shailabh

2002-10-16 14:36:26

by Helen Pang

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write


Shailabh Nagar wrote:

> It would be interesting to see the performance boost when <iov length>
events
> are retrieved at once, using the min_nr parameter of io_getevents

My experience is that specified minimum number of events (min_nr) of
io_getevents is not quite working yet in kernel 2.4.19.
I haven't started to exercise this in 2.5.42, but if it is working,
logically it will help the performance indeed.

-Helen






Shailabh Nagar <[email protected]>@kvack.org on 10/16/2002 08:41:03 AM

Sent by: [email protected]


To: [email protected]
cc: Benjamin LaHaise <[email protected]>, Christoph Hellwig <[email protected]>,
[email protected], [email protected],
[email protected], [email protected]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write



Janet Morgan wrote:

> Here's a patch for aio readv/writev support. Basically it adds:
>
> - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> - a field to the iocb for the user vector
> - aio_readv/writev methods to the file_operations structure

I presume f_op->aio_readv could point to __generic_file_aio_read for most
filesystems.

Would f_op->aio_writev need a new wrapper function for 2.5.42 ?
f_op->aio_write eventually calls generic_file_write which uses a different
inode
from generic_file_writev. So f_op->aio_writev might need to point to a
function
like generic_file_writev but using the same inode as generic_file_write.


> - routine aio.c/io_readv_writev, which borrows heavily from
do_readv_writev.
>
> I tested this using the aio dio patch that Badari submitted a while back.
> I compared:
> readv/writev io_submit for a vector of N iovecs
> vs read/write io_submit for N iocbs.
>
> My performance data is only preliminary at this point, but aio
readv/writev
> appears to outperform aio read/write -- twice as fast in some cases. The
> results generally make sense to me: while there is only one io_submit in
both
> cases, aio readv/writev shortens codepath (one instead of N calls to the
> underlying filesystem routine) and should normally result in fewer

Twice as fast looks good !

> bios/callbacks (at least for direct-io). As importantly, aio
readv/writev
> in my testing also reduces the number of (system) calls to io_getevents.

It would be interesting to see the performance boost when <iov length>
events
are retrieved at once, using the min_nr parameter of io_getevents.


--Shailabh

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to [email protected]. For more info on Linux AIO,
see: http://www.kvack.org/aio/




2002-10-16 15:00:48

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write

On Tue, Oct 15, 2002 at 11:51:21PM -0700, Janet Morgan wrote:
> - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> - a field to the iocb for the user vector
> - aio_readv/writev methods to the file_operations structure
> - routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.

A few comments about the interface to userland: don't add fields to the
iocb, that breaks the abi. Also, the iovec should be pointed to by
iocb->aio_buf, with aio_nbytes containing the length of the iovec (that
avoids the need for an additional field). The structure of the iovec
itself should probably match the iovec struct, but that means we'll need
different opcodes for the 32 bit and 64 bit variants. Alternatively a
new iovec that is the same on both (like the iocb) could be defined, but
that would be inconvenient for userland.

Within the kernel, the loff_t *pos shouldn't be a pointer -- I'm trying to
get rid of extraneous pointers as that means an additional chunk of memory
has to be held over the entirety of the aio request. Similarly, the iovec
passed into the operation itself can never be allocated on the stack. This
probably works for direct io where the iovec gets translated into a scatter
gather list of pages, but wouldn't work for something like networking where
the iovec itself gets used to determine where in the user address space to
copy data (as this can occur after the submit operation has returned).

Aside from those details, I guess if people really need vectored ios it's
okay.

-ben

2002-10-17 11:17:17

by Janet Morgan

[permalink] [raw]
Subject: Re: [RFC] iovec in ->aio_read/->aio_write

>
> On Tue, Oct 15, 2002 at 11:51:21PM -0700, Janet Morgan wrote:
> > - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> > - a field to the iocb for the user vector
> > - aio_readv/writev methods to the file_operations structure
> > - routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.
>
> A few comments about the interface to userland: don't add fields to the
> iocb, that breaks the abi. Also, the iovec should be pointed to by
> iocb->aio_buf, with aio_nbytes containing the length of the iovec (that
> avoids the need for an additional field). The structure of the iovec
> itself should probably match the iovec struct, but that means we'll need
> different opcodes for the 32 bit and 64 bit variants. Alternatively a
> new iovec that is the same on both (like the iocb) could be defined, but
> that would be inconvenient for userland.
>
> Within the kernel, the loff_t *pos shouldn't be a pointer -- I'm trying to
> get rid of extraneous pointers as that means an additional chunk of memory
> has to be held over the entirety of the aio request. Similarly, the iovec
> passed into the operation itself can never be allocated on the stack. This
> probably works for direct io where the iovec gets translated into a scatter
> gather list of pages, but wouldn't work for something like networking where
> the iovec itself gets used to determine where in the user address space to
> copy data (as this can occur after the submit operation has returned).
>
> Aside from those details, I guess if people really need vectored ios it's
> okay.
>
> -ben
>

Thanks for the really helpful comments. I've attached another version
of the patch.

In addition to (hopefully) addressing the issues you raised, I also
removed the calls to locks_verify_area and dnotify_parent from my original
patch. Please let me know if you think I should re-instate those calls or
if you see any new problems.

-Janet


fs/aio.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/aio.h | 1
include/linux/aio_abi.h | 14 ++++
include/linux/fs.h | 2
4 files changed, 173 insertions(+)


--- linux-2.5.43/include/linux/aio_abi.h Tue Oct 15 20:27:18 2002
+++ aiov/include/linux/aio_abi.h Wed Oct 16 23:27:40 2002
@@ -41,6 +41,10 @@
* IOCB_CMD_POLL = 5,
*/
IOCB_CMD_NOOP = 6,
+ IOCB_CMD_PREADV32 = 7,
+ IOCB_CMD_PREADV64 = 8,
+ IOCB_CMD_PWRITEV32 = 9,
+ IOCB_CMD_PWRITEV64 = 10,
};

/* read() from /dev/aio returns these structures. */
@@ -85,6 +89,16 @@
__u64 aio_reserved3;
}; /* 64 bytes */

+struct iovec32 {
+ __u32 iov_base;
+ __u32 iov_len;
+};
+
+struct iovec64 {
+ __u64 iov_base;
+ __u64 iov_len;
+};
+
#undef IFBIG
#undef IFLITTLE

--- linux-2.5.43/include/linux/fs.h Tue Oct 15 20:27:45 2002
+++ aiov/include/linux/fs.h Wed Oct 16 20:41:33 2002
@@ -764,6 +764,8 @@
ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long , loff_t);
+ ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
};

struct inode_operations {
--- linux-2.5.43/include/linux/aio.h Tue Oct 15 20:27:09 2002
+++ aiov/include/linux/aio.h Wed Oct 16 20:41:33 2002
@@ -51,6 +51,7 @@
int ki_users;
unsigned ki_key; /* id of this request */

+ struct iovec *ki_iov; /* io vector for readv/writev */
struct file *ki_filp;
struct kioctx *ki_ctx; /* may be NULL for sync ops */
int (*ki_cancel)(struct kiocb *, struct io_event *);
--- linux-2.5.43/fs/aio.c Tue Oct 15 20:27:57 2002
+++ aiov/fs/aio.c Thu Oct 17 02:40:49 2002
@@ -28,6 +28,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
+#include <linux/dnotify.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -393,6 +394,7 @@
req->ki_users = 1;
req->ki_key = 0;
req->ki_ctx = ctx;
+ req->ki_iov = NULL;
req->ki_cancel = NULL;
req->ki_retry = NULL;
req->ki_user_obj = NULL;
@@ -444,6 +446,9 @@
req->ki_ctx = NULL;
req->ki_filp = NULL;
req->ki_user_obj = NULL;
+ if (req->ki_iov)
+ kfree(req->ki_iov);
+ req->ki_iov = NULL;
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;

@@ -987,6 +992,139 @@
return -EINVAL;
}

+#define _32bit_kiov (sizeof(struct iovec) == sizeof(struct iovec32))
+#define _64bit_kiov (sizeof(struct iovec) == sizeof(struct iovec64))
+
+#define _32bit(cmd) \
+ (((cmd) == IOCB_CMD_PREADV32) || ((cmd) == IOCB_CMD_PWRITEV32))
+#define _64bit(cmd) \
+ (((cmd) == IOCB_CMD_PREADV64) || ((cmd) == IOCB_CMD_PWRITEV64))
+
+#define is_readv(cmd) \
+ (((cmd) == IOCB_CMD_PREADV32) || ((cmd) == IOCB_CMD_PREADV64))
+
+#define can_fast_copy(cmd) \
+ ((_32bit(cmd) && _32bit_kiov) || ((_64bit(cmd) && _64bit_kiov)))
+
+int resize_user_vec(int cmd, struct kiocb *req, const struct iovec *uiov,
+ unsigned long nr_segs)
+{
+ size_t len;
+ struct iovec32 *iov32;
+ struct iovec64 *iov64;
+ struct iovec *kiov = req->ki_iov;
+ int seg, ret = 0;
+
+ /*
+ * copy 32-bit user iovecs to 64-bit kernel iovecs or vice versa
+ */
+ if (_32bit(cmd)) {
+ len = nr_segs*sizeof(struct iovec32);
+ iov32 = kmalloc(len, GFP_KERNEL);
+ if (!iov32)
+ return -ENOMEM;
+ if (copy_from_user(iov32, uiov, len))
+ ret = -EFAULT;
+ else for (seg = 0; seg < nr_segs; seg++) {
+ kiov[seg].iov_base = (void *)iov32[seg].iov_base;
+ kiov[seg].iov_len = iov32[seg].iov_len;
+ }
+ kfree(iov32);
+
+ } else {
+ len = nr_segs*sizeof(struct iovec64);
+ iov64 = kmalloc(len, GFP_KERNEL);
+ if (!iov64)
+ return -ENOMEM;
+ if (copy_from_user(iov64, uiov, len))
+ ret = -EFAULT;
+ else for (seg = 0; seg < nr_segs; seg++) {
+ kiov[seg].iov_base =
+ (void *)(unsigned long)iov64[seg].iov_base;
+ kiov[seg].iov_len = iov64[seg].iov_len;
+ }
+ kfree(iov64);
+ }
+out:
+ return ret;
+}
+
+#define iov(iocb) (const struct iovec *)(unsigned long)iocb->aio_buf
+#define nr_segs(iocb) (unsigned long)iocb->aio_nbytes
+
+static ssize_t io_readv_writev(int cmd, struct kiocb *req, struct iocb *iocb)
+{
+ size_t tot_len;
+ int seg;
+ struct file *file = req->ki_filp;
+ const struct iovec *uiov = iov(iocb);
+ unsigned long nr_segs = nr_segs(iocb);
+ ssize_t ret = 0;
+
+ if (nr_segs == 0)
+ goto out;
+
+ if ((nr_segs > UIO_MAXIOV) || ((ssize_t)nr_segs < 0)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ req->ki_iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (!req->ki_iov) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = -EFAULT;
+ if (can_fast_copy(cmd)) {
+ int len = nr_segs * sizeof(struct iovec);
+ if (copy_from_user(req->ki_iov, uiov, len)) {
+ goto out;
+ }
+ } else if (resize_user_vec(cmd, req, uiov, nr_segs))
+ goto out;
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 or if
+ * the total length does not fit a ssize_t.
+ *
+ * Be careful here because iov_len is a size_t not a ssize_t.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0 ; seg < nr_segs; seg++) {
+ ssize_t tmp = tot_len;
+ ssize_t len = (ssize_t)req->ki_iov[seg].iov_len;
+ void * base = req->ki_iov[seg].iov_base;
+
+ if (len < 0) /* size_t not fitting an ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the ssize_t */
+ goto out;
+
+ if (unlikely(!access_ok(is_readv(cmd) ?
+ VERIFY_WRITE : VERIFY_READ, base, len))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ if (tot_len == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ if (is_readv(cmd))
+ ret = file->f_op->aio_readv(req, req->ki_iov,
+ nr_segs(iocb), iocb->aio_offset);
+ else
+ ret = file->f_op->aio_writev(req, req->ki_iov,
+ nr_segs(iocb), iocb->aio_offset);
+out:
+ return ret;
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -1063,6 +1201,24 @@
ret = file->f_op->aio_write(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
+ case IOCB_CMD_PREADV32:
+ case IOCB_CMD_PREADV64:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_readv)
+ ret = io_readv_writev(iocb->aio_lio_opcode, req, iocb);
+ break;
+ case IOCB_CMD_PWRITEV32:
+ case IOCB_CMD_PWRITEV64:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_writev)
+ ret = io_readv_writev(iocb->aio_lio_opcode, req, iocb);
+ break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
if (file->f_op->aio_fsync)