2002-09-17 20:58:42

by Badari Pulavarty

Subject: [RFC] [PATCH] 2.5.35 patch for making DIO async

Hi Ben,

Here is a 2.5.35 patch to make DIO async. Basically, DIO no longer
waits for I/O completion; the waiting is done at the higher level
(the same way generic_file_read does it).

Here is what I did:

1) Passed the kiocb *iocb down to the DIO code.
2) Allocated the "dio" with kmalloc() instead of on the stack.
3) After submitting the bios, the DIO code returns -EIOCBQUEUED.
4) Removed dio_bio_end_io(), dio_await_one() and dio_await_completion().
5) Added dio_bio_end_aio(), which calls dio_bio_complete() for
each bio and, when dio->bio_count drops to 0, calls aio_complete().
6) Changed the raw_read/raw_write/... routines to pass a sync kiocb
and wait on it.

Any feedback on approach/patch is appreciated.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.35/drivers/char/raw.c linux-2.5.35.aio/drivers/char/raw.c
--- linux-2.5.35/drivers/char/raw.c Sun Sep 15 19:18:37 2002
+++ linux-2.5.35.aio/drivers/char/raw.c Tue Sep 17 09:00:39 2002
@@ -201,8 +201,9 @@
}

static ssize_t
-rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+rw_raw_aio_dev(int rw, struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
{
+ struct file *filp = iocb->ki_filp;
const int minor = minor(filp->f_dentry->d_inode->i_rdev);
struct block_device *bdev = raw_devices[minor].binding;
struct inode *inode = bdev->bd_inode;
@@ -222,7 +223,7 @@
count = inode->i_size - *offp;
nr_segs = iov_shorten((struct iovec *)iov, nr_segs, count);
}
- ret = generic_file_direct_IO(rw, inode, iov, *offp, nr_segs);
+ ret = generic_file_direct_IO(rw, iocb, inode, iov, *offp, nr_segs);

if (ret > 0)
*offp += ret;
@@ -231,6 +232,19 @@
}

static ssize_t
+rw_raw_dev(int rw, struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ ret = rw_raw_aio_dev(rw, &kiocb, iov, nr_segs, offp);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ return ret;
+}
+
+static ssize_t
raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
{
struct iovec local_iov = { .iov_base = buf, .iov_len = size};
@@ -246,6 +260,21 @@
return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
}

+static ssize_t
+raw_aio_read(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+ struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+ return rw_raw_aio_dev(READ, iocb, &local_iov, 1, offp);
+}
+
+static ssize_t
+raw_aio_write(struct kiocb *iocb, const char *buf, size_t size, loff_t *offp)
+{
+ struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+
+ return rw_raw_aio_dev(WRITE, iocb, &local_iov, 1, offp);
+}
static ssize_t
raw_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *offp)
{
@@ -260,7 +289,9 @@

static struct file_operations raw_fops = {
.read = raw_read,
+ .aio_read= raw_aio_read,
.write = raw_write,
+ .aio_write= raw_aio_write,
.open = raw_open,
.release= raw_release,
.ioctl = raw_ioctl,
diff -Naur -X dontdiff linux-2.5.35/fs/block_dev.c linux-2.5.35.aio/fs/block_dev.c
--- linux-2.5.35/fs/block_dev.c Sun Sep 15 19:19:09 2002
+++ linux-2.5.35.aio/fs/block_dev.c Tue Sep 17 09:00:26 2002
@@ -116,10 +116,10 @@
}

static int
-blkdev_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
- return generic_direct_IO(rw, inode, iov, offset,
+ return generic_direct_IO(rw, iocb, inode, iov, offset,
nr_segs, blkdev_get_blocks);
}

diff -Naur -X dontdiff linux-2.5.35/fs/direct-io.c linux-2.5.35.aio/fs/direct-io.c
--- linux-2.5.35/fs/direct-io.c Sun Sep 15 19:18:30 2002
+++ linux-2.5.35.aio/fs/direct-io.c Tue Sep 17 09:48:16 2002
@@ -20,6 +20,7 @@
#include <linux/err.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
+#include <linux/slab.h>
#include <asm/atomic.h>

/*
@@ -68,6 +69,8 @@
spinlock_t bio_list_lock; /* protects bio_list */
struct bio *bio_list; /* singly linked via bi_private */
struct task_struct *waiter; /* waiting task (NULL if none) */
+ struct kiocb *iocb; /* iocb ptr */
+ int results;
};

/*
@@ -145,25 +148,41 @@
}

/*
- * The BIO completion handler simply queues the BIO up for the process-context
- * handler.
- *
- * During I/O bi_private points at the dio. After I/O, bi_private is used to
- * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ * Process one completed BIO. No locks are held.
*/
-static void dio_bio_end_io(struct bio *bio)
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (dio->rw == READ)
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+ atomic_dec(&dio->bio_count);
+ bio_put(bio);
+ return uptodate ? 0 : -EIO;
+}
+
+static void dio_bio_end_aio(struct bio *bio)
+{
+ int ret;
struct dio *dio = bio->bi_private;
- unsigned long flags;
+ ret = dio_bio_complete(dio, bio);
+ if (ret)
+ dio->results = ret;

- spin_lock_irqsave(&dio->bio_list_lock, flags);
- bio->bi_private = dio->bio_list;
- dio->bio_list = bio;
- if (dio->waiter)
- wake_up_process(dio->waiter);
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ if (atomic_read(&dio->bio_count) == 0) {
+ aio_complete(dio->iocb, dio->results, 0);
+ kfree(dio);
+ }
}

+
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
sector_t first_sector, int nr_vecs)
@@ -180,7 +199,7 @@
bio->bi_size = 0;
bio->bi_sector = first_sector;
bio->bi_io_vec[0].bv_page = NULL;
- bio->bi_end_io = dio_bio_end_io;
+ bio->bi_end_io = dio_bio_end_aio;

dio->bio = bio;
dio->bvec = NULL; /* debug */
@@ -212,75 +231,6 @@
}

/*
- * Wait for the next BIO to complete. Remove it and return it.
- */
-static struct bio *dio_await_one(struct dio *dio)
-{
- unsigned long flags;
- struct bio *bio;
-
- spin_lock_irqsave(&dio->bio_list_lock, flags);
- while (dio->bio_list == NULL) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (dio->bio_list == NULL) {
- dio->waiter = current;
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
- blk_run_queues();
- schedule();
- spin_lock_irqsave(&dio->bio_list_lock, flags);
- dio->waiter = NULL;
- }
- set_current_state(TASK_RUNNING);
- }
- bio = dio->bio_list;
- dio->bio_list = bio->bi_private;
- spin_unlock_irqrestore(&dio->bio_list_lock, flags);
- return bio;
-}
-
-/*
- * Process one completed BIO. No locks are held.
- */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec;
- int page_no;
-
- for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
- struct page *page = bvec[page_no].bv_page;
-
- if (dio->rw == READ)
- set_page_dirty(page);
- page_cache_release(page);
- }
- atomic_dec(&dio->bio_count);
- bio_put(bio);
- return uptodate ? 0 : -EIO;
-}
-
-/*
- * Wait on and process all in-flight BIOs.
- */
-static int dio_await_completion(struct dio *dio)
-{
- int ret = 0;
-
- if (dio->bio)
- dio_bio_submit(dio);
-
- while (atomic_read(&dio->bio_count)) {
- struct bio *bio = dio_await_one(dio);
- int ret2;
-
- ret2 = dio_bio_complete(dio, bio);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
-/*
* A really large O_DIRECT read or write can generate a lot of BIOs. So
* to keep the memory consumption sane we periodically reap any completed BIOs
* during the BIO generation phase.
@@ -528,77 +478,83 @@
}

int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t get_blocks)
{
const unsigned blkbits = inode->i_blkbits;
unsigned long user_addr;
- int seg, ret2, ret = 0;
- struct dio dio;
- size_t bytes, tot_bytes = 0;
-
- dio.bio = NULL;
- dio.bvec = NULL;
- dio.inode = inode;
- dio.rw = rw;
- dio.blkbits = blkbits;
- dio.block_in_file = offset >> blkbits;
- dio.blocks_available = 0;
-
- dio.boundary = 0;
- dio.reap_counter = 0;
- dio.get_blocks = get_blocks;
- dio.last_block_in_bio = -1;
- dio.next_block_in_bio = -1;
+ int seg, ret = 0;
+ struct dio *dio;
+ size_t bytes;
+
+ dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+
+ dio->bio = NULL;
+ dio->bvec = NULL;
+ dio->inode = inode;
+ dio->iocb = iocb;
+ dio->rw = rw;
+ dio->blkbits = blkbits;
+ dio->block_in_file = offset >> blkbits;
+ dio->blocks_available = 0;

- dio.page_errors = 0;
+ dio->results = 0;
+ dio->boundary = 0;
+ dio->reap_counter = 0;
+ dio->get_blocks = get_blocks;
+ dio->last_block_in_bio = -1;
+ dio->next_block_in_bio = -1;
+
+ dio->page_errors = 0;

/* BIO completion state */
- atomic_set(&dio.bio_count, 0);
- spin_lock_init(&dio.bio_list_lock);
- dio.bio_list = NULL;
- dio.waiter = NULL;
+ atomic_set(&dio->bio_count, 0);
+ spin_lock_init(&dio->bio_list_lock);
+ dio->bio_list = NULL;
+ dio->waiter = NULL;

for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
bytes = iov[seg].iov_len;

/* Index into the first page of the first block */
- dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
- dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+ dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+ dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
/* Page fetching state */
- dio.head = 0;
- dio.tail = 0;
- dio.curr_page = 0;
+ dio->head = 0;
+ dio->tail = 0;
+ dio->curr_page = 0;

- dio.total_pages = 0;
+ dio->total_pages = 0;
if (user_addr & (PAGE_SIZE-1)) {
- dio.total_pages++;
+ dio->total_pages++;
bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
}
- dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- dio.curr_user_address = user_addr;
+ dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio->curr_user_address = user_addr;

- ret = do_direct_IO(&dio);
+ ret = do_direct_IO(dio);

if (ret) {
- dio_cleanup(&dio);
+ dio_cleanup(dio);
break;
}

- tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
- dio.block_in_file) << blkbits);
+ dio->results += iov[seg].iov_len - ((dio->final_block_in_request -
+ dio->block_in_file) << blkbits);

} /* end iovec loop */

- ret2 = dio_await_completion(&dio);
- if (ret == 0)
- ret = ret2;
+ if (dio->bio)
+ dio_bio_submit(dio);
+
if (ret == 0)
- ret = dio.page_errors;
+ ret = dio->page_errors;
if (ret == 0)
- ret = tot_bytes;
-
+ ret = -EIOCBQUEUED;
return ret;
}

@@ -606,8 +562,9 @@
* This is a library function for use by filesystem drivers.
*/
int
-generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
+generic_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t get_blocks)
{
int seg;
size_t size;
@@ -636,19 +593,21 @@
goto out;
}

- retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
+ retval = direct_io_worker(rw, iocb, inode, iov, offset,
+ nr_segs, get_blocks);
out:
return retval;
}

ssize_t
-generic_file_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+generic_file_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct address_space *mapping = inode->i_mapping;
ssize_t retval;

- retval = mapping->a_ops->direct_IO(rw, inode, iov, offset, nr_segs);
+ retval = mapping->a_ops->direct_IO(rw, iocb, inode, iov,
+ offset, nr_segs);
if (inode->i_mapping->nrpages)
invalidate_inode_pages2(inode->i_mapping);
return retval;
diff -Naur -X dontdiff linux-2.5.35/fs/ext2/inode.c linux-2.5.35.aio/fs/ext2/inode.c
--- linux-2.5.35/fs/ext2/inode.c Sun Sep 15 19:18:19 2002
+++ linux-2.5.35.aio/fs/ext2/inode.c Tue Sep 17 09:00:26 2002
@@ -619,10 +619,10 @@
}

static int
-ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ext2_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov,loff_t offset, unsigned long nr_segs)
{
- return generic_direct_IO(rw, inode, iov,
+ return generic_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext2_get_blocks);
}

diff -Naur -X dontdiff linux-2.5.35/fs/ext3/inode.c linux-2.5.35.aio/fs/ext3/inode.c
--- linux-2.5.35/fs/ext3/inode.c Sun Sep 15 19:18:41 2002
+++ linux-2.5.35.aio/fs/ext3/inode.c Tue Sep 17 09:00:26 2002
@@ -1399,7 +1399,7 @@
* If the O_DIRECT write is intantiating holes inside i_size and the machine
* crashes then stale disk data _may_ be exposed inside the file.
*/
-static int ext3_direct_IO(int rw, struct inode *inode,
+static int ext3_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -1430,7 +1430,7 @@
}
}

- ret = generic_direct_IO(rw, inode, iov, offset,
+ ret = generic_direct_IO(rw, iocb, inode, iov, offset,
nr_segs, ext3_direct_io_get_blocks);

out_stop:
diff -Naur -X dontdiff linux-2.5.35/fs/jfs/inode.c linux-2.5.35.aio/fs/jfs/inode.c
--- linux-2.5.35/fs/jfs/inode.c Sun Sep 15 19:18:15 2002
+++ linux-2.5.35.aio/fs/jfs/inode.c Tue Sep 17 09:00:26 2002
@@ -309,10 +309,11 @@
return generic_block_bmap(mapping, block, jfs_get_block);
}

-static int jfs_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+static int jfs_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
{
- return generic_direct_IO(rw, inode, iov,
+ return generic_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, jfs_get_blocks);
}

diff -Naur -X dontdiff linux-2.5.35/include/linux/fs.h linux-2.5.35.aio/include/linux/fs.h
--- linux-2.5.35/include/linux/fs.h Sun Sep 15 19:18:24 2002
+++ linux-2.5.35.aio/include/linux/fs.h Tue Sep 17 09:04:21 2002
@@ -23,6 +23,7 @@
#include <linux/string.h>
#include <linux/radix-tree.h>
#include <linux/bitops.h>
+#include <linux/fs.h>

#include <asm/atomic.h>

@@ -279,6 +280,7 @@
*/
struct page;
struct address_space;
+struct kiocb;

struct address_space_operations {
int (*writepage)(struct page *);
@@ -307,7 +309,7 @@
int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
- int (*direct_IO)(int, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+ int (*direct_IO)(int, struct kiocb *, struct inode *, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
};

struct backing_dev_info;
@@ -743,7 +745,6 @@
* read, write, poll, fsync, readv, writev can be called
* without the big kernel lock held in all filesystems.
*/
-struct kiocb;
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
@@ -1248,10 +1249,10 @@
unsigned long nr_segs, loff_t *ppos);
extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
-extern ssize_t generic_file_direct_IO(int rw, struct inode *inode,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec
- *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
+extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, struct inode * inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs);
+extern int generic_direct_IO(int rw, struct kiocb *iocb, struct inode * inode,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs,
+ get_blocks_t *get_blocks);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
diff -Naur -X dontdiff linux-2.5.35/mm/filemap.c linux-2.5.35.aio/mm/filemap.c
--- linux-2.5.35/mm/filemap.c Sun Sep 15 19:18:26 2002
+++ linux-2.5.35.aio/mm/filemap.c Tue Sep 17 09:00:26 2002
@@ -1153,7 +1153,7 @@
nr_segs = iov_shorten((struct iovec *)iov,
nr_segs, count);
}
- retval = generic_file_direct_IO(READ, inode,
+ retval = generic_file_direct_IO(READ, iocb, inode,
iov, pos, nr_segs);
if (retval > 0)
*ppos = pos + retval;
@@ -1989,7 +1989,9 @@
current iovec */
unsigned long seg;
char *buf;
+ struct kiocb kiocb;

+ init_sync_kiocb(&kiocb, file);
if (unlikely((ssize_t)count < 0))
return -EINVAL;

@@ -2099,8 +2101,11 @@
if (count != ocount)
nr_segs = iov_shorten((struct iovec *)iov,
nr_segs, count);
- written = generic_file_direct_IO(WRITE, inode,
+ written = generic_file_direct_IO(WRITE, &kiocb, inode,
iov, pos, nr_segs);
+ if (written == -EIOCBQUEUED)
+ written = wait_on_sync_kiocb(&kiocb);
+
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {


2002-09-18 11:46:03

by Benjamin LaHaise

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

On Tue, Sep 17, 2002 at 02:03:02PM -0700, Badari Pulavarty wrote:
> Hi Ben,
>
> Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> wait for io completion. Waiting will be done at the higher level
> (same way generic_file_read does).

This looks pretty good. Andrew Morton has had a look over it too and
agrees that it should go in after a bit of testing. Does someone want
to try comparing the throughput of dio-based aio vs. normal read/write? It
would be interesting to see what kind of effect pipelining aios makes
and whether there are any performance niggles we need to squash. Cheers,

-ben

2002-09-18 15:59:46

by Benjamin LaHaise

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

On Wed, Sep 18, 2002 at 08:58:22AM -0700, Badari Pulavarty wrote:
> Thanks for looking at the patch. Patch needed few cleanups to get it
> working. Here is the status so far..
>
> 1) I tested synchronous raw read/write. They perform almost same as
> without this patch. I am looking at why I can't match the performance.
> (I get 380MB/sec on 40 dd's on 40 disks without this patch.
> I get 350MB/sec with this patch).

Hmm, we should try to improve that. Have you tried profiling a long run?

> 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
> write perform well.

That should be somewhat conditional, but for now it sounds like the right
thing to do.

> 3) I am testing aio read/writes. I am using libaio.0.3.92.
> When I try aio_read/aio_write on raw device, I am get OOPS.
> Can I use libaio.0.3.92 on 2.5 ? Are there any interface
> changes I need to worry between 2.4 and 2.5 ?

libaio 0.3.92 should work on 2.5. Could you send me a copy of the
decoded Oops?

-ben
--
GMS rules.

2002-09-18 15:54:07

by Badari Pulavarty

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

>
> On Tue, Sep 17, 2002 at 02:03:02PM -0700, Badari Pulavarty wrote:
> > Hi Ben,
> >
> > Here is a 2.5.35 patch to make DIO async. Basically, DIO does not
> > wait for io completion. Waiting will be done at the higher level
> > (same way generic_file_read does).
>
> This looks pretty good. Andrew Morton has had a look over it too and
> agrees that it should go in after a bit of testing. Does someone want
> to try comparing throughput of dio based aio vs normal read/write? It
> would be interesting to see what kind of an effect pipelining aios makes
> and if there are any performance niggles we need to squash. Cheer,
>
> -ben

Thanks for looking at the patch. The patch needed a few cleanups to get it
working. Here is the status so far:

1) I tested synchronous raw read/write. They perform almost the same as
without this patch. I am looking at why I can't match the performance.
(I get 380 MB/sec on 40 dd's on 40 disks without this patch;
I get 350 MB/sec with this patch.)

2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/write
perform well (see the sketch after this list).

3) I am testing aio reads/writes using libaio 0.3.92.
When I try aio_read/aio_write on a raw device, I get an OOPS.
Can I use libaio 0.3.92 on 2.5? Are there any interface
changes I need to worry about between 2.4 and 2.5?
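
For (2), here is a rough sketch of what I mean -- this assumes
wait_on_sync_kiocb() in fs/aio.c loops on iocb->ki_users the way the
2.5 aio code does, so treat the exact field names as an assumption;
the point is only where blk_run_queues() goes:

ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
	while (iocb->ki_users) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!iocb->ki_users)
			break;
		blk_run_queues();	/* kick the request queues before sleeping */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return iocb->ki_user_data;
}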

Once I get aio read/writes working, I will provide you the performance
numbers.

Thanks,
Badari

2002-09-18 16:25:33

by Badari Pulavarty

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

>
> On Wed, Sep 18, 2002 at 08:58:22AM -0700, Badari Pulavarty wrote:
> > Thanks for looking at the patch. Patch needed few cleanups to get it
> > working. Here is the status so far..
> >
> > 1) I tested synchronous raw read/write. They perform almost same as
> > without this patch. I am looking at why I can't match the performance.
> > (I get 380MB/sec on 40 dd's on 40 disks without this patch.
> > I get 350MB/sec with this patch).
>
> Hmm, we should try to improve that. Have you tried profiling a long run?


I took a few shortcuts in the patch compared to the original DIO code;
e.g., I always create a new bio, whereas the original code reuses them.
But I don't think that is causing the performance hit. Anyway,
here is the vmstat and profile=2 output.

vmstat:

0 40 2 0 3804628 0 27396 0 0 344909 192 3786 3536 0 5 95
0 40 1 0 3803268 0 28628 0 0 345686 208 3793 3578 0 5 95
0 40 1 0 3802452 0 29404 0 0 346983 210 3806 3619 0 5 95
0 40 1 0 3801888 0 29932 0 0 345178 203 3786 3500 0 5 95
0 40 1 0 3801804 0 29996 0 0 345792 196 3791 3510 0 5 95
0 40 1 0 3801652 0 30100 0 0 346048 200 3800 3556 0 5 95

profile:

146796 default_idle 2293.6875
1834 __scsi_end_request 10.4205
1201 do_softirq 6.2552
891 scsi_dispatch_cmd 2.3203
50 __wake_up 1.0417
599 get_user_pages 0.7341
362 blk_rq_map_sg 0.6856
420 do_direct_IO 0.5707
25 dio_prep_bio 0.5208
562 __make_request 0.4878
55 __scsi_release_command 0.4167
6 cap_file_permission 0.3750
63 release_console_sem 0.3580
55 do_aic7xxx_isr 0.3438
141 bio_alloc 0.3389
45 scsi_finish_command 0.2557
8 scsi_sense_valid 0.2500
16 dio_new_bio 0.2500
206 schedule 0.2341

>
> > 2) wait_on_sync_kiocb() needed blk_run_queues() to make regular read/
> > write perform well.
>
> That should be somewhat conditional, but for now it sounds like the right
> thing to do.
>
> > 3) I am testing aio read/writes. I am using libaio.0.3.92.
> > When I try aio_read/aio_write on raw device, I am get OOPS.
> > Can I use libaio.0.3.92 on 2.5 ? Are there any interface
> > changes I need to worry between 2.4 and 2.5 ?
>
> libaio 0.3.92 should work on 2.5. Could you send me a copy of the
> decoded Oops?
>

My bad :)

The aio_read/aio_write() routines take loff_t, not loff_t * (which is
what the regular read/write routines take):

ssize_t (*read) (struct file *, char *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, char *, size_t, loff_t);

I had coded for loff_t *. Fixed that; I am testing it now.
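
For reference, the corrected wrapper ends up looking roughly like this
(a sketch based on the raw driver wrappers in the patch above, with
rw_raw_aio_dev() unchanged; not the exact code I am testing):

static ssize_t
raw_aio_read(struct kiocb *iocb, char *buf, size_t size, loff_t pos)
{
	/* aio entry points get loff_t by value, so pass its address down */
	struct iovec local_iov = { .iov_base = buf, .iov_len = size };

	return rw_raw_aio_dev(READ, iocb, &local_iov, 1, &pos);
}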

Thanks,
Badari


2002-09-18 20:42:33

by Badari Pulavarty

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

Ben,

aio_read/aio_write() are now working with a minor fix to fs/aio.c

io_submit_one():

if (likely(EIOCBQUEUED == ret))

needs to be changed to

if (likely(-EIOCBQUEUED == ret))
^^^


I was wondering what happens in the following case (I think this
happened in my test program).

Let's say I did a sys_io_submit() and my test program then did exit().
When the I/O completion happened, it tried to do the following and got
an OOPS in aio_complete():

if (ctx == &ctx->mm->default_kioctx) {

I think "mm" is freed up, when process exited. Do you think this is
possible ? How do we handle this ?
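
In case it helps, the test boils down to something like this (a rough
reproducer sketch using the libaio 0.3.92 helpers; the raw device path,
buffer size and missing error handling are just placeholders):

/* Submit one read and exit right away, so the completion path runs
 * after the process (and its mm) is gone. */
#include <libaio.h>
#include <fcntl.h>
#include <malloc.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	void *buf = memalign(4096, 4096);	/* raw I/O wants aligned buffers */
	int fd = open("/dev/raw/raw1", O_RDONLY);

	io_queue_init(1, &ctx);
	io_prep_pread(&cb, fd, buf, 4096, 0);
	io_submit(ctx, 1, cbs);

	return 0;	/* exit without calling io_getevents() */
}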

- Badari

2002-09-19 10:47:43

by Suparna Bhattacharya

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

On Wed, Sep 18, 2002 at 01:47:06PM -0700, Badari Pulavarty wrote:
> Ben,
>
> aio_read/aio_write() are now working with a minor fix to fs/aio.c
>
> io_submit_one():
>
> if (likely(EIOCBQUEUED == ret))
>
> needs to be changed to
>
> if (likely(-EIOCBQUEUED == ret))
> ^^^
>
>
> I was wondering what happens to following case (I think this
> happend in my test program).
>
> Lets say, I did an sys_io_submit() and my test program did exit().
> When the IO complete happend, it tried to do following and got
> an OOPS in aio_complete().
>
> if (ctx == &ctx->mm->default_kioctx) {
>
> I think "mm" is freed up, when process exited. Do you think this is
> possible ? How do we handle this ?

Do you see this only in the sync case?
init_sync_kiocb() ought to increment ctx->reqs_active, so that
exit_aio() waits for the iocbs to complete.

Regards
Suparna

2002-09-19 11:31:45

by Suparna Bhattacharya

Subject: Re: [RFC] [PATCH] 2.5.35 patch for making DIO async

On Thu, Sep 19, 2002 at 04:22:14PM +0530, Suparna Bhattacharya wrote:
> On Wed, Sep 18, 2002 at 01:47:06PM -0700, Badari Pulavarty wrote:
> > Ben,
> >
> > aio_read/aio_write() are now working with a minor fix to fs/aio.c
> >
> > io_submit_one():
> >
> > if (likely(EIOCBQUEUED == ret))
> >
> > needs to be changed to
> >
> > if (likely(-EIOCBQUEUED == ret))
> > ^^^
> >
> >
> > I was wondering what happens to following case (I think this
> > happend in my test program).
> >
> > Lets say, I did an sys_io_submit() and my test program did exit().
> > When the IO complete happend, it tried to do following and got
> > an OOPS in aio_complete().
> >
> > if (ctx == &ctx->mm->default_kioctx) {
> >
> > I think "mm" is freed up, when process exited. Do you think this is
> > possible ? How do we handle this ?
>
> Do you see this only in the sync case ?
> init_sync_iocb ought to increment ctx->reqs_active, so that
> exit_aio waits for the iocb's to complete.

Sorry, I guess in the sync case exit_aio() shouldn't happen, since
the current thread still has a reference to the mm.

And your problem is with the io_submit case ... I'll have to look more
closely to find out why.

>
> Regards
> Suparna