2002-07-08 03:10:46

by Andrew Morton

[permalink] [raw]
Subject: direct-to-BIO for O_DIRECT


Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
the kiovec layer. It's followed by a patch which converts the raw
driver to use the O_DIRECT engine.

CPU utilisation is about the same as the kiovec-based implementation.
Read and write bandwidth are the same too, for 128k chunks. But with
one megabyte chunks, this implementation is 20% faster at writing.

I assume this is because the kiobuf-based implementation has to stop
and wait for each 128k chunk, whereas this code streams the entire
request, regardless of its size.

This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
margin to widen on higher-end hardware which likes to have more
requests in flight.

Question is: what do we want to do with this sucker? These are the
remaining users of kiovecs:

drivers/md/lvm-snap.c
drivers/media/video/video-buf.c
drivers/mtd/devices/blkmtd.c
drivers/scsi/sg.c

the video and mtd drivers seems to be fairly easy to de-kiobufize.
I'm aware of one proprietary driver which uses kiobufs. XFS uses
kiobufs a little bit - just to map the pages.

So with a bit of effort and maintainer-irritation, we can extract
the kiobuf layer from the kernel.

Do we want to do that?



fs/Makefile | 2
fs/block_dev.c | 7
fs/buffer.c | 2
fs/direct-io.c | 491 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext2/inode.c | 7
include/linux/buffer_head.h | 2
include/linux/fs.h | 11
mm/filemap.c | 64 ++---
8 files changed, 543 insertions(+), 43 deletions(-)

--- /dev/null Thu Aug 30 13:30:55 2001
+++ 2.5.25-akpm/fs/direct-io.c Sun Jul 7 19:40:20 2002
@@ -0,0 +1,491 @@
+/*
+ * mm/direct-io.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ *
+ * O_DIRECT
+ *
+ * 04Jul2002 [email protected]
+ * Initial version
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/buffer_head.h>
+#include <linux/rwsem.h>
+#include <asm/atomic.h>
+
+/*
+ * The largest-sized BIO which this code will assemble, in bytes. Set this
+ * to PAGE_SIZE if your drivers are broken.
+ */
+#define DIO_BIO_MAX_SIZE BIO_MAX_SIZE
+
+/*
+ * How many user pages to map in one call to get_user_pages(). This determines
+ * the size of a structure on the stack.
+ */
+#define DIO_PAGES 64
+
+struct dio {
+ /* BIO submission state */
+ struct bio *bio; /* bio under assembly */
+ struct bio_vec *bvec; /* current bvec in that bio */
+ struct inode *inode;
+ int rw;
+ sector_t block_in_file; /* changes */
+ sector_t final_block_in_request;/* doesn't change */
+ unsigned first_block_in_page; /* doesn't change */
+ int boundary; /* prev block is at a boundary */
+ int reap_counter; /* rate limit reaping */
+ get_block_t *get_block;
+ sector_t last_block_in_bio;
+
+ /* Page fetching state */
+ int curr_page; /* changes */
+ int total_pages; /* doesn't change */
+ unsigned long curr_user_address;/* changes */
+
+ /* Page queue */
+ struct page *pages[DIO_PAGES];
+ unsigned head;
+ unsigned tail;
+
+ /* BIO completion state */
+ atomic_t bio_count;
+ spinlock_t bio_list_lock;
+ struct bio *bio_list; /* singly linked via bi_private */
+ wait_queue_head_t wait_q;
+};
+
+/*
+ * How many pages are in the queue?
+ */
+static inline unsigned dio_pages_present(struct dio *dio)
+{
+ return dio->head - dio->tail;
+}
+
+/*
+ * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
+ */
+static int dio_refill_pages(struct dio *dio)
+{
+ int ret;
+ int nr_pages;
+
+ nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+ ret = get_user_pages(
+ current, /* Task for fault acounting */
+ current->mm, /* whose pages? */
+ dio->curr_user_address, /* Where from? */
+ nr_pages, /* How many pages? */
+ dio->rw == READ, /* Write to memory? */
+ 0, /* force (?) */
+ &dio->pages[0],
+ NULL); /* vmas */
+
+ if (ret >= 0) {
+ dio->curr_user_address += ret * PAGE_SIZE;
+ dio->curr_page += ret;
+ dio->head = 0;
+ dio->tail = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * Get another userspace page. Returns an ERR_PTR on error. Pages are
+ * buffered inside the dio so that we can call get_user_pages() against a
+ * decent number of pages, less frequently. To provide nicer use of the
+ * L1 cache.
+ */
+static struct page *dio_get_page(struct dio *dio)
+{
+ if (dio_pages_present(dio) == 0) {
+ int ret;
+
+ ret = dio_refill_pages(dio);
+ if (ret) {
+ printk("%s: dio_refill_pages returns %d\n",
+ __FUNCTION__, ret);
+ return ERR_PTR(ret);
+ }
+ BUG_ON(dio_pages_present(dio) == 0);
+ }
+ return dio->pages[dio->head++];
+}
+
+/*
+ * The BIO completion handler simply queues the BIO up for the process-context
+ * handler.
+ *
+ * During I/O bi_private points at the dio. After I/O, bi_private is used to
+ * implement a singly-linked list of completed BIOs, at dio->bio_list.
+ */
+static void dio_bio_end_io(struct bio *bio)
+{
+ struct dio *dio = bio->bi_private;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio->bi_private = dio->bio_list;
+ dio->bio_list = bio;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ wake_up(&dio->wait_q);
+}
+
+static int
+dio_bio_alloc(struct dio *dio, struct block_device *bdev,
+ sector_t first_sector, int nr_vecs)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, nr_vecs);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ bio->bi_bdev = bdev;
+ bio->bi_vcnt = nr_vecs;
+ bio->bi_idx = 0;
+ bio->bi_size = 0;
+ bio->bi_sector = first_sector;
+ bio->bi_io_vec[0].bv_page = NULL;
+ bio->bi_end_io = dio_bio_end_io;
+
+ dio->bio = bio;
+ dio->bvec = NULL; /* debug */
+ return 0;
+}
+
+static void dio_bio_submit(struct dio *dio)
+{
+ struct bio *bio = dio->bio;
+
+ bio->bi_vcnt = bio->bi_idx;
+ bio->bi_idx = 0;
+ bio->bi_private = dio;
+ atomic_inc(&dio->bio_count);
+ submit_bio(dio->rw, bio);
+
+ dio->bio = NULL;
+ dio->bvec = NULL;
+}
+
+/*
+ * Release any resources in case of a failure
+ */
+static void dio_cleanup(struct dio *dio)
+{
+ while (dio_pages_present(dio))
+ page_cache_release(dio_get_page(dio));
+}
+
+/*
+ * Wait for the next BIO to complete. Remove it and return it.
+ */
+static struct bio *dio_await_one(struct dio *dio)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long flags;
+ struct bio *bio;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ while (dio->bio_list == NULL) {
+ add_wait_queue(&dio->wait_q, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (dio->bio_list == NULL) {
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ blk_run_queues();
+ schedule();
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&dio->wait_q, &wait);
+ }
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ return bio;
+}
+
+/*
+ * Process one completed BIO. No locks are held.
+ */
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec;
+ int page_no;
+ int ret = 0;
+
+ for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+ struct page *page = bvec[page_no].bv_page;
+
+ if (!uptodate) {
+ if (ret == 0)
+ ret = -EIO;
+ }
+
+ if (dio->rw == READ)
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+ atomic_dec(&dio->bio_count);
+ bio_put(bio);
+ return ret;
+}
+
+/*
+ * Wait on and process all in-flight BIOs.
+ */
+static int dio_await_completion(struct dio *dio)
+{
+ int ret = 0;
+ while (atomic_read(&dio->bio_count)) {
+ struct bio *bio = dio_await_one(dio);
+ int ret2;
+
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ return ret;
+}
+
+/*
+ * A really large O_DIRECT read or write can generate a lot of BIOs. So
+ * to keep the memory consumption sane we periodically reap any completed BIOs
+ * during the BIO generation phase.
+ *
+ * This also helps to limis the peak amount of pinned userspace memory.
+ */
+static int dio_bio_reap(struct dio *dio)
+{
+ int ret = 0;
+
+ if (dio->reap_counter++ >= 64) {
+ while (dio->bio_list) {
+ unsigned long flags;
+ struct bio *bio;
+ int ret2;
+
+ spin_lock_irqsave(&dio->bio_list_lock, flags);
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
+ spin_unlock_irqrestore(&dio->bio_list_lock, flags);
+ ret2 = dio_bio_complete(dio, bio);
+ if (ret == 0)
+ ret = ret2;
+ }
+ dio->reap_counter = 0;
+ }
+ return ret;
+}
+
+/*
+ * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
+ */
+int do_direct_IO(struct dio *dio)
+{
+ struct inode * const inode = dio->inode;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ struct page *page;
+ unsigned block_in_page;
+ int ret;
+
+ /* The I/O can start at any block offset within the first page */
+ block_in_page = dio->first_block_in_page;
+
+ while (dio->block_in_file < dio->final_block_in_request) {
+ int new_page; /* Need to insert this page into the BIO? */
+
+ page = dio_get_page(dio);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
+ new_page = 1;
+ for ( ; block_in_page < blocks_per_page; block_in_page++) {
+ struct buffer_head map_bh;
+ struct bio *bio;
+
+ map_bh.b_state = 0;
+ ret = (*dio->get_block)(inode, dio->block_in_file,
+ &map_bh, dio->rw == WRITE);
+ if (ret) {
+ printk("%s: get_block returns %d\n",
+ __FUNCTION__, ret);
+ goto fail_release;
+ }
+ /* blockdevs do not set buffer_new */
+ if (buffer_new(&map_bh))
+ unmap_underlying_metadata(map_bh.b_bdev,
+ map_bh.b_blocknr);
+ if (!buffer_mapped(&map_bh)) {
+ ret = -EINVAL; /* A hole */
+ goto fail_release;
+ }
+ if (dio->bio) {
+ if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
+ dio->boundary ||
+ dio->last_block_in_bio !=
+ map_bh.b_blocknr - 1) {
+ dio_bio_submit(dio);
+ dio->boundary = 0;
+ }
+ }
+ if (dio->bio == NULL) {
+ ret = dio_bio_reap(dio);
+ if (ret)
+ goto fail_release;
+ ret = dio_bio_alloc(dio, map_bh.b_bdev,
+ map_bh.b_blocknr << (blkbits - 9),
+ DIO_BIO_MAX_SIZE / PAGE_SIZE);
+ if (ret)
+ goto fail_release;
+ new_page = 1;
+ dio->boundary = 0;
+ }
+
+ bio = dio->bio;
+ if (new_page) {
+ dio->bvec = &bio->bi_io_vec[bio->bi_idx];
+ page_cache_get(page);
+ dio->bvec->bv_page = page;
+ dio->bvec->bv_len = 0;
+ dio->bvec->bv_offset = block_in_page*blocksize;
+ bio->bi_idx++;
+ }
+ new_page = 0;
+ dio->bvec->bv_len += blocksize;
+ bio->bi_size += blocksize;
+ dio->last_block_in_bio = map_bh.b_blocknr;
+ dio->boundary = buffer_boundary(&map_bh);
+
+ dio->block_in_file++;
+ if (dio->block_in_file >= dio->final_block_in_request)
+ break;
+ }
+ block_in_page = 0;
+ page_cache_release(page);
+ }
+ ret = 0;
+ goto out;
+fail_release:
+ page_cache_release(page);
+out:
+ return ret;
+}
+
+struct dio *g_dio;
+
+int
+generic_direct_IO(int rw, struct inode *inode, char *buf, loff_t offset,
+ size_t count, get_block_t get_block)
+{
+ const unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
+ const unsigned long user_addr = (unsigned long)buf;
+ int ret = 0;
+ int ret2;
+ struct dio dio;
+ size_t bytes;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ if ((user_addr & blocksize_mask) || (count & blocksize_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ g_dio = &dio;
+
+ /* BIO submission state */
+ dio.bio = NULL;
+ dio.bvec = NULL;
+ dio.inode = inode;
+ dio.rw = rw;
+ dio.block_in_file = offset >> inode->i_blkbits;
+ dio.final_block_in_request = (offset + count) >> inode->i_blkbits;
+
+ /* Index into the first page of the first block */
+ dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1))
+ >> inode->i_blkbits;
+ dio.boundary = 0;
+ dio.reap_counter = 0;
+ dio.get_block = get_block;
+ dio.last_block_in_bio = -1;
+
+ /* Page fetching state */
+ dio.curr_page = 0;
+ bytes = count;
+ dio.total_pages = 0;
+ if (offset & PAGE_SIZE) {
+ dio.total_pages++;
+ bytes -= PAGE_SIZE - (offset & ~(PAGE_SIZE - 1));
+ }
+ dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+ dio.curr_user_address = user_addr;
+
+ /* Page queue */
+ dio.head = 0;
+ dio.tail = 0;
+
+ /* BIO completion state */
+ atomic_set(&dio.bio_count, 0);
+ spin_lock_init(&dio.bio_list_lock);
+ dio.bio_list = NULL;
+ init_waitqueue_head(&dio.wait_q);
+
+ down_read(&current->mm->mmap_sem);
+ ret = do_direct_IO(&dio);
+ up_read(&current->mm->mmap_sem);
+
+ if (dio.bio)
+ dio_bio_submit(&dio);
+ if (ret)
+ dio_cleanup(&dio);
+ ret2 = dio_await_completion(&dio);
+ if (ret == 0)
+ ret = ret2;
+ if (ret == 0)
+ ret = count - ((dio.final_block_in_request -
+ dio.block_in_file) << inode->i_blkbits);
+out:
+ return ret;
+}
+
+ssize_t
+generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
+{
+ struct address_space *mapping = inode->i_mapping;
+ unsigned blocksize_mask;
+ ssize_t retval;
+
+ blocksize_mask = (1 << inode->i_blkbits) - 1;
+ if ((offset & blocksize_mask) || (count & blocksize_mask)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (mapping->nrpages) {
+ retval = filemap_fdatawrite(mapping);
+ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
+ if (retval)
+ goto out;
+ }
+ retval = mapping->a_ops->direct_IO(rw, inode, buf, offset, count);
+out:
+ return retval;
+}
--- 2.5.25/include/linux/fs.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/fs.h Sun Jul 7 19:35:39 2002
@@ -303,8 +303,8 @@ struct address_space_operations {
int (*bmap)(struct address_space *, long);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
-#define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
- int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ int (*direct_IO)(int, struct inode *, char *buf,
+ loff_t offset, size_t count);
};

struct backing_dev_info;
@@ -1128,7 +1128,7 @@ extern int check_disk_change(kdev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_inode_pages2(struct address_space *);
+extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
@@ -1233,6 +1233,11 @@ extern int file_read_actor(read_descript
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ssize_t generic_file_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count);
+int generic_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count, get_block_t *get_block);
+
extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
--- 2.5.25/include/linux/buffer_head.h~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/include/linux/buffer_head.h Sun Jul 7 19:35:39 2002
@@ -182,8 +182,6 @@ int block_sync_page(struct page *);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);
-int generic_direct_IO(int, struct inode *, struct kiobuf *,
- unsigned long, int, get_block_t *);
int file_fsync(struct file *, struct dentry *, int);

#define OSYNC_METADATA (1<<0)
--- 2.5.25/fs/buffer.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/buffer.c Sun Jul 7 19:35:39 2002
@@ -2298,6 +2298,7 @@ sector_t generic_block_bmap(struct addre
return tmp.b_blocknr;
}

+#if 0
int generic_direct_IO(int rw, struct inode *inode,
struct kiobuf *iobuf, unsigned long blocknr,
int blocksize, get_block_t *get_block)
@@ -2344,6 +2345,7 @@ int generic_direct_IO(int rw, struct ino
out:
return retval;
}
+#endif

/*
* Start I/O on a physical range of kernel memory, defined by a vector
--- 2.5.25/mm/filemap.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/mm/filemap.c Sun Jul 7 19:35:39 2002
@@ -413,7 +413,7 @@ static int invalidate_list_pages2(struct
* free the pages because they're mapped.
* @mapping: the address_space which pages we want to invalidate
*/
-void invalidate_inode_pages2(struct address_space * mapping)
+void invalidate_inode_pages2(struct address_space *mapping)
{
int unlocked;

@@ -1101,6 +1101,7 @@ no_cached_page:
UPDATE_ATIME(inode);
}

+#if 0
static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
ssize_t retval;
@@ -1181,6 +1182,7 @@ static ssize_t generic_file_direct_IO(in
out:
return retval;
}
+#endif

int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
@@ -1208,15 +1210,36 @@ int file_read_actor(read_descriptor_t *
* This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
-ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+ssize_t
+generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
ssize_t retval;

if ((ssize_t) count < 0)
return -EINVAL;

- if (filp->f_flags & O_DIRECT)
- goto o_direct;
+ if (filp->f_flags & O_DIRECT) {
+ loff_t pos = *ppos, size;
+ struct address_space *mapping;
+ struct inode *inode;
+
+ mapping = filp->f_dentry->d_inode->i_mapping;
+ inode = mapping->host;
+ retval = 0;
+ if (!count)
+ goto out; /* skip atime */
+ size = inode->i_size;
+ if (pos < size) {
+ if (pos + count > size)
+ count = size - pos;
+ retval = generic_file_direct_IO(READ, inode,
+ buf, pos, count);
+ if (retval > 0)
+ *ppos = pos + retval;
+ }
+ UPDATE_ATIME(filp->f_dentry->d_inode);
+ goto out;
+ }

retval = -EFAULT;
if (access_ok(VERIFY_WRITE, buf, count)) {
@@ -1229,36 +1252,14 @@ ssize_t generic_file_read(struct file *
desc.count = count;
desc.buf = buf;
desc.error = 0;
- do_generic_file_read(filp, ppos, &desc, file_read_actor);
-
+ do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval = desc.written;
if (!retval)
retval = desc.error;
}
}
- out:
+out:
return retval;
-
- o_direct:
- {
- loff_t pos = *ppos, size;
- struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
- struct inode *inode = mapping->host;
-
- retval = 0;
- if (!count)
- goto out; /* skip atime */
- size = inode->i_size;
- if (pos < size) {
- if (pos + count > size)
- count = size - pos;
- retval = generic_file_direct_IO(READ, filp, buf, count, pos);
- if (retval > 0)
- *ppos = pos + retval;
- }
- UPDATE_ATIME(filp->f_dentry->d_inode);
- goto out;
- }
}

static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -2199,8 +2200,8 @@ generic_file_write(struct file *file, co
}

if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_IO(WRITE, file,
- (char *) buf, count, pos);
+ written = generic_file_direct_IO(WRITE, inode,
+ (char *)buf, pos, count);
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
@@ -2208,7 +2209,8 @@ generic_file_write(struct file *file, co
mark_inode_dirty(inode);
}
*ppos = end;
- invalidate_inode_pages2(mapping);
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
}
/*
* Sync the fs metadata but not the minor inode changes and
--- 2.5.25/fs/ext2/inode.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/ext2/inode.c Sun Jul 7 19:35:39 2002
@@ -607,11 +607,10 @@ static int ext2_bmap(struct address_spac
}

static int
-ext2_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
- unsigned long blocknr, int blocksize)
+ext2_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr,
- blocksize, ext2_get_block);
+ return generic_direct_IO(rw, inode, buf, offset, count, ext2_get_block);
}

static int
--- 2.5.25/fs/Makefile~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/Makefile Sun Jul 7 19:35:39 2002
@@ -15,7 +15,7 @@ obj-y := open.o read_write.o devices.o f
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o
+ fs-writeback.o mpage.o direct-io.o

ifneq ($(CONFIG_NFSD),n)
ifneq ($(CONFIG_NFSD),)
--- 2.5.25/fs/block_dev.c~odirect-redux Sun Jul 7 19:35:39 2002
+++ 2.5.25-akpm/fs/block_dev.c Sun Jul 7 19:35:39 2002
@@ -105,9 +105,12 @@ static int blkdev_get_block(struct inode
return 0;
}

-static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize)
+static int
+blkdev_direct_IO(int rw, struct inode *inode, char *buf,
+ loff_t offset, size_t count)
{
- return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block);
+ return generic_direct_IO(rw, inode, buf, offset,
+ count, blkdev_get_block);
}

static int blkdev_writepage(struct page * page)

-


raw.c | 136 ++++++++++++------------------------------------------------------
1 files changed, 26 insertions(+), 110 deletions(-)

--- 2.5.25/drivers/char/raw.c~raw-use-generic Sun Jul 7 19:35:44 2002
+++ 2.5.25-akpm/drivers/char/raw.c Sun Jul 7 19:58:33 2002
@@ -8,8 +8,8 @@
* device are used to bind the other minor numbers to block devices.
*/

+#include <linux/init.h>
#include <linux/fs.h>
-#include <linux/iobuf.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/raw.h>
@@ -86,12 +86,6 @@ int raw_open(struct inode *inode, struct
return 0;
}

- if (!filp->f_iobuf) {
- err = alloc_kiovec(1, &filp->f_iobuf);
- if (err)
- return err;
- }
-
down(&raw_devices[minor].mutex);
/*
* No, it is a normal raw device. All we need to do on open is
@@ -256,124 +250,46 @@ int raw_ctl_ioctl(struct inode *inode,
return err;
}

-
-
-ssize_t raw_read(struct file *filp, char * buf,
- size_t size, loff_t *offp)
+ssize_t raw_read(struct file *filp, char * buf, size_t size, loff_t *offp)
{
return rw_raw_dev(READ, filp, buf, size, offp);
}

-ssize_t raw_write(struct file *filp, const char *buf,
- size_t size, loff_t *offp)
+ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
}

-#define SECTOR_BITS 9
-#define SECTOR_SIZE (1U << SECTOR_BITS)
-#define SECTOR_MASK (SECTOR_SIZE - 1)
-
-ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
- size_t size, loff_t *offp)
+ssize_t
+rw_raw_dev(int rw, struct file *filp, char *buf, size_t size, loff_t *offp)
{
- struct kiobuf * iobuf;
- int new_iobuf;
- int err = 0;
- unsigned long blocks;
- size_t transferred;
- int iosize;
- int minor;
- kdev_t dev;
- unsigned long limit;
- int sector_size, sector_bits, sector_mask;
- sector_t blocknr;
struct block_device *bdev;
-
- /*
- * First, a few checks on device size limits
- */
+ struct inode *inode;
+ int minor;
+ ssize_t ret = 0;

minor = minor(filp->f_dentry->d_inode->i_rdev);
-
- new_iobuf = 0;
- iobuf = filp->f_iobuf;
- if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
- /*
- * A parallel read/write is using the preallocated iobuf
- * so just run slow and allocate a new one.
- */
- err = alloc_kiovec(1, &iobuf);
- if (err)
- goto out;
- new_iobuf = 1;
- }
-
bdev = raw_devices[minor].binding;
- dev = to_kdev_t(bdev->bd_dev);
- sector_size = raw_devices[minor].sector_size;
- sector_bits = raw_devices[minor].sector_bits;
- sector_mask = sector_size - 1;
-
- limit = bdev->bd_inode->i_size >> sector_bits;
- if (!limit)
- limit = INT_MAX;
- dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
- major(dev), minor(dev), limit);
-
- err = -EINVAL;
- if ((*offp & sector_mask) || (size & sector_mask))
- goto out_free;
- err = 0;
- if (size)
- err = -ENXIO;
- if ((*offp >> sector_bits) >= limit)
- goto out_free;
-
- transferred = 0;
- blocknr = *offp >> sector_bits;
- while (size > 0) {
- blocks = size >> sector_bits;
- if (blocks > limit - blocknr)
- blocks = limit - blocknr;
- if (!blocks)
- break;
-
- iosize = blocks << sector_bits;
+ inode = bdev->bd_inode;

- err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
- if (err)
- break;
-
- err = brw_kiovec(rw, 1, &iobuf, raw_devices[minor].binding, &blocknr, sector_size);
-
- if (rw == READ && err > 0)
- mark_dirty_kiobuf(iobuf, err);
-
- if (err >= 0) {
- transferred += err;
- size -= err;
- buf += err;
- }
-
- blocknr += blocks;
-
- unmap_kiobuf(iobuf);
-
- if (err != iosize)
- break;
+ if (size == 0)
+ goto out;
+ if (size < 0) {
+ ret = -EINVAL;
+ goto out;
}
-
- if (transferred) {
- *offp += transferred;
- err = transferred;
+ if (*offp >= inode->i_size) {
+ ret = -ENXIO;
+ goto out;
}
+ if (size + *offp > inode->i_size)
+ size = inode->i_size - *offp;

- out_free:
- if (!new_iobuf)
- clear_bit(0, &filp->f_iobuf_lock);
- else
- free_kiovec(1, &iobuf);
- out:
- return err;
+ ret = generic_file_direct_IO(rw, inode, buf, *offp, size);
+ if (ret > 0)
+ *offp += ret;
+ if (inode->i_mapping->nrpages)
+ invalidate_inode_pages2(inode->i_mapping);
+out:
+ return ret;
}

-


2002-07-08 03:28:21

by Lincoln Dale

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
>
>I assume this is because the kiobuf-based implementation has to stop
>and wait for each 128k chunk, whereas this code streams the entire
>request, regardless of its size.
>
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.

i'll have a go at benchmark-testing these.

now have even bigger hardware than before: 2 x 2gbit/s FC HBAs in multiple
dual-processor (Dual P3 Xeon 550MHz 2M L2 cache and Dual P3 Xeon 833MHz
256K L2 cache) boxen, 8 x 15K RPM FC, 28 x 10K RPM SCSI.


cheers,

lincoln.

2002-07-08 07:24:15

by Andi Kleen

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Andrew Morton <[email protected]> writes:

> drivers/md/lvm-snap.c
> drivers/media/video/video-buf.c
> drivers/mtd/devices/blkmtd.c
> drivers/scsi/sg.c
>
> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs. XFS uses
> kiobufs a little bit - just to map the pages.

lkcd uses it too for its kernel crash dump. I suspect it wouldn't be that
hard to change.

> So with a bit of effort and maintainer-irritation, we can extract
> the kiobuf layer from the kernel.
>
> Do we want to do that?

I think yes - keeping two kinds of iovectors for IO (kiovecs and BIOs) seems
to be redundant.
kiovecs never fulfilled their original promise of a universal zero copy
container (e.g. they were too heavy weight for networking) so it's probably
best to remove them as a failed experiment.

-Andi

2002-07-08 07:41:46

by Ingo Oeser

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Sun, Jul 07, 2002 at 08:19:33PM -0700, Andrew Morton wrote:
> Question is: what do we want to do with this sucker? These are the
> remaining users of kiovecs:
>
> drivers/md/lvm-snap.c
> drivers/media/video/video-buf.c
> drivers/mtd/devices/blkmtd.c
> drivers/scsi/sg.c
>
> the video and mtd drivers seems to be fairly easy to de-kiobufize.
> I'm aware of one proprietary driver which uses kiobufs. XFS uses
> kiobufs a little bit - just to map the pages.

It would be nice if we could just map a set of user pages to a scatterlist.

Developers of mass transfer devices (video grabbers, dsp devices, sg and
many others) would just LOVE you for this ;-)

Block devices are the common case worth optimizing for, but character
devices just need to reimplement most of this, if they want the same
optimizations. Some devices need mass transfers and are NOT blockdevices.

Linux supports only one class of them properly: NICs.

Please consider supporting them better for 2.5 in stuff similiar to BIOs
and DMA to/from user pages.

Thanks & Regards

Ingo Oeser

2002-07-08 09:16:51

by Suparna Bhattacharya

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Mon, 08 Jul 2002 13:00:12 +0530, Andi Kleen wrote:

> Andrew Morton <[email protected]> writes:
>
>> drivers/md/lvm-snap.c
>> drivers/media/video/video-buf.c
>> drivers/mtd/devices/blkmtd.c
>> drivers/scsi/sg.c
>>
>> the video and mtd drivers seems to be fairly easy to de-kiobufize. I'm
>> aware of one proprietary driver which uses kiobufs. XFS uses kiobufs a
>> little bit - just to map the pages.
>
> lkcd uses it too for its kernel crash dump. I suspect it wouldn't be
> that hard to change.

No, it shouldn't be hard to change. In fact, we've had to think of
changing it for 2.5 anyhow, since most likely we can't afford bio
alloc's happening under the covers down that path.


>
>> So with a bit of effort and maintainer-irritation, we can extract the
>> kiobuf layer from the kernel.
>>
>> Do we want to do that?
>
> I think yes - keeping two kinds of iovectors for IO (kiovecs and BIOs)
> seems to be redundant.
> kiovecs never fulfilled their original promise of a universal zero copy
> container (e.g. they were too heavy weight for networking) so it's
> probably best to remove them as a failed experiment.
>

Yes, I think Kiobufs can go, and we can use something like kvecs
(from aio code base) instead which are better for representing
readv/writev, for the generic case (i.e. when its not just
for block i/o). Its easy enough to map kvecs into bio s or
zero-copy networking.

Regards
Suparna




> -Andi
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
> in the body of a message to [email protected] More majordomo
> info at http://vger.kernel.org/majordomo-info.html Please read the FAQ
> at http://www.tux.org/lkml/

2002-07-08 15:10:46

by Matt D. Robinson

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Andi Kleen wrote:
>
> Andrew Morton <[email protected]> writes:
>
> > drivers/md/lvm-snap.c
> > drivers/media/video/video-buf.c
> > drivers/mtd/devices/blkmtd.c
> > drivers/scsi/sg.c
> >
> > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > I'm aware of one proprietary driver which uses kiobufs. XFS uses
> > kiobufs a little bit - just to map the pages.
>
> lkcd uses it too for its kernel crash dump. I suspect it wouldn't be that
> hard to change.
>

We can remove their use from our 2.5 tree. Not a problem, as
there are other ways to accomplish what we want.

> -Andi

--Matt

2002-07-09 03:55:18

by Douglas Gilbert

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Ingo Oeser wrote:

>On Sun, Jul 07, 2002 at 08:19:33PM -0700, Andrew Morton wrote:
> > Question is: what do we want to do with this sucker? These are the
> > remaining users of kiovecs:
> >
> > drivers/md/lvm-snap.c
> > drivers/media/video/video-buf.c
> > drivers/mtd/devices/blkmtd.c
> > drivers/scsi/sg.c
> >
> > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > I'm aware of one proprietary driver which uses kiobufs. XFS uses
> > kiobufs a little bit - just to map the pages.
>
> It would be nice if we could just map a set of user pages to a scatterlist.

After disabling kiobufs in sg I would like such a drop
in replacement.

> Developers of mass transfer devices (video grabbers, dsp devices, sg and
> many others) would just LOVE you for this ;-)

Agreed. Tape devices could be added to your list.
Large page support will make for very efficient zero
copy IO.

> Block devices are the common case worth optimizing for, but character
> devices just need to reimplement most of this, if they want the same
> optimizations. Some devices need mass transfers and are NOT blockdevices.

> Please consider supporting them better for 2.5 in stuff similiar to BIOs
> and DMA to/from user pages.

CIOs?

Doug Gilbert

2002-07-09 04:17:38

by Andrew Morton

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Douglas Gilbert wrote:
>
> Ingo Oeser wrote:
>
> >On Sun, Jul 07, 2002 at 08:19:33PM -0700, Andrew Morton wrote:
> > > Question is: what do we want to do with this sucker? These are the
> > > remaining users of kiovecs:
> > >
> > > drivers/md/lvm-snap.c
> > > drivers/media/video/video-buf.c
> > > drivers/mtd/devices/blkmtd.c
> > > drivers/scsi/sg.c
> > >
> > > the video and mtd drivers seems to be fairly easy to de-kiobufize.
> > > I'm aware of one proprietary driver which uses kiobufs. XFS uses
> > > kiobufs a little bit - just to map the pages.
> >
> > It would be nice if we could just map a set of user pages to a scatterlist.
>
> After disabling kiobufs in sg I would like such a drop
> in replacement.

Ben had lightweight sg structures called `kvecs' and `kveclets'. And
library functions to map pages into them. And code to attach them
to BIOs. So we'll be looking at getting that happening.

The other common requirement (used in several places in the kernel,
and in LVM2) is the ability to perform bulk I/O against a blockdev - simply
read and write a chunk of disk into a list of kernel pages. So we'll need a
library function for that. And the O_DIRECT/raw implementation can be bent
around to use those things.

> > Developers of mass transfer devices (video grabbers, dsp devices, sg and
> > many others) would just LOVE you for this ;-)
>
> Agreed. Tape devices could be added to your list.
> Large page support will make for very efficient zero
> copy IO.

Haven't thought about large pages. We don't seem to have an implementation of
them yet, and I'm not sure how the DMA mapping API would get along with
them.

-

2002-07-09 08:13:50

by Ingo Oeser

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Mon, Jul 08, 2002 at 09:26:48PM -0700, Andrew Morton wrote:
> > > It would be nice if we could just map a set of user pages
> > > to a scatterlist.
> >
> > After disabling kiobufs in sg I would like such a drop
> > in replacement.
>
> Ben had lightweight sg structures called `kvecs' and `kveclets'. And
> library functions to map pages into them. And code to attach them
> to BIOs. So we'll be looking at getting that happening.

BIOs are for BLOCK devices we want sth. like this for CHARACTER
devices.

I just want sth. along the lines of this:

/* Pin down (COMPLETE!) user pages and put them into a scatter gather list */
int sg_map_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
unsigned long uaddr, int rw) {
int res, i;
struct page *pages[nr_pages];

down_read(&current->mm->mmap_sem);
res = get_user_pages(
current,
current->mm,
uaddr,
nr_pages,
rw == READ, /* logic is perversed^Wreversed here :-( */
0, /* don't force */
&pages[0],
NULL);
up_read(&current->mm->mmap_sem);

/* Errors and no page mapped should return here */
if (res <= 0) return res;

for (i=1; i < res; i++) {
sgl[i].page = pages[i];
}
return res;
}

/* And unmap them... */
int sg_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages) {
int i;

for (i=0; i < nr_pages; i++)
page_cache_release(sgl[i].page);

return 0;
}

Possibly more complicated and less error prone, but you get the
idea ;-)

Regards

Ingo Oeser
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

2002-07-11 02:23:34

by Lincoln Dale

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

At 08:19 PM 7/07/2002 -0700, Andrew Morton wrote:
>Here's a patch which converts O_DIRECT to go direct-to-BIO, bypassing
>the kiovec layer. It's followed by a patch which converts the raw
>driver to use the O_DIRECT engine.
>
>CPU utilisation is about the same as the kiovec-based implementation.
>Read and write bandwidth are the same too, for 128k chunks. But with
>one megabyte chunks, this implementation is 20% faster at writing.
..
>This is with a single (oldish) scsi disk on aic7xxx. I'd expect the
>margin to widen on higher-end hardware which likes to have more
>requests in flight.

sorry for the delay.
upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
FC 2300 HBA
driver isn't part of the standard kernel, and i had to update it to reflect the
io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
pain pain.
in the process, i discovered some races in their driver, so fixed them also.

the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
to Jens, Andrew & co for the changeover.

the results:
2.4.19pre8aa2 (with lockmeter and profile=2)
normal 167772160 blocks of 512 bytes in 778 seconds (105.27
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
mbyte/sec), CPUs ~55% idle
/dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
mbyte/sec), CPUs ~62% idle

2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
0x80000000 and
your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
below)
normal 167772160 blocks of 512 bytes in 607 seconds (134.81
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
mbyte/sec), CPUs ~93% idle
/dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
mbyte/sec), CPUs ~92% idle

2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
normal 167772160 blocks of 512 bytes in 615 seconds (133.06
mbyte/sec), CPUs 0% idle
O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
mbyte/sec), CPUs ~92% idle
/dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
mbyte/sec), CPUs ~92% idle


its a little hard to tell CPU load difference between direct-to-BIO versus
non-direct-to-BIO,
but clearly performance was at 100% of 2gbit/s Fibre Channel with
direct-to-bio; i've never
seen it sustain exactly 100% throughout a test before.

it was interesting to watch the test of 2.4.19pre8aa2 versus both 2.5.25
tests; whether it is a
change in the linux scheduler or some other artifact, all "worker" threads
(1 thread per disk)
completed at almost exactly the same time on 2.5.25 kernels.
in contrast, the benchmark on 2.4.19pre8aa2 had some disks complete their
work up to half
a minute prior to the last thread finishing -- clearly there was some
degree of "unfairness"
between threads that has since been addressed.

i'll see about getting dual 2gbit/s FC HBAs working now; my FC MultiPathing
configuration
is having a bad hair day today and i'm not physically near the test host in
question to
replace a physical fibre cable reporting errors.


details of how the test was conducted --

test host:
- dual P3 Xeon (733MHz), 2GB PC133 SDRAM (no HIGHMEM defined)
- single QLogic FC 2300 HBA operating at 2gbit/s in a 64/66 PCI slot

test:
- benchmark consisted of sequential read requests in parallel across
8 x 18G 15K RPM FC disks across the first 10GB of each disk
(why use "sequential reads" you ask? because its generally consistent --
i'm not measuring any i/o re-ordering/elevator behaviour, nor am
i measuring the speed of any disk-shelf controller cache or
disk-spindle seek speed. i'm purely measuring how fast data can
move from the storage subsystem to userspace).
- benchmark-test considered complete when all disks have gone idle.
- benchmark program is multithreaded, one thread per device
- each test run twice with machine rebooted in-between to ensure
repeatability

block sizes:
- for normal, test used 20971520 blocks of 512 bytes (10GB) on each disk
- for O_DIRECT, test used 2560 blocks of 4194304 bytes (10GB) on each disk
- for /dev/rawN, test used 2560 blocks of 4194304 bytes (10GB) on each disk


oops report #1: (virgin 2.5.25)
oops occurs on attempting to issue a read() on a O_DIRECT device.
this was corrected with Andrew's patch of:

Oops: 0000
CPU: 0
EIP: 0010:[<801c4e11>] Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010296
eax: 00000080 ebx: 00000000 ecx: f6e83b20 edx: f3e79c00
esi: f3e79cc0 edi: 00010000 ebp: f6e83b20 esp: f393bdcc
ds: 0018 es: 0018 ss: 0018
Stack: 8013e856 820fcde0 00000010 000000c0 2aca6000 00000000
f3e79cc0 00070000
00000070 801c4fac f6e83b20 f6e83b20 8013edbd 00000000
f6e83b20 00000010
00000010 00000000 00000000 00000010 00000001 80127acb
f56e9ae0 f54691e0
Call Trace: [<8013e856>] [<801c4fac>] [<8013edbd>] [<80127acb>]
[<8013e118>]
[<8013e05f>] [<801269de>] [<80126af8>] [<80140113>]
[<801400a0>] [<8012a9c7>]
[<8012abad>] [<8011404b>] [<8013a738>] [<8013a8ea>] [<80108a0b>]
Code: 8b 43 0c c1 ef 09 8b 50 38 8b 40 34 0f ac d0 09 89 c6 85 f6

>>EIP; 801c4e11 <generic_make_request+11/130> <=====
Trace; 8013e856 <bio_alloc+e6/1a0>
Trace; 801c4fac <submit_bio+5c/70>
Trace; 8013edbd <ll_rw_kio+1ad/210>
Trace; 80127acb <handle_mm_fault+6b/e0>
Trace; 8013e118 <brw_kiovec+a8/100>
Trace; 8013e05f <generic_direct_IO+ef/100>
Trace; 801269de <get_user_pages+ee/150>
Trace; 80126af8 <map_user_kiobuf+b8/100>
Trace; 80140113 <blkdev_direct_IO+23/30>
Trace; 801400a0 <blkdev_get_block+0/50>
Trace; 8012a9c7 <generic_file_direct_IO+167/1e0>
Trace; 8012abad <generic_file_read+ed/130>
Trace; 8011404b <schedule+33b/3a0>
Trace; 8013a738 <vfs_read+98/110>
Trace; 8013a8ea <sys_read+2a/40>
Trace; 80108a0b <syscall_call+7/b>
Code; 801c4e11 <generic_make_request+11/130>
00000000 <_EIP>:
Code; 801c4e11 <generic_make_request+11/130> <=====
0: 8b 43 0c mov 0xc(%ebx),%eax <=====
Code; 801c4e14 <generic_make_request+14/130>
3: c1 ef 09 shr $0x9,%edi
Code; 801c4e17 <generic_make_request+17/130>
6: 8b 50 38 mov 0x38(%eax),%edx
Code; 801c4e1a <generic_make_request+1a/130>
9: 8b 40 34 mov 0x34(%eax),%eax
Code; 801c4e1d <generic_make_request+1d/130>
c: 0f ac d0 09 shrd $0x9,%edx,%eax
Code; 801c4e21 <generic_make_request+21/130>
10: 89 c6 mov %eax,%esi
Code; 801c4e23 <generic_make_request+23/130>
12: 85 f6 test %esi,%esi


cheers,

lincoln.

2002-07-11 03:15:06

by Andrew Morton

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Lincoln Dale wrote:
>
> ...
> sorry for the delay.

Is cool. Thanks for doing this.

> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the QLogic
> FC 2300 HBA
> driver isn't part of the standard kernel, and i had to update it to reflect the
> io_request_lock -> host->host_lock, kdev_t and kbuild changes. urgh, pain
> pain pain.
> in the process, i discovered some races in their driver, so fixed them also.
>
> the 2.5 block i/o layer is FAR superior to the 2.4 block i/o layer. kudos
> to Jens, Andrew & co for the changeover.
>
> the results:
> 2.4.19pre8aa2 (with lockmeter and profile=2)
> normal 167772160 blocks of 512 bytes in 778 seconds (105.27
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 430 seconds (190.47
> mbyte/sec), CPUs ~55% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 463 seconds (176.86
> mbyte/sec), CPUs ~62% idle
>
> 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> 0x80000000 and
> your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> below)
> normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> mbyte/sec), CPUs ~93% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> mbyte/sec), CPUs ~92% idle

The 30% improvement in pagecache-buffered reads is somewhat unexpected.
The blockdevs are not using multipage BIOs - they're still using
buffer_head-based I/O for both reads and writes. Are you sure that
the 2.4 QLogic driver is using block-highmem?

> 2.5.25 with direct-to-BIO (and PAGE_OFFSET at 0x80000000)
> normal 167772160 blocks of 512 bytes in 615 seconds (133.06
> mbyte/sec), CPUs 0% idle
> O_DIRECT 20480 blocks of 4194304 bytes in 421 seconds (194.37
> mbyte/sec), CPUs ~92% idle
> /dev/rawN 20480 blocks of 4194304 bytes in 421 seconds (194.35
> mbyte/sec), CPUs ~92% idle

OK, so there's nothing there at all really (or there may be. Hard
to tell when the interface has saturated).

But on my lowly scsi disks I was seeing no change in read bandwidth
either. Only writes benefitted for some reason. Can you do
some write testing as well? If you test writes through the pagecache,
use ext2 and not direct-to-blockdev please - that'll take the multipage
BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
isn't very optimised at all.

Thanks.

-

2002-07-11 03:24:27

by Lincoln Dale

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

At 08:24 PM 10/07/2002 -0700, Andrew Morton wrote:
> > 2.5.25 ('virgin' 2.5.25 with the exception of changing PAGE_OFFSET to
> > 0x80000000 and
> > your O_DIRECT-on-blockdev patch to stop it oopsing -- oops report
> > below)
> > normal 167772160 blocks of 512 bytes in 607 seconds (134.81
> > mbyte/sec), CPUs 0% idle
> > O_DIRECT 20480 blocks of 4194304 bytes in 420 seconds (194.61
> > mbyte/sec), CPUs ~93% idle
> > /dev/rawN 20480 blocks of 4194304 bytes in 422 seconds (193.84
> > mbyte/sec), CPUs ~92% idle
>
>The 30% improvement in pagecache-buffered reads is somewhat unexpected.
>The blockdevs are not using multipage BIOs - they're still using
>buffer_head-based I/O for both reads and writes. Are you sure that
>the 2.4 QLogic driver is using block-highmem?

pretty sure -- there's no highmem in the system: :-)
(i.e. i changed PAGE_OFFSET in order to prevent there being any highmem).

[root@mel-stglab-host1 root]# cat /proc/meminfo
MemTotal: 1945680 kB
MemFree: 1853812 kB
MemShared: 0 kB
Cached: 29536 kB
SwapCached: 2520 kB
Active: 32336 kB
Inactive: 8336 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 1945680 kB
LowFree: 1853812 kB
SwapTotal: 2047992 kB
SwapFree: 2037268 kB
Dirty: 1396 kB
Writeback: 0 kB

>OK, so there's nothing there at all really (or there may be. Hard
>to tell when the interface has saturated).
>
>But on my lowly scsi disks I was seeing no change in read bandwidth
>either. Only writes benefitted for some reason. Can you do
>some write testing as well? If you test writes through the pagecache,
>use ext2 and not direct-to-blockdev please - that'll take the multipage
>BIOs, buffer_head-bypass route. Plain old read and write of /dev/XdYY
>isn't very optimised at all.

will do.

do you have any other preferences --
- ext2 or ext3?
- if ext3, change the journalling mode?
- i/o to a single large file or multiple files per spindle?

i can also add combinations of read/write & seeking also.
what kind of file-size should i be using?


cheers,

lincoln.

2002-07-11 06:08:10

by Adam J. Richter

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

Douglas Gilbert wrote:
>Ingo Oeser wrote:
[...]
>> It would be nice if we could just map a set of user pages to a scatterlist.
>
>After disabling kiobufs in sg I would like such a drop
>in replacement.
>
>> Developers of mass transfer devices (video grabbers, dsp devices, sg and
>> many others) would just LOVE you for this ;-)
>
>Agreed. Tape devices could be added to your list.
>Large page support will make for very efficient zero
>copy IO.
>
>> Block devices are the common case worth optimizing for, but character
>> devices just need to reimplement most of this, if they want the same
>> optimizations. Some devices need mass transfers and are NOT blockdevices.
>
>> Please consider supporting them better for 2.5 in stuff similiar to BIOs
>> and DMA to/from user pages.
>
>CIOs?

This is what I want to accomplish in my proposal to
pull most of the DMA transfer optimization code up from block
devices by generalizing DMA targets and turning struct scatterlist
into a linked list, discussed here:

http://marc.theaimsgroup.com/?t=102487685000002&r=1&w=2

I have not started coding this yet because:

1. I'm tracking down a bug in the next revision of my proposed
bio_append patch (which eliminates {read,write}_full_page from
fs/buffers.c), and I want to hit that ball out of my court first.

2. I want to look at aio to see if it has a better way or if it
could benefit from this.

3. I want to accomodate Dave Miller's request for a non-PCI
generalization of pci_alloc_consistent, pci_map_single, etc.,
first, and that will depend on struct device, for which there are
some relevant changes working their way from Patrick Mochel
to Linus.

4. After getting a general dma_alloc_consistent, etc. interface,
then I want to create a struct dma_target, to abstract
out the DMA capabilities currently maintained by the block
layer. I hope that by doing this in stages, that it will be
more palatable to Jens, who expressed concern that the
my proposal to go to a linked list for struct scatterlist
was a bit too much change.

Then, I think we'll be in a better position to go to a struct
scatterlist linked list or something similar that can be used by most
if not all big producers of IO.

In the meantime, killing off kiobufs should be helpful.

Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."

2002-07-11 17:27:08

by Ingo Oeser

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Mon, Jul 08, 2002 at 09:26:48PM -0700, Andrew Morton wrote:
> Ben had lightweight sg structures called `kvecs' and `kveclets'. And
> library functions to map pages into them. And code to attach them
> to BIOs. So we'll be looking at getting that happening.

Ok, I've looked at them and they don't help me at all.

A user, who splits its IO into single pages, wants to do DMA and
needs bus addresses for that. So he needs "struct scatterlist".

If one doesn't need to DMA, one can do copy_{from,to}_user
directly with an immediate buffer, so the splitup isn't needed.

For this I conclude, that using the EXISTING 'struct scatterlist'
will be enough for both. Attaching a vector of these to the BIOs
is no problem. Neither it is for CHARACTER device IOs (CIOs).

So by using this simple abstraction we MIGHT waste only 4-8 bytes
per page submitted, but by page-splitting the IO only for devices,
that need DMA (e.g. make the request that explicitly) we don't
really waste it and support BIOs and CIOs the same way.

I will refine that code for my own uses anyway, so if nobody with
more clues about IO than me implements it, I will submit it
later.

Regards

Ingo Oeser
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth

2002-07-11 19:49:38

by Jesse Barnes

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> sorry for the delay.
> upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> had to update it to reflect the io_request_lock -> host->host_lock,
> kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> discovered some races in their driver, so fixed them also.

So you ported the qla2x00 driver forward to 2.5? Would it be possible
to post that driver? Not having it has held up some testing I'd like
to do...

Thanks,
Jesse

2002-07-11 20:40:38

by Daniel Phillips

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

On Tuesday 09 July 2002 06:26, Andrew Morton wrote:
> Ben had lightweight sg structures called `kvecs' and `kveclets'. And
> library functions to map pages into them. And code to attach them
> to BIOs. So we'll be looking at getting that happening.

And as I recall, a grand plan was hatched at the kernel summit to
slice and dice all the various forms of block IO into that model.

Seeing -> believing

--
Daniel

2002-07-11 23:39:02

by Lincoln Dale

[permalink] [raw]
Subject: Re: direct-to-BIO for O_DIRECT

At 12:52 PM 11/07/2002 -0700, Jesse Barnes wrote:
>On Thu, Jul 11, 2002 at 12:25:03PM +1000, Lincoln Dale wrote:
> > sorry for the delay.
> > upgrading from 2.4.19 to 2.5.25 took longer than expected, since the
> > QLogic FC 2300 HBA driver isn't part of the standard kernel, and i
> > had to update it to reflect the io_request_lock -> host->host_lock,
> > kdev_t and kbuild changes. urgh, pain pain pain. in the process, i
> > discovered some races in their driver, so fixed them also.
>
>So you ported the qla2x00 driver forward to 2.5? Would it be possible
>to post that driver? Not having it has held up some testing I'd like
>to do...

these are the changes to the qla2x00 6.1 beta 2 driver, as downloadable
from the QLogic web-site.

there were also some changes required to the makefiles to get this working
with linux-2.5 kbuild infrastructure.
the hacks i did there are awful and i'm not prepared to put my name against
those bad hacks just yet. :-)

===
diff -urN base/listops.h 2.5.25/listops.h
--- base/listops.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/listops.h Fri Jul 12 09:29:45 2002
@@ -324,9 +324,9 @@
return;
}

- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(ha->host->host_lock, flags);
qla2x00_callback(ha, sp->cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(ha->host->host_lock, flags);
}

/**************************************************************************
diff -urN base/qla2x00.c 2.5.25/qla2x00.c
--- base/qla2x00.c Wed Jul 10 18:32:25 2002
+++ 2.5.25/qla2x00.c Fri Jul 12 09:29:51 2002
@@ -532,10 +532,11 @@
static int recoveryTime = MAX_RECOVERYTIME;
static int failbackTime = MAX_FAILBACKTIME;
#endif /* end of MPIO_SUPPORT */
-#ifdef MODULE
+
static char *ql2xopts = NULL;
static int ql2xmaxqdepth = 0;

+#ifdef MODULE
/* insmod qla2100 ql2xopts=verbose" */
MODULE_PARM(ql2xopts, "s");
MODULE_PARM(ql2xmaxqdepth, "i");
@@ -552,7 +553,6 @@
MODULE_LICENSE("GPL");
#endif

-#include "listops.h"
#include "qla_fo.cfg"


@@ -564,6 +564,7 @@
static char dummy_buffer[60] = "Please don't add commas in your insmod
command!!\n";

#endif
+#include "listops.h"

#if QLA2100_LIPTEST
static int qla2x00_lip = 0;
@@ -1459,10 +1460,6 @@

ENTER("qla2x00_detect");

-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
#ifdef MODULE
DEBUG2(printk("DEBUG: qla2x00_set_info starts at address = %p\n",
qla2x00_set_info);)
@@ -1497,9 +1494,6 @@

if (!pci_present()) {
printk("scsi: PCI not present\n");
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
return 0;
} /* end of !pci_present() */

@@ -1542,9 +1536,6 @@
continue;
}
*/
-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif

if ((host =
scsi_register(
@@ -1609,9 +1600,6 @@
"scsi%d: [ERROR] Failed to allocate "
"memory for adapter\n",host->host_no);
qla2x00_mem_free(ha);
-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
continue;
}

@@ -1654,10 +1642,6 @@

ha->list_lock = SPIN_LOCK_UNLOCKED;

-#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
-#endif
-
if (qla2x00_initialize_adapter(ha) &&
!(ha->device_flags & DFLG_NO_CABLE)) {

@@ -1706,8 +1690,7 @@
ha->fabricid[SIMPLE_NAME_SERVER].in_use = TRUE;

#if NEW_EH_CODE
-
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
#endif

/* Register our resources with Linux */
@@ -1719,7 +1702,7 @@
qla2x00_mem_free(ha);
scsi_unregister(host);
#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif
continue;
}
@@ -1741,7 +1724,7 @@
spin_unlock_irqrestore(&ha->hardware_lock, flags);

#if NEW_EH_CODE
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
#endif

#if MPIO_SUPPORT
@@ -1805,10 +1788,6 @@
}
} /* end of FOR */

-#if NEW_EH_CODE
- spin_lock_irq(&io_request_lock);
-#endif
-
LEAVE("qla2x00_detect");

return num_hosts;
@@ -2217,7 +2196,7 @@
ha = (scsi_qla_host_t *) host->hostdata;

cmd->scsi_done = fn;
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);

/* Allocate a command packet from the "sp" pool.
* If we cant get back one then let scsi layer
@@ -2227,7 +2206,7 @@
printk(KERN_WARNING
"queuecommand: Couldn't allocate memory "
"for sp - retried.\n");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);

LEAVE("qla2x00_queuecommand");
return(1);
@@ -2309,14 +2288,14 @@
(int)ha->host_no,t,l);)

CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
return(0);
}

if (l >= ha->max_luns) {
CMD_RESULT(cmd) = DID_NO_CONNECT << 16;
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
__sp_put(ha, sp);
LEAVE("qla2x00_queuecommand");
return(0);
@@ -2379,7 +2358,7 @@
tasklet_schedule(&ha->run_qla_task);

LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}

@@ -2427,7 +2406,7 @@
qla2x00_extend_timeout(sp->cmd ,60);

LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
} else {
sp->flags &= ~SRB_BUSY; /* v5.21b16 */
@@ -2449,7 +2428,7 @@
add_to_scsi_retry_queue(ha,sp);

LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}

@@ -2462,7 +2441,7 @@

COMTRACE('c')
LEAVE("qla2x00_queuecommand");
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
return (0);
}

@@ -2526,10 +2505,10 @@
break;


- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(2*HZ);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);

} while (time_before_eq(jiffies, max_wait_time));

@@ -2811,7 +2790,7 @@
sp_get(ha,sp);

spin_unlock_irqrestore(&ha->hardware_lock, flags);
- spin_unlock(&io_request_lock);
+ spin_unlock(host->host_lock);

if (qla2x00_abort_command(ha, sp)) {
DEBUG2(printk("qla2xxx_eh_abort:
abort_command "
@@ -2825,7 +2804,7 @@
}

sp_put(ha,sp);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
spin_lock_irqsave(&ha->hardware_lock, flags);

/*
@@ -2862,15 +2841,15 @@
*/
if ((which_ha & BIT_0) && (!list_empty(&ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}
if ((which_ha & BIT_1) && (!list_empty(&vis_ha->done_queue))) {
DEBUG3(printk("qla2xxx_eh_abort: calling done for
vis_ha.\n");)
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(host->host_lock);
qla2x00_done(vis_ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(host->host_lock);
}

DEBUG(printk("qla2xxx_eh_abort: Exiting. return_status=0x%x.\n",
@@ -2975,22 +2954,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {

clear_bit(DEVICE_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_device_reset(ha, t) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);

set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);

return_status = SUCCESS;
}
@@ -3010,9 +2989,9 @@
DEBUG3(printk("qla2xxx_eh_device_reset: calling "
"done for ha.\n");)

- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}

DRIVER_UNLOCK
@@ -3114,22 +3093,22 @@
ha->cfg_active || ha->loop_state != LOOP_READY)) {

clear_bit(LOOP_RESET_NEEDED, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
if (qla2x00_loop_reset(ha) != 0) {
return_status = FAILED;
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
/*
* Wait a while for the loop to come back. Return SUCCESS
* for the kernel to try again.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);

set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);

return_status = SUCCESS;
}
@@ -3147,9 +3126,9 @@
if (!list_empty(&ha->done_queue)) {
DEBUG3(printk("qla2xxx_eh_bus_reset: calling done for
ha.\n");)

- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}

DEBUG2_3(printk("qla2xxx_eh_bus_reset: exiting. status=0x%x.\n",
@@ -3272,7 +3251,7 @@

if (!(test_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags))) {
set_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);

if (qla2x00_abort_isp(ha, 1)) {
/* failed. try later */
@@ -3292,27 +3271,27 @@
return_status = SUCCESS;
}

- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
clear_bit(ABORT_ISP_ACTIVE, &ha->dpc_flags);
} else {
/*
* Already active. Sleep a while then return SUCCESS for
* kernel to retry the IO.
*/
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);

set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(5 * HZ);

- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);

return_status = SUCCESS;
}

if (!list_empty(&ha->done_queue)) {
- spin_unlock_irq(&io_request_lock);
+ spin_unlock_irq(ha->host->host_lock);
qla2x00_done(ha);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
}

DRIVER_UNLOCK
@@ -3595,9 +3574,9 @@
tasklet_schedule(&ha->run_qla_task);

if (found) {
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(vis_ha, TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
} else {
printk(KERN_INFO
"qla2x00_abort: Couldn't Abort command = %p\n", cmd);
@@ -3851,12 +3830,12 @@
* mid-level code can expect completions
momentitarily.
*/
#if NEW_EH_CODE
- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
if (qla2x00_abort_isp(ha, 0)) {
/* failed. try later */
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);
}
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
#else
set_bit(ISP_ABORT_NEEDED, &ha->dpc_flags);

@@ -3874,9 +3853,9 @@
DEBUG3(printk("qla2x00_reset: going to call restart_queues. "
"jiffies=%lx.\n", jiffies);)

- spin_unlock(&io_request_lock);
+ spin_unlock(ha->host->host_lock);
qla2x00_restart_queues(ha,TRUE);
- spin_lock_irq(&io_request_lock);
+ spin_lock_irq(ha->host->host_lock);
DRIVER_UNLOCK

COMTRACE('r')
@@ -3946,7 +3925,7 @@
qla2x00_stats.irqhba = ha;

/* Prevent concurrent access to adapters register */
- /* spin_lock_irqsave(&io_request_lock, cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, cpu_flags);*/

reg = ha->iobase;

@@ -3998,7 +3977,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);

- /* spin_unlock_irqrestore(&io_request_lock, cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, cpu_flags);*/

/* Wakeup the DPC routine */
if ((!ha->flags.mbox_busy &&
@@ -4179,7 +4158,7 @@

QLA2100_DPC_LOCK(ha);

- /* spin_lock_irqsave(&io_request_lock, ha->cpu_flags);*/
+ /* spin_lock_irqsave(host->host_lock, ha->cpu_flags);*/
ha->dpc_active = 1;

/* Determine what action is necessary */
@@ -4477,7 +4456,7 @@
if (!list_empty(&ha->done_queue))
tasklet_schedule(&ha->run_qla_task);

- /* spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);*/
+ /* spin_unlock_irqrestore(host->host_lock, ha->cpu_flags);*/

ha->dpc_active = 0;

@@ -4778,9 +4757,9 @@

/* Call the mid-level driver interrupt handler */
#if 0
- spin_lock_irqsave(&io_request_lock, flags);
+ spin_lock_irqsave(host->host_lock, flags);
qla2x00_callback(ha,cmd);
- spin_unlock_irqrestore(&io_request_lock, flags);
+ spin_unlock_irqrestore(host->host_lock, flags);
#else

sp_put(ha, sp);
@@ -15846,7 +15825,7 @@
printk(KERN_INFO
"qla2x00_apidev: open MAJOR number = %d, "
"MINOR number = %d\n",
- MAJOR(inode->i_rdev), MINOR(inode->i_rdev));
+ major(inode->i_rdev), minor(inode->i_rdev));

return 0;
}
@@ -15902,7 +15881,8 @@
APIDEV_NODE, apidev_major);)

proc_mknod(APIDEV_NODE, 0777+S_IFCHR, host->hostt->proc_dir,
- (kdev_t)MKDEV(apidev_major,0));
+ (kdev_t)mk_kdev(apidev_major,0));
+

return 0;
}
diff -urN base/qla2x00.h 2.5.25/qla2x00.h
--- base/qla2x00.h Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00.h Fri Jul 12 09:29:51 2002
@@ -2682,10 +2682,8 @@
present: 0, /* number of 7xxx's present */\
unchecked_isa_dma: 0, /* no memory DMA restrictions */\
use_clustering: ENABLE_CLUSTERING, \
- use_new_eh_code: 1, \
max_sectors: 512, \
- highmem_io: 1, \
- emulated: 0 \
+ highmem_io: 1 \
}
#else /* KERNEL_VERSION < 2.5.7 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,8)
diff -urN base/qla2x00_ioctl.c 2.5.25/qla2x00_ioctl.c
--- base/qla2x00_ioctl.c Tue Apr 16 05:15:40 2002
+++ 2.5.25/qla2x00_ioctl.c Fri Jul 12 09:29:51 2002
@@ -2509,14 +2509,14 @@
ha->host_no);)

/* get spin lock for this operation */
- spin_lock_irqsave(&io_request_lock, ha->cpu_flags);
+ spin_lock_irqsave(ha->host->host_lock, ha->cpu_flags);

qla2x00_queuecommand(pscsi_cmd, (void *) qla2x00_scsi_pt_done);

ha->ioctl->cmpl_timer.expires = jiffies + ha->ioctl->ioctl_tov * HZ;
add_timer(&ha->ioctl->cmpl_timer);

- spin_unlock_irqrestore(&io_request_lock, ha->cpu_flags);
+ spin_unlock_irqrestore(ha->host->host_lock, ha->cpu_flags);
down(&ha->ioctl->cmpl_sem);

del_timer(&ha->ioctl->cmpl_timer);
===


cheers,

lincoln.