2002-02-08 07:53:34

by Benjamin LaHaise

Subject: patch: aio + bio for raw io

Quick message: this patch makes aio use bio directly for brw_kvec_async.
This is against yesterday's patchset. Comments?

-ben

===== fs/Makefile 1.15 vs 1.16 =====
--- 1.15/fs/Makefile Wed Jan 30 02:21:55 2002
+++ 1.16/fs/Makefile Fri Feb 8 14:57:54 2002
@@ -22,6 +22,7 @@
obj-y += noquot.o
endif

+export-objs += aio.o
obj-y += aio.o

subdir-$(CONFIG_PROC_FS) += proc
===== fs/buffer.c 1.60 vs 1.61 =====
--- 1.60/fs/buffer.c Tue Jan 15 05:53:34 2002
+++ 1.61/fs/buffer.c Fri Feb 8 17:22:14 2002
@@ -54,6 +54,8 @@
#include <asm/bitops.h>
#include <asm/mmu_context.h>

+extern struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec);
+
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
@@ -2764,16 +2766,26 @@
* It is up to the caller to make sure that there are enough blocks
* passed in to completely map the iobufs to disk.
*/
+static int brw_kvec_end_io(struct bio *bio, int nr_sectors)
+{
+ kvec_cb_t cb = bio->cb;
+ int res = nr_sectors * 512;
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ res = -EIO;
+ bio_put(bio);
+ cb.fn(cb.data, cb.vec, res);
+ return 0;
+}

-int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift)
+int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, sector_t blknr, int sector_shift)
{
struct kvec *vec = cb.vec;
struct kveclet *veclet;
- int err;
int length;
unsigned sector_size = 1 << sector_shift;
int i;

+ struct bio *bio;
struct brw_cb *brw_cb;

if (!vec->nr)
@@ -2795,125 +2807,21 @@
if (length < (blocks << sector_shift))
BUG();

- /*
- * OK to walk down the iovec doing page IO on each page we find.
- */
- err = 0;
-
if (!blocks) {
printk("brw_kiovec_async: !i\n");
return -EINVAL;
}

- /* FIXME: tie into userbeans here */
- brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL);
- if (!brw_cb)
+ bio = bio_setup_from_kvec(GFP_KERNEL, vec);
+ if (unlikely(!bio))
return -ENOMEM;
-
- brw_cb->cb = cb;
- brw_cb->nr = 0;
-
- /* This is ugly. FIXME. */
- for (i=0, veclet=vec->veclet; i<vec->nr; i++,veclet++) {
- struct page *page = veclet->page;
- unsigned offset = veclet->offset;
- unsigned length = veclet->length;
-
- if (!page)
- BUG();
-
- while (length > 0) {
- struct buffer_head *tmp;
- tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
- err = -ENOMEM;
- if (!tmp)
- goto error;
-
- tmp->b_dev = B_FREE;
- tmp->b_size = sector_size;
- set_bh_page(tmp, page, offset);
- tmp->b_this_page = tmp;
-
- init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
- tmp->b_dev = dev;
- tmp->b_blocknr = blknr++;
- tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
- | (1 << BH_Req);
- tmp->b_private = brw_cb;
-
- if (rw == WRITE) {
- set_bit(BH_Uptodate, &tmp->b_state);
- clear_bit(BH_Dirty, &tmp->b_state);
- }
-
- brw_cb->bh[brw_cb->nr++] = tmp;
- length -= sector_size;
- offset += sector_size;
-
- if (offset >= PAGE_SIZE) {
- offset = 0;
- break;
- }
-
- if (brw_cb->nr >= blocks)
- goto submit;
- } /* End of block loop */
- } /* End of page loop */
-
-submit:
- atomic_set(&brw_cb->io_count, brw_cb->nr+1);
- /* okay, we've setup all our io requests, now fire them off! */
- for (i=0; i<brw_cb->nr; i++)
- submit_bh(rw, brw_cb->bh[i]);
- brw_cb_put(brw_cb);
+ bio->cb = cb;
+ bio->bi_sector = blknr;
+ bio->bi_dev = dev;
+ bio->bi_vcnt = vec->nr;
+ bio->bi_size = length;
+ bio->bi_end_io = brw_kvec_end_io;
+ submit_bio(rw, bio);

return 0;
-
-error:
- /* Walk brw_cb_table freeing all the goop associated with each kiobuf */
- if (brw_cb) {
- /* We got an error allocating the bh'es. Just free the current
- buffer_heads and exit. */
- for (i=0; i<brw_cb->nr; i++)
- kmem_cache_free(bh_cachep, brw_cb->bh[i]);
- kfree(brw_cb);
- }
-
- return err;
-}
-#if 0
-int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
- kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
-{
- int i;
- int transferred = 0;
- int err = 0;
-
- if (!nr)
- return 0;
-
- /* queue up and trigger the io */
- err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
- if (err)
- goto out;
-
- /* wait on the last iovec first -- it's more likely to finish last */
- for (i=nr; --i >= 0; )
- kiobuf_wait_for_io(iovec[i]);
-
- run_task_queue(&tq_disk);
-
- /* okay, how much data actually got through? */
- for (i=0; i<nr; i++) {
- if (iovec[i]->errno) {
- if (!err)
- err = iovec[i]->errno;
- break;
- }
- transferred += iovec[i]->length;
- }
-
-out:
- return transferred ? transferred : err;
}
-#endif
===== fs/bio.c 1.14 vs 1.15 =====
--- 1.14/fs/bio.c Thu Feb 7 16:13:25 2002
+++ 1.15/fs/bio.c Fri Feb 8 17:22:14 2002
@@ -151,6 +151,22 @@
return NULL;
}

+struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec)
+{
+ struct bio *bio = mempool_alloc(bio_pool, gfp_mask);
+ struct bio_vec *bvl = NULL;
+
+ if (unlikely(!bio))
+ return NULL;
+
+ bio->bi_max = kvec->max_nr;
+ bio_init(bio);
+ bio->bi_destructor = bio_destructor;
+ bio->bi_io_vec = (struct bio_vec *)kvec->veclet; /* shoot me */
+ bio->bi_flags |= 1 << BIO_CLONED; /* don't free the vec */
+ return bio;
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
===== include/linux/kiovec.h 1.1 vs 1.2 =====
--- 1.1/include/linux/kiovec.h Sat Jan 12 02:19:10 2002
+++ 1.2/include/linux/kiovec.h Fri Feb 8 15:00:54 2002
@@ -6,8 +6,8 @@

struct kveclet {
struct page *page;
- unsigned offset;
unsigned length;
+ unsigned offset;
};

struct kvec {
===== include/linux/bio.h 1.11 vs 1.12 =====
--- 1.11/include/linux/bio.h Wed Feb 6 01:23:04 2002
+++ 1.12/include/linux/bio.h Fri Feb 8 17:22:23 2002
@@ -26,6 +26,8 @@
#define BIO_VMERGE_BOUNDARY 0
#endif

+#include <linux/kiovec.h>
+
#define BIO_DEBUG

#ifdef BIO_DEBUG
@@ -91,6 +93,7 @@
void *bi_private;

bio_destructor_t *bi_destructor; /* destructor */
+ kvec_cb_t cb;
};

/*
===== fs/aio.c 1.1 vs 1.2 =====
--- 1.1/fs/aio.c Wed Jan 30 13:12:34 2002
+++ 1.2/fs/aio.c Fri Feb 8 14:58:10 2002
@@ -37,6 +37,7 @@
#include <linux/smp_lock.h>
#include <linux/compiler.h>
#include <linux/poll.h>
+#include <linux/module.h>

#include <asm/uaccess.h>

@@ -964,3 +965,8 @@
}

__initcall(aio_setup);
+EXPORT_SYMBOL_GPL(generic_file_kvec_read);
+EXPORT_SYMBOL_GPL(generic_file_aio_read);
+EXPORT_SYMBOL_GPL(generic_file_kvec_write);
+EXPORT_SYMBOL_GPL(generic_file_aio_write);
+EXPORT_SYMBOL_GPL(generic_file_new_read);


2002-02-08 09:37:29

by Suparna Bhattacharya

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08, 2002 at 02:53:13AM -0500, Benjamin LaHaise wrote:
> Quick message: this patch makes aio use bio directly for brw_kvec_async.
> This is against yesterday's patchset. Comments?
>

I was looking for this in yesterday's aio patchset for 2.5 and was just
about to send you a note asking you about this :)

You chose to add a kvec_cb field to the bio structure rather than use
bi_private ?

For the raw path, you are OK since you never have to copy data out of
the kvecs after i/o completion, and unmap_kvec only looks at veclet pages.
So the fact that the block layer can change the offset and len fields
in the veclets doesn't affect you, but I thought I'd mention it as a
point of caution anyhow ...

2002-02-08 21:09:05

by Badari Pulavarty

Subject: Re: patch: aio + bio for raw io

>
> Quick message: this patch makes aio use bio directly for brw_kvec_async.
> This is against yesterday's patchset. Comments?
>
> -ben

Hi Ben,

I am looking at the 2.5 patch you sent out. I have a few questions/comments:

1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
each bio could handle only BIO_MAX_SIZE (ll_rw_kio() creates one bio
for each BIO_MAX_SIZE of IO).

And also, currently BIO_MAX_SIZE is only 64K. In fact, if I try to issue a
64K IO using submit_bio(), I get the following BUG() on my QLOGIC controller.

kernel BUG at ll_rw_blk.c:1336

Code is: BUG_ON(bio_sectors(bio) > q->max_sectors);

bio_sectors(bio) is 128
q->max_sectors is 64 (for QLOGIC ISP)

Is this going to be fixed ? Should a bio be able to handle IO of any
size ? Or should we create a new bio for each BIO_MAX_SIZE of IO ?


2) Could you please make map_user_kvec() generic enough to handle mapping
of multiple iovecs to a single kvec (to handle readv/writev). I think
this is a very easy change:

* Add alloc_kvec() and take out the kmalloc() from map_user_kvec().
* Change map_user_kvec() to start mapping from kvec->veclet[kvec->nr]
instead of kvec->veclet[0]

This way the allocation of a kvec big enough to hold all the iovecs can
be done at a higher level, and map_user_kvec() can be called in a loop,
once per iovec.
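
Roughly what I have in mind (just a sketch -- alloc_kvec()/free_kvec()
and this 4-argument map_user_kvec() signature are the proposed
interfaces described above, not what is in your current patchset):

	/* Sketch only: map a whole user iovec into one kvec for
	 * readv/writev, using the hypothetical helpers proposed
	 * above. */
	struct kvec *map_user_iovec(int rw, struct iovec *iov, int nr_segs,
				    int max_veclets)
	{
		struct kvec *vec = alloc_kvec(max_veclets);
		int i, err;

		if (!vec)
			return ERR_PTR(-ENOMEM);

		for (i = 0; i < nr_segs; i++) {
			/* appends pages starting at vec->veclet[vec->nr] */
			err = map_user_kvec(rw, vec,
					    (unsigned long)iov[i].iov_base,
					    iov[i].iov_len);
			if (err) {
				unmap_kvec(vec, 0);
				free_kvec(vec);
				return ERR_PTR(err);
			}
		}
		return vec;
	}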

Then we can add readv/writev support for RAW IO very easily. I am looking
at making use of kvec for RAW IO and also adding support for readv/writev.
(I already sent a patch to do this - but since you ported all your AIO
work to 2.5, we can work on your infrastructure :)

Please let me know what you think.

Thanks,
Badari

2002-02-08 21:22:09

by Arjan van de Ven

Subject: Re: patch: aio + bio for raw io

In article <[email protected]> you wrote:

> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
> each bio could handle only BIO_MAX_SIZE (ll_rw_kio() creates one bio
> for each BIO_MAX_SIZE of IO).

> And also, currently BIO_MAX_SIZE is only 64K. In fact, if I try to issue a
> 64K IO using submit_bio(), I get the following BUG() on my QLOGIC controller.

> kernel BUG at ll_rw_blk.c:1336

> Code is: BUG_ON(bio_sectors(bio) > q->max_sectors);

> bio_sectors(bio) is 128
> q->max_sectors is 64 (for QLOGIC ISP)

this is a bio bug. BIO should split if needed.
(Oh and qlogic hw can easily handle 1024 sector-sized requests)



> 2) Could you please make map_user_kvec() generic enough to handle mapping
> of multiple iovecs to a single kvec (to handle readv/writev).

I think the "move all readv/writev to one single kvec" is a mistake. The
OPPOSITE should happen. If you submit a huge single vector it should be
split up internally. This would also be the fix for the "submit the entire
vector so far and sync wait on it after 512Kb" performance bug in the normal
rawio code, since it can just submit partial (say 256Kb sized) vectors and
wait for ANY one of them before going over a 512Kb boundary.

Sure, readv/writev are not optimal now, but that is because the kernel
waits for IO completion per vector element instead of submitting them all
async and waiting at the end (or in the aio case, not waiting at all).

Greetings,
Arjan van de Ven.

2002-02-08 22:03:18

by Benjamin LaHaise

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08, 2002 at 03:10:09PM +0530, Suparna Bhattacharya wrote:
> You chose to add a kvec_cb field to the bio structure rather than use
> bi_private ?

Yup. I'm lazy. Also, the cb struct actually needs 2 pointers, so just
bi_private isn't enough.
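
For reference, given how it gets used in the patch (cb.fn(cb.data,
cb.vec, res)), the struct is basically this (reconstructed from usage;
the real definition is in include/linux/kiovec.h in the patchset):

	typedef struct kvec_cb {
		struct kvec	*vec;	/* the mapped pages */
		void		(*fn)(void *data, struct kvec *vec,
				      ssize_t res);
		void		*data;	/* caller's cookie for fn */
	} kvec_cb_t;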

> For the raw path, you are OK since you never have to copy data out of
> the kvecs after i/o completion, and unmap_kvec only looks at veclet pages.
> So the fact that the block layer can change the offset and len fields
> in the veclets doesn't affect you, but I thought I'd mention it as a
> point of caution anyhow ...

Ugh. That sounds like something bio should not be doing. If someone
wants to fix it, such a patch would be much appreciated, otherwise it'll
wait until I'm back in Canada.

-ben

2002-02-08 22:13:49

by Benjamin LaHaise

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08, 2002 at 01:07:58PM -0800, Badari Pulavarty wrote:
> I am looking at the 2.5 patch you sent out. I have a few questions/comments:
>
> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
> each bio could handle only BIO_MAX_SIZE (ll_rw_kio() creates one bio
> for each BIO_MAX_SIZE of IO).

Sounds like a needless restriction in bio, especially as one of the design
requirements for the 2.5 block work is that we're able to support large ios
(think 4MB page support).

> And also, currently BIO_MAX_SIZE is only 64K. In fact, if I try to issue a
> 64K IO using submit_bio(), I get the following BUG() on my QLOGIC controller.

Jens? Also, why does the bio callback return a value? Nothing could ever
possibly use it as far as I can see.

> 2) Could you please make map_user_kvec() generic enough to handle mapping
> of multiple iovecs to a single kvec (to handle readv/writev). I think
> this is a very easy change:
>
> * Add alloc_kvec() and take out the kmalloc() from map_user_kvec().
> * Change map_user_kvec() to start mapping from kvec->veclet[kvec->nr]
> instead of kvec->veclet[0]
>
> This way the allocation of a kvec big enough to hold all the iovecs can
> be done at a higher level, and map_user_kvec() can be called in a loop,
> once per iovec.

Sounds good.

-ben

2002-02-08 22:56:23

by Linus Torvalds

Subject: Re: patch: aio + bio for raw io

In article <[email protected]>,
Benjamin LaHaise <[email protected]> wrote:
>On Fri, Feb 08, 2002 at 01:07:58PM -0800, Badari Pulavarty wrote:
>> I am looking at the 2.5 patch you sent out. I have a few questions/comments:
>>
>> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
>> each bio could handle only BIO_MAX_SIZE (ll_rw_kio() creates one bio
>> for each BIO_MAX_SIZE of IO).
>
>Sounds like a needless restriction in bio, especially as one of the design
>requirements for the 2.5 block work is that we're able to support large ios
>(think 4MB page support).

bio can handle arbitrarily large IO's, BUT it can never split them.

Basically, IO splitting is NOT the job of the IO layer. So you can make
any size request you want, but you had better know that the hardware you
send it to can take it. The bio layer basically guarantees only that
you can send a single contiguous request of PAGE_SIZE, nothing more (in
fact, we might at some point get away from even that, and only guarantee
sectors - with things like loopback remapping etc you might have trouble
even for "contiguous" requests).

Now, before you say "that's stupid, I don't know what the driver limits
are", ask yourself:
- what is it that you want to go fast?
- what is it that you CAN make fast?

The answer to the "want" question is: the common case. And like it or
not, the common case is never going to be 4MB pages.

The answer to the "can" question is: merging can be fast, splitting
fundamentally cannot.

Splitting a request _fundamentally_ involves memory management (at the
very least you have to allocate a new request), while growing a request
can (and does) mean just adding an entry to the end of a list (until you
cannot grow it any more, of course, but that's the point where you have
to end anyway, so..)

Now, think about that for five minutes, and if you don't come back with
the right answer, you get an F.

In short:

- the right answer to handling 4MB pages is not to push complexity into
the low-level drivers and make them try to handle requests that are
bigger than the hardware can do.

In fact, we don't even want to handle it in the mid layers, because
(a) the mid layers have historically been even more flaky than some
device drivers and (b) it's a performance loss to even test for it in
the common case, where the splitting is neither needed nor wanted.

- the _right_ answer to handling big areas is to build up big bio's
from smaller ones. And no, you don't have to call the elevator in
between requests that you know are consecutive on the disk.

Another way of saying it: if you have 4MB worth of IO, it's YOUR
responsibility to do the work to make it fit the controller. It is
outside the default case, and _you_ do a bit of extra work instead of
asking everybody else to do your heavy lifting for you.

Does bio have the interfaces to do this yet? No. But if you think that
bio's should natively handle any kind of request at all, you're really
barking up the wrong tree.

If you are in the small small _small_ minority who care about 4MB
requests, you should build the infrastructure not to make drivers split
them, but to build up a list of bio's and then submit them all
consecutively in one go.
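
In pseudo-code - and this _is_ pseudo-code, none of these helpers exist
yet, only the shape of the loop matters:

	/* Build bio's up against the queue limits; never split. */
	while (pages_remain(&cursor)) {
		struct bio *bio = bio_alloc(GFP_KERNEL, nr_left(&cursor));

		/* grow while the queue says the next chunk still fits */
		while (queue_can_take(q, bio, cursor.page, cursor.len,
				      cursor.offset)) {
			add_to_bio(bio, cursor.page, cursor.len,
				   cursor.offset);
			advance(&cursor);
		}
		/* consecutive on disk: no elevator work needed between
		 * these submissions */
		submit_bio(rw, bio);
	}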

Remember: checking the limits as you build stuff up is easy, and fast.

So you should make sure that you never EVER cause anybody to want to
split a bio.

Linus

2002-02-09 00:01:38

by Benjamin LaHaise

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08, 2002 at 02:54:51PM -0800, Linus Torvalds wrote:
> bio can handle arbitrarily large IO's, BUT it can never split them.

I agree that it should not split ios, but it should not needlessly limit
the size of them either -- that choice should be left to individual
drivers.

...
> If you are in the small small _small_ minority care about 4MB requests,
> you should build the infrastructure not to make drivers split them, but
> to build up a list of bio's and then submit them all consecutively in
> one go.
>
> Remember: checking the limits as you build stuff up is easy, and fast.
>
> So you should make sure that you never EVER cause anybody to want to
> split a bio.

Yup. What we need is an interface for getting the max size of an io --
I can see this being needed for filesystems, char devices, network drivers,
basically anything that we can do aio/"direct" io on. Given that, I can
put the split / pipelining code into the generic layer.

-ben

2002-02-09 00:29:33

by Linus Torvalds

Subject: Re: patch: aio + bio for raw io


On Fri, 8 Feb 2002, Benjamin LaHaise wrote:
>
> Yup. What we need is an interface for getting the max size of an io --

No. There is no such thing.

There is no "max size". There are various different limits, and "size" is
usually the last one on the list. The limitations are things like "I can
have at most N consecutive segments" or "crossing a 64kB border is fine,
but implies a new segment" or "a single segment is limited to X bytes, and
the _sum_ of all segments are limited to Y bytes" or..

And it can depend on the _address_ of the thing you're writing. If the
address is above a bounce limit, the bouncing code ends up having to copy
it to accessible memory, so you can have a device that can do a 4MB
request in one go if it's directly accessible, but if it is not in the low
XXX bits, then it gets split into chunks and copied down, at which point
you may only be able to do N chunks at a time.

And no, I didn't make any of these examples up.

A "max size" does not work. It needs to be a lot more complex than that.
For block devices, you need the whole "struct request_queue" to describe
the default cases, and even then there are function pointers to let
individual drivers set limits of their own _outside_ those cases.

So it basically needs to be a "grow_bio()" function that does the choice,
not a size limitation.
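
Think of an interface roughly like this (hypothetical, obviously - the
actual choice lives behind the per-queue function pointers):

	/* Add a chunk to the bio if the queue's limits allow it.
	 * Returns the grown bio, or NULL to tell the caller to submit
	 * what it has and start a new one.  Hypothetical signature. */
	struct bio *grow_bio(request_queue_t *q, struct bio *bio,
			     struct page *page, unsigned int len,
			     unsigned int offset);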

(And then most devices will just use one of a few standard "grow()"
functions, of course - you need the flexibility, but at the same time
there are a lot of common cases).

Linus

2002-02-11 11:16:38

by Jens Axboe

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08 2002, Benjamin LaHaise wrote:
> > And also, currently BIO_MAX_SIZE is only 64K. In fact, if I try to issue a
> > 64K IO using submit_bio(), I get the following BUG() on my QLOGIC controller.
>
> Jens? Also, why does the bio callback return a value? Nothing could ever
> possibly use it as far as I can see.

WRT max size, Linus already replied why this is so. The other thing --
the callback return value was really to have it return 'done or not'
when doing partial completions. It was later decided that the end_io
callback should only be called for final completion, so really the
nr_sectors argument and the return value are not used now. I'll clean
that up next time around.

--
Jens Axboe

2002-02-11 12:41:52

by Suparna Bhattacharya

Subject: Re: patch: aio + bio for raw io

On Fri, Feb 08, 2002 at 05:02:57PM -0500, Benjamin LaHaise wrote:
> On Fri, Feb 08, 2002 at 03:10:09PM +0530, Suparna Bhattacharya wrote:
> > You chose to add a kvec_cb field to the bio structure rather than use
> > bi_private ?
>
> Yup. I'm lazy. Also, the cb struct actually needs 2 pointers, so just
> bi_private isn't enough.
>
> > For the raw path, you are OK since you never have to copy data out of
> > the kvecs after i/o completion, and unmap_kvec only looks at veclet pages.
> > So the fact that the block layer can change the offset and len fields
> > in the veclets doesn't affect you, but I thought I'd mention it as a
> > point of caution anyhow ...
>
> Ugh. That sounds like something bio should not be doing. If someone
> wants to fix it, such a patch would be much appreciated, otherwise it'll
> wait until I'm back in Canada.

I had posted a suggestion on lkml earlier about this problem and one way
to fix it - at that time I was thinking of a bio->bi_voffset field (the
thought was that it could even be useful for cloning/splitting), and
was waiting for a decision from Jens about it.
(Your kvec_dst is another way :))

However, I'm now wondering if rq->hard_cur_sectors can be used to
achieve a similar effect (it won't help with splitting, but it might
suffice for the request transfer, without requiring new bio fields).



------------------------------------------



>From [email protected] Fri Dec 28 15:50:06 2001
Date: Fri, 28 Dec 2001 15:50:06 -0500
From: Suparna Bhattacharya <[email protected]>
To: [email protected]
Cc: [email protected]
Subject: bio_io_vec and kvecs integration
Message-ID: <[email protected]>
Reply-To: [email protected]
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.2.5i
Status: RO
Content-Length: 1831
Lines: 36

Since bio, aio and zero-copy networking code all make use of a similar
representation of a memory vector or i/o currency, I was looking at
what it would take at the bio level to make sure that these descriptors
can be passed down as is to the block layer (without having to translate
or copy descriptors).

Ben has already posted some patches to change bio and skbuff fragments to
his kveclet representation. Once all these subsystems share the same
representation, the next step is to be able to pass the vectors around as
is.

Some of the changes towards this sort of a thing went in during one of
the early updates to bio. The bio structure was modified to maintain a
pointer to the vector list, so it can point to a vector list sent down
to it by the calling subsystem. Well, this change was also needed for
bio cloning (for lvm etc) and that's all it's been used for so far,
since we don't have kvecs in the kernel yet.

I think just a little more work is needed to get the bio layer to take
in a vector directly. This is because currently the block layer can
modify the bv_offset and bv_len fields in the vector directly (as part
of end_that_request_first processing) in case the whole
segment/veclet/bvec doesn't get transferred/completed in one shot. This
could result in unexpected effects for the higher layer if it tried to
interpret the vector or, say, copy data from it after the i/o completes.

So, should we add a bi_voffset field, which can be used in conjunction with
bi_vidx to locate exactly where to start the next i/o transfer from/to, and
try to leave bv_offset and bv_len fields untouched during request processing ?
Or is there some other way to do this ?
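
To illustrate the idea (field names tentative, not in the tree):

	struct bio {
		...
		unsigned short	bi_vidx;	/* current index into bi_io_vec */
		unsigned int	bi_voffset;	/* bytes already completed within
						 * bi_io_vec[bi_vidx] */
		...
	};

end_that_request_first() would then advance bi_vidx/bi_voffset on a
partial transfer, and the bv_offset/bv_len fields in the caller's
vector would stay untouched.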

One of these days we're going to have to sort out the kiobuf/kvec decision
for 2.5, but that's probably a discussion for a little later.

Regards
Suparna

2002-02-11 21:37:36

by Badari Pulavarty

Subject: Re: patch: aio + bio for raw io

>
> Quick message: this patch makes aio use bio directly for brw_kvec_async.
> This is against yesterday's patchset. Comments?
>
Hi Ben,

I have one more question on your latest aio patch.

raw_kvec_rw() seems to handle a maximum of 512K (max_sectors = 2500),
but I don't see anywhere that you loop through to satisfy the entire
IO request.

In fact, generic_aio_read() maps the user buffer for the iosize
and calls file->f_op->kvec_read(file, cb, iosize, pos).
How does IO > 512K get handled ?

NOTE: I have not looked at the user-level code. Is IO split at a max of
512K at user level ?

Thanks,
Badari

2002-02-14 04:45:45

by Suparna Bhattacharya

Subject: Re: patch: aio + bio for raw io

A variation in perspective on this.

One could look at a bio simply as a completion unit. A caller into
block would split up its i/os depending on the granularity (latency)
at which it would like to see partial completion information
propagated about the i/o as it progresses, when it is available.
Of course this would at best be a hint, as the true granularity
depends on how much the driver handles in one go, and in the case of
merges this would be coarser than what the bio split-up indicates.
(Arjan, this is also in the light of your concerns about latency
impacts of making readv/writev vectors mappable to the same kvec;
the same kvec can be mapped to multiple bios without any difficulty
and the advantage is that it can be split uniformly irrespective
of user space iovec element boundaries)

Alignment requirements might need to be honoured at this level
(this seems inevitable), but when it comes to the size restrictions,
here are a few aspects to keep in mind.


On Fri, Feb 08, 2002 at 02:54:51PM -0800, Linus Torvalds wrote:
> bio can handle arbitrarily large IO's, BUT it can never split them.
>
> The answer to the "can" question is: merging can be fast, splitting
> fundamentally cannot.
>
> Splitting a request _fundamentally_ involves memory management (at the
> very least you have to allocate a new request), while growing a request
> can (and does) mean just adding an entry to the end of a list (until you
> cannot grow it any more, of course, but that's the point where you have
> to end anyway, so..)

Splitting of bios by the block layer or a driver should only be needed
in case the i/o needs to be split or striped across multiple
devices/queues/offsets, as in the case of lvm or software raid
(remappings). If the caller sends down bios with the right limits
(with the help of grow_bio), then the need for allocating bios for
splits may be avoided in several such cases, but not always
(e.g. raid5, bad block remapping).

In situations where it is just a matter of the low level driver not
being able to handle a single request in one shot, it only needs
a bit of state information to be able to complete it in chunks.
There is _no_ need for memory allocation in this case. Most of the
infrastructure for this already exists in bio and is in use for
a few IDE cases; I have in mind a few enhancements and helpers to
make this more generic.
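
As a rough illustration (hand-waving - the helper names here are made
up, the point is that only per-request position state is involved):

	/* Driver-side sketch: complete one request in hardware-sized
	 * chunks, with no memory allocation on this path. */
	while (rq_sectors_left(rq)) {
		unsigned chunk = min(rq_sectors_left(rq),
				     hw_max_sectors(q));

		start_hw_transfer(rq, chunk);
		wait_for_transfer(rq);
		/* partial-completion bookkeeping: advance the request's
		 * notion of where it is, no new bios allocated */
		end_request_chunk(rq, 1 /* uptodate */, chunk);
	}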

The maximum i/o limits (size + segment limits etc) do have another
implication in terms of how much it makes sense to merge into a
single request, especially when it comes to more complicated
merge decisions accounting for latencies/deadlines/priorities.

So, having a grow_bio or some such function which _combines_ the
constraints of all the driver layers involved, as far as possible,
could help minimize bio splitting due to remapping and account for
balanced merges.

It is just that the degree of benefit seems less than it might
appear at first sight if we are willing to depend on the available
bio infrastructure to enable drivers to handle partial
requests (i.e. complete a single request in smaller chunks).

Also, notice that combining these layered constraints to the most
conservative sizes could result in more granular splits than might
be absolutely necessary, which incurs extra costs down the
completion path (the need to combine completion status from all
the pieces, even though they might have ultimately been transferred
to the device in one shot).

Regards
Suparna
