2002-06-22 15:04:33

by Adam J. Richter

Subject: bio_append patch


Here is a first draft of my bio_append patch. It includes
a rewrite of most of mpage.c.
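
To give a feel for the interface, here is a minimal usage sketch
(illustrative only, not code from the patch; the completion handler,
cookie, and page array below are made up).  A caller fills in one
bio_vec at a time and lets bio_append() submit the current bio and
start a fresh one whenever adding the vector would exceed the queue's
limits:

        struct bio *bio;
        struct bio_vec bv;
        int i;

        bio = bio_alloc(GFP_NOIO, nr_vecs);     /* nr_vecs: caller's estimate */
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_rw = WRITE;
        bio->bi_end_io = my_end_io;             /* hypothetical completion handler */
        bio->bi_private = my_cookie;            /* hypothetical cookie */

        for (i = 0; i < nr_pages; i++) {
                bv.bv_page = pages[i];
                bv.bv_offset = 0;
                bv.bv_len = PAGE_SIZE;
                /* submits and replaces *bio if this vector will not fit */
                bio_append(&bio, &bv, GFP_NOIO);
        }
        submit_bio(WRITE, bio);                 /* send off whatever remains */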

The new mpage routines never punt to the fs/buffer.c routines.
So, it may be possible to rename them to their fs/buffer.c counterparts
and delete the fs/buffer.c versions. Also, the new mpage routines
notice if you are trying to write PAGE_ZERO to a hole in a file and
will leave the hole in place in that case rather than allocate space
on disk unnecessarily.

I know the code needs some clarification in the form of
documentation and perhaps some symbol renaming.

Any comments, testing or benchmarking would be appreciated.
(I sure would like to know how much of a benefit, if any, we get from
building bigger IO requests.)

Note: I believe I have seen 2.5.24 without these changes lock up
under one process's constant IDE disk activity. So if you see a "soft
lockup", it would be helpful if you could check whether you can
reproduce it without this patch but not with it.

Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."

--- linux-2.5.24/include/linux/bio.h 2002-06-20 15:53:55.000000000 -0700
+++ linux/include/linux/bio.h 2002-06-20 06:36:16.000000000 -0700
@@ -37,8 +37,7 @@
#define BIO_BUG_ON
#endif

-#define BIO_MAX_SECTORS 128
-#define BIO_MAX_SIZE (BIO_MAX_SECTORS << 9)
+#define BIO_MAX_VECS 256

/*
* was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -78,6 +77,7 @@
* hardware coalescing is performed.
*/
unsigned short bi_hw_segments;
+ unsigned int bi_last_seg_size;

unsigned int bi_size; /* residual I/O count */
unsigned int bi_max; /* max bvl_vecs we can hold,
@@ -196,11 +196,14 @@
struct request_queue;
extern inline int bio_phys_segments(struct request_queue *, struct bio *);
extern inline int bio_hw_segments(struct request_queue *, struct bio *);
+extern int bio_max_iovecs(struct request_queue *q, int *iovec_size);

extern inline void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone(struct bio *, int);
extern struct bio *bio_copy(struct bio *, int, int);

+extern void bio_recycle (struct bio **bio, int gfp_mask);
+extern void bio_append(struct bio **bio_p, struct bio_vec *bv, int gfp_mask);
extern inline void bio_init(struct bio *);

extern int bio_ioctl(kdev_t, unsigned int, unsigned long);
--- linux-2.5.24/include/linux/blkdev.h 2002-06-20 15:53:47.000000000 -0700
+++ linux/include/linux/blkdev.h 2002-06-19 03:54:33.000000000 -0700
@@ -266,16 +266,26 @@
unsigned block_size_bits;
};

+struct partition_ops {
+ int (*check)(struct gendisk *hd,
+ struct block_device *bdev,
+ unsigned long first_sect,
+ int first_minor);
+ struct partition_ops *next;
+};
+
/*
* Used to indicate the default queue for drivers that don't bother
* to implement multiple queues. We have this access macro here
* so as to eliminate the need for each and every block device
* driver to know about the internal structure of blk_dev[].
*/
-#define BLK_DEFAULT_QUEUE(_MAJOR) &blk_dev[_MAJOR].request_queue
+#define BLK_DEFAULT_QUEUE(_MAJOR) (&blk_dev[_MAJOR].request_queue)

extern struct sec_size * blk_sec[MAX_BLKDEV];
extern struct blk_dev_struct blk_dev[MAX_BLKDEV];
+extern void register_partition_ops(struct partition_ops *ops);
+extern void unregister_partition_ops(struct partition_ops *ops);
extern void grok_partitions(kdev_t dev, long size);
extern int wipe_partitions(kdev_t dev);
extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size);
@@ -287,6 +297,10 @@
extern void blk_put_request(struct request *);
extern void blk_plug_device(request_queue_t *);
extern int blk_remove_plug(request_queue_t *);
+struct bio_vec;
+extern void blk_tally_bvecs(request_queue_t *q, struct bio *bio,
+ struct bio_vec *bvprv,
+ struct bio_vec *bv, int bv_count);
extern void blk_recount_segments(request_queue_t *, struct bio *);
extern inline int blk_phys_contig_segment(request_queue_t *q, struct bio *, struct bio *);
extern inline int blk_hw_contig_segment(request_queue_t *q, struct bio *, struct bio *);
@@ -410,4 +424,6 @@
page_cache_release(p.v);
}

+extern int bio_max_iovecs(request_queue_t *q, int *iovec_size);
+
#endif
--- linux-2.5.24/fs/bio.c 2002-06-20 15:53:43.000000000 -0700
+++ linux/fs/bio.c 2002-06-22 07:30:21.000000000 -0700
@@ -92,6 +92,13 @@
return bvl;
}

+/* Must correspond to the switch statement in bvec_alloc. */
+static inline int bi_max_to_size(int bi_max)
+{
+ static unsigned short bi_to_size[] = { 1, 4, 16, 64, 128, 256 };
+ return bi_to_size[bi_max];
+}
+
/*
* default destructor for a bio allocated with bio_alloc()
*/
@@ -118,6 +125,7 @@
bio->bi_idx = 0;
bio->bi_phys_segments = 0;
bio->bi_hw_segments = 0;
+ bio->bi_last_seg_size = 0;
bio->bi_size = 0;
bio->bi_end_io = NULL;
atomic_set(&bio->bi_cnt, 1);
@@ -317,6 +325,94 @@
return NULL;
}

+void bio_recycle (struct bio **bio, int gfp_mask)
+{
+ struct bio *old = *bio;
+ struct bio *newbio = bio_alloc(gfp_mask, bi_max_to_size(old->bi_max));
+
+ newbio->bi_sector = old->bi_sector + (old->bi_size >> 9);
+
+#define COPY(field) newbio->bi_ ## field = old->bi_ ## field
+ COPY(bdev);
+ COPY(rw);
+ COPY(end_io);
+ COPY(private);
+#undef COPY
+
+ submit_bio(old->bi_rw, old);
+ *bio = newbio;
+}
+
+
+/* bio_append appends an IO vector to a bio. If there is no room
+ in the bio, the bio will first be submitted and replaced with an
+ empty one.
+
+ bio_append expects to be called with bio->bi_vcnt indicating the
+ number of IO vectors already loaded. If the provided bio is
+ already full, bio_append will submit the current bio and allocate
+ a new one. */
+
+void bio_append(struct bio **bio_p, struct bio_vec *bv, int gfp_mask)
+{
+ struct bio *bio = *bio_p;
+ request_queue_t *q = bio->bi_bdev->bd_queue;
+
+ BUG_ON(q == NULL);
+ BUG_ON(bio->bi_io_vec == NULL);
+
+ if (bio->bi_vcnt != 0) {
+
+ if (bio->bi_vcnt < bi_max_to_size(bio->bi_max) &&
+ ((bio->bi_size + bv->bv_len) >> 9) <= q->max_sectors) {
+
+ unsigned int phys_segs = bio->bi_phys_segments;
+ unsigned int hw_segs = bio->bi_hw_segments;
+ unsigned int seg_size = bio->bi_last_seg_size;
+
+ blk_tally_bvecs(q, bio,
+ &bio->bi_io_vec[bio->bi_vcnt-1], bv, 1);
+
+ if (bio->bi_phys_segments <= q->max_phys_segments &&
+ bio->bi_hw_segments <= q->max_hw_segments)
+ goto goodbio;
+
+ bio->bi_phys_segments = phys_segs;
+ bio->bi_hw_segments = hw_segs;
+ bio->bi_last_seg_size = seg_size;
+
+ }
+ bio_recycle(&bio, gfp_mask);
+ *bio_p = bio;
+ }
+
+ blk_tally_bvecs(q, bio, NULL, bv, 1);
+ bio->bi_flags |= (1 << BIO_SEG_VALID);
+
+ goodbio:
+ bio->bi_io_vec[bio->bi_vcnt++] = *bv;
+ bio->bi_size += bv->bv_len;
+}
+
+int bio_max_iovecs(request_queue_t *q, int *iovec_size)
+{
+ unsigned max_iovecs = min(q->max_phys_segments, q->max_hw_segments);
+
+ if (q->max_segment_size != 0 && *iovec_size > q->max_segment_size)
+ *iovec_size = q->max_segment_size;
+
+ if (q->max_sectors != 0) {
+ unsigned int max_bytes = q->max_sectors << 9;
+ if (*iovec_size > max_bytes) {
+ *iovec_size = max_bytes;
+ return 1;
+ }
+ max_iovecs = min(max_iovecs, max_bytes / *iovec_size);
+ }
+
+ return max_iovecs;
+}
+
static void bio_end_io_kio(struct bio *bio)
{
struct kiobuf *kio = (struct kiobuf *) bio->bi_private;
@@ -339,9 +435,10 @@
**/
void ll_rw_kio(int rw, struct kiobuf *kio, struct block_device *bdev, sector_t sector)
{
- int i, offset, size, err, map_i, total_nr_pages, nr_pages;
- struct bio_vec *bvec;
+ int offset, size, err, map_i, total_nr_pages, nr_bvecs;
+ struct bio_vec bvec;
struct bio *bio;
+ int bytes_per_bvec, max_bvecs;

err = 0;
if ((rw & WRITE) && bdev_read_only(bdev)) {
@@ -367,17 +464,18 @@

map_i = 0;

-next_chunk:
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - 9);
- if (nr_pages > total_nr_pages)
- nr_pages = total_nr_pages;
+ max_bvecs = bio_max_iovecs(bdev->bd_queue, &bytes_per_bvec);
+
+ nr_bvecs = max_bvecs;
+ if (nr_bvecs > total_nr_pages)
+ nr_bvecs = total_nr_pages;

atomic_inc(&kio->io_count);

/*
* allocate bio and do initial setup
*/
- if ((bio = bio_alloc(GFP_NOIO, nr_pages)) == NULL) {
+ if ((bio = bio_alloc(GFP_NOIO, nr_bvecs)) == NULL) {
err = -ENOMEM;
goto out;
}
@@ -387,43 +485,37 @@
bio->bi_idx = 0;
bio->bi_end_io = bio_end_io_kio;
bio->bi_private = kio;
+ bio->bi_rw = rw;

- bvec = bio->bi_io_vec;
- for (i = 0; i < nr_pages; i++, bvec++, map_i++) {
+ while (total_nr_pages > 0) {
int nbytes = PAGE_SIZE - offset;
-
+
if (nbytes > size)
nbytes = size;
+ if (nbytes > bytes_per_bvec)
+ nbytes = bytes_per_bvec;

BUG_ON(kio->maplist[map_i] == NULL);

- if (bio->bi_size + nbytes > (BIO_MAX_SECTORS << 9))
- goto queue_io;
-
- bio->bi_vcnt++;
- bio->bi_size += nbytes;
-
- bvec->bv_page = kio->maplist[map_i];
- bvec->bv_len = nbytes;
- bvec->bv_offset = offset;
-
- /*
- * kiobuf only has an offset into the first page
- */
- offset = 0;
+ bvec.bv_page = kio->maplist[map_i];
+ bvec.bv_offset = offset;
+ bvec.bv_len = nbytes;
+
+ bio_append(&bio, &bvec, GFP_NOIO);
+
+ offset = (offset + nbytes) & PAGE_MASK;
+ if (offset == 0) {
+ total_nr_pages--;
+ map_i++;
+ }

sector += nbytes >> 9;
size -= nbytes;
- total_nr_pages--;
kio->offset += nbytes;
}

-queue_io:
submit_bio(rw, bio);

- if (total_nr_pages)
- goto next_chunk;
-
if (size) {
printk("ll_rw_kio: size %d left (kio %d)\n", size, kio->length);
BUG();
--- linux-2.5.24/fs/mpage.c 2002-06-20 15:53:47.000000000 -0700
+++ linux/fs/mpage.c 2002-06-22 07:48:57.000000000 -0700
@@ -2,12 +2,19 @@
* fs/mpage.c
*
* Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2001, Jens Axboe
+ * Copyright (C) 2002, Yggdrasil Computing, Inc.
*
* Contains functions related to preparing and submitting BIOs which contain
* multiple pagecache pages.
*
* 15May2002 [email protected]
* Initial version
+ *
+ * 21June2002 [email protected]
+ * Rewrote at least half of it, in order to safely build
+ * large IO transfers and to support the cases that
+ * fs/buffer.c handled.
*/

#include <linux/kernel.h>
@@ -19,12 +26,40 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+struct mpage_private {
+ atomic_t bvec_count;
+ atomic_t uptodate;
+};

-/*
- * The largest-sized BIO which this code will assemble, in bytes. Set this
- * to PAGE_CACHE_SIZE if your drivers are broken.
- */
-#define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
+static mempool_t *mpage_priv_pool;
+static kmem_cache_t *mpage_priv_slab;
+
+#define MPAGE_PRIV_POOL_SIZE 100
+
+static void
+bvec_io_error(struct bio_vec *bvec)
+{
+ if (page_has_buffers(bvec->bv_page)) {
+ const int bv_end = bvec->bv_offset + bvec->bv_len;
+ int buf_offset = 0;
+ struct buffer_head *bh = page_buffers(bvec->bv_page);
+
+ while (buf_offset + bh->b_size <= bvec->bv_offset) {
+ buf_offset += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ while (buf_offset < bv_end) {
+ clear_buffer_uptodate(bh);
+ buf_offset += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ }
+}

/*
* I/O completion handler for multipage BIOs.
@@ -38,81 +73,298 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io_read(struct bio *bio)
+static void mpage_end_io(struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
+ struct mpage_private *priv = bio->bi_private;
+ struct page *prevpage = NULL;
+ int rw;
+
+ if (!uptodate) {
+ int i;
+ bio_for_each_segment(bvec, bio, i) {
+ bvec_io_error(bvec);
+ }
+
+ if (priv != NULL)
+ atomic_set(&priv->uptodate, 0);
+ }

+ if (priv != NULL) {
+ if (!atomic_sub_and_test(bio->bi_vcnt, &priv->bvec_count))
+ return;
+ uptodate = atomic_read(&priv->uptodate);
+ mempool_free(priv, mpage_priv_pool);
+ }
+
+ rw = bio->bi_rw;
+ bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
do {
struct page *page = bvec->bv_page;

if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);

- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
+ if (page != prevpage) {
+ if (rw == READ) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+ else {
+ if (!uptodate)
+ SetPageError(page);
+ end_page_writeback(page);
+ }
}
- unlock_page(page);
} while (bvec >= bio->bi_io_vec);
+
bio_put(bio);
}

-static void mpage_end_io_write(struct bio *bio)
+static struct bio *
+mpage_alloc(struct address_space *mapping, int nr_pages, int rw)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct inode *inode = mapping->host;
+ int blocks_per_page;
+ struct bio *bio;

- do {
- struct page *page = bvec->bv_page;
+ if (inode->i_blkbits >= PAGE_CACHE_SHIFT)
+ blocks_per_page = 1;
+ else
+ blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;

- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
+ bio = bio_alloc(GFP_KERNEL,
+ min(nr_pages * blocks_per_page, BIO_MAX_VECS));

- if (!uptodate)
- SetPageError(page);
- end_page_writeback(page);
- } while (bvec >= bio->bi_io_vec);
- bio_put(bio);
+ /* We rely on bio_alloc clearing bi_idx, bi_vcnt and bi_size. */
+ bio->bi_bdev =
+ S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
+
+ bio->bi_end_io = mpage_end_io;
+ bio->bi_private = NULL;
+ bio->bi_rw = rw;
+
+ return bio;
}

-struct bio *mpage_bio_submit(int rw, struct bio *bio)
+static inline void
+mpage_add_block(struct bio **bio_p,
+ struct bio_vec *bvec,
+ int offset,
+ int length,
+ sector_t sector)
{
- bio->bi_vcnt = bio->bi_idx;
- bio->bi_idx = 0;
- bio->bi_end_io = mpage_end_io_read;
- if (rw == WRITE)
- bio->bi_end_io = mpage_end_io_write;
- submit_bio(rw, bio);
- return NULL;
+ struct mpage_private *priv;
+ sector_t contig;
+ struct bio *old = *bio_p;
+
+ /* Deal with empty bvec and empty bio. */
+ if (bvec->bv_len == 0) {
+ bvec->bv_offset = offset;
+ if (old->bi_vcnt == 0)
+ old->bi_sector = sector;
+ }
+
+ contig = old->bi_sector + ((old->bi_size + bvec->bv_len)>>9);
+
+ if (sector == contig &&
+ bvec->bv_offset + bvec->bv_len == offset &&
+ (bvec->bv_len + length <=
+ old->bi_bdev->bd_queue->max_segment_size)) {
+
+ bvec->bv_len += length;
+ return;
+ }
+
+ /* We are definitely going to use multiple bvec's for this
+ page, which can possibly mean multiple bio's, either due
+ to bio_append possibly splitting the bio or us explicitly
+ doing so due to a transfer that is not contiguous on the
+ block device. */
+
+ priv = old->bi_private;
+
+ if (priv == NULL) {
+
+ /* Send off the transfers for other pages, which
+ do not need to use old->bi_private (they must not
+ share ours anyhow). */
+
+ if (old->bi_vcnt != 0)
+ bio_recycle(bio_p, GFP_KERNEL);
+
+ /* Variable "old" is not valid after this point. */
+
+ priv = mempool_alloc(mpage_priv_pool, GFP_KERNEL);
+ (*bio_p)->bi_private = priv;
+ atomic_set(&priv->uptodate, 1);
+ atomic_set(&priv->bvec_count, 1);
+ }
+
+ if (bvec->bv_len != 0) {
+ atomic_inc(&priv->bvec_count);
+ bio_append(bio_p, bvec, GFP_KERNEL);
+ }
+
+ if (sector != contig) {
+ if ((*bio_p)->bi_vcnt != 0)
+ bio_recycle(bio_p, GFP_KERNEL);
+ (*bio_p)->bi_sector = sector;
+ }
+
+ bvec->bv_offset = offset;
+ bvec->bv_len = length;
}

static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs, int gfp_flags)
+mpage_xferpage(struct bio *bio, struct page *page, get_block_t *get_block)
{
- struct bio *bio;
+ struct inode *inode = page->mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ unsigned num_blocks;
+ struct bio_vec bvec;
+ sector_t block_in_file;
+ unsigned offset;
+ struct buffer_head bh_local, *bh;
+ void *page_addr = NULL;
+ int sector_offset, page_within_block, xfer_size, xfer_total;
+ unsigned int stride;
+ int rw = bio->bi_rw;
+ const int create = (rw == WRITE);
+ request_queue_t *q = bio->bi_bdev->bd_queue;
+ sector_t sector;
+
+ if (page_has_buffers(page))
+ bh = page_buffers(page);
+ else {
+ /* Make bh keep pointing to bh_local throughout the loop. */
+ bh = &bh_local;
+ bh_local.b_this_page = &bh_local;
+ }
+
+ page_within_block = page->index & ((blocksize-1) >> PAGE_CACHE_SHIFT);
+ sector_offset = page_within_block << (PAGE_CACHE_SHIFT - 9);
+
+ if (blkbits <= PAGE_CACHE_SHIFT) {
+ stride = blocksize;
+ num_blocks = PAGE_CACHE_SIZE >> blkbits;
+ block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ } else {
+ stride = PAGE_CACHE_SIZE;
+ num_blocks = 1;
+ block_in_file = page->index >> (blkbits - PAGE_CACHE_SHIFT);
+ }
+
+ /* FIXME. Right now you cannot have a file system on a block
+ device where the minimum tranfer size (i.e., hardsect_size)
+ is larger than either the file system block size, or
+ PAGE_CACHE_SIZE. However, I think adding such support would
+ require changing most of the Linux filesystem drivers.
+ -Adam J. Richter, 2002 June 22, */
+
+ BUG_ON(q->hardsect_size > stride);
+
+ xfer_size = min(q->max_segment_size, stride);
+
+ /* Avoid accidentally extending files and directories. */
+ if (rw == WRITE && !S_ISBLK(inode->i_mode)) {
+ sector_t end_block = (inode->i_size + blocksize - 1)>>blkbits;
+ if (block_in_file + num_blocks > end_block)
+ num_blocks = end_block - block_in_file;
+ }
+
+ bvec.bv_len = 0;
+ bvec.bv_page = page;
+ for (offset = 0; num_blocks--;
+ offset += stride, block_in_file++, bh = bh->b_this_page) {
+
+ bh_local.b_state = 0;
+
+ if (!buffer_mapped(bh)) {
+
+ /*
+ * Do not write all zeroes into file holes.
+ */
+ if (rw == WRITE && page == ZERO_PAGE(NULL) &&
+ !get_block(inode, block_in_file, bh, 0) &&
+ !buffer_mapped(bh)) {
+ clear_buffer_dirty(bh);
+ continue;
+ }
+

- bio = bio_alloc(gfp_flags, nr_vecs);
+ if (get_block(inode, block_in_file, bh, create)) {
+ SetPageError(page);
+ continue;
+ }
+
+ if (!buffer_mapped(bh)) {
+
+ BUG_ON(rw != READ);
+
+ if (page_addr == NULL)
+ page_addr = kmap(page);
+
+ memset(page_addr + offset, 0, stride);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ }
+
+ sector = (bh->b_blocknr << (blkbits - 9)) + sector_offset;
+
+ for(xfer_total = 0; xfer_total < stride;
+ xfer_total += xfer_size)
+ mpage_add_block(&bio, &bvec, offset + xfer_total,
+ xfer_size,
+ sector + (xfer_total >> 9));
+
+ if (rw == WRITE)
+ clear_buffer_dirty(bh);

- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
}

- if (bio) {
- bio->bi_bdev = bdev;
- bio->bi_vcnt = nr_vecs;
- bio->bi_idx = 0;
- bio->bi_size = 0;
- bio->bi_sector = first_sector;
- bio->bi_io_vec[0].bv_page = NULL;
+ if (bvec.bv_len == 0) {
+ /* A hole in the file covered the entire page. */
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ } else {
+ if (rw == WRITE) {
+ BUG_ON(PageWriteback(page));
+ SetPageWriteback(page);
+ unlock_page(page);
+ }
+ bio_append(&bio, &bvec, GFP_KERNEL);
+ if (bio->bi_private) {
+ /* Multiple transfers on the same page must not be
+ merged with transfers for another page, so
+ submit them now. Also, this balances out the
+ initial priv->bvec_count == 1 that was set
+ by mpage_add_block. */
+
+ bio_recycle(&bio, GFP_KERNEL);
+ bio->bi_private = NULL;
+ }
}
+
+ if (page_addr != NULL)
+ kunmap(page);
+
+ if (buffer_boundary(bh) && bio->bi_vcnt != 0)
+ bio_recycle(&bio, GFP_KERNEL);
+
return bio;
}

+
/**
* mpage_readpages - populate an address space with some pages, and
* start reads against them.
@@ -159,336 +411,97 @@
*
* This all causes the disk requests to be issued in the correct order.
*/
-static struct bio *
-do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
- sector_t *last_block_in_bio, get_block_t get_block)
-{
- struct inode *inode = page->mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
- const unsigned blocksize = 1 << blkbits;
- struct bio_vec *bvec;
- sector_t block_in_file;
- sector_t last_block;
- sector_t blocks[MAX_BUF_PER_PAGE];
- unsigned page_block;
- unsigned first_hole = blocks_per_page;
- struct block_device *bdev = NULL;
- struct buffer_head bh;
-
- if (page_has_buffers(page))
- goto confused;
-
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
- last_block = (inode->i_size + blocksize - 1) >> blkbits;
-
- for (page_block = 0; page_block < blocks_per_page;
- page_block++, block_in_file++) {
- bh.b_state = 0;
- if (block_in_file < last_block) {
- if (get_block(inode, block_in_file, &bh, 0))
- goto confused;
- }
-
- if (!buffer_mapped(&bh)) {
- if (first_hole == blocks_per_page)
- first_hole = page_block;
- continue;
- }
-
- if (first_hole != blocks_per_page)
- goto confused; /* hole -> non-hole */
-
- /* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
- goto confused;
- blocks[page_block] = bh.b_blocknr;
- bdev = bh.b_bdev;
- }
-
- if (first_hole != blocks_per_page) {
- memset(kmap(page) + (first_hole << blkbits), 0,
- PAGE_CACHE_SIZE - (first_hole << blkbits));
- flush_dcache_page(page);
- kunmap(page);
- if (first_hole == 0) {
- SetPageUptodate(page);
- unlock_page(page);
- goto out;
- }
- }
-
- /*
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && (bio->bi_idx == bio->bi_vcnt ||
- *last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(READ, bio);
-
- if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
-
- if (nr_bvecs > nr_pages)
- nr_bvecs = nr_pages;
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- nr_bvecs, GFP_KERNEL);
- if (bio == NULL)
- goto confused;
- }
-
- bvec = &bio->bi_io_vec[bio->bi_idx++];
- bvec->bv_page = page;
- bvec->bv_len = (first_hole << blkbits);
- bvec->bv_offset = 0;
- bio->bi_size += bvec->bv_len;
- if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
- bio = mpage_bio_submit(READ, bio);
- else
- *last_block_in_bio = blocks[blocks_per_page - 1];
-out:
- return bio;
-
-confused:
- if (bio)
- bio = mpage_bio_submit(READ, bio);
- block_read_full_page(page, get_block);
- goto out;
-}
-
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
- unsigned nr_pages, get_block_t get_block)
+ unsigned nr_pages, get_block_t *get_block)
{
- struct bio *bio = NULL;
- unsigned page_idx;
- sector_t last_block_in_bio = 0;
+ struct bio *bio;
+
+ if (nr_pages == 0)
+ return 0;

- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ bio = mpage_alloc(mapping, nr_pages, READ);
+ while (nr_pages--) {
struct page *page = list_entry(pages->prev, struct page, list);

prefetchw(&page->flags);
list_del(&page->list);
if (!add_to_page_cache_unique(page, mapping, page->index))
- bio = do_mpage_readpage(bio, page,
- nr_pages - page_idx,
- &last_block_in_bio, get_block);
+ bio = mpage_xferpage(bio, page, get_block);
page_cache_release(page);
}
BUG_ON(!list_empty(pages));
- if (bio)
- mpage_bio_submit(READ, bio);
+
+ if (bio->bi_vcnt)
+ submit_bio(READ, bio);
+ else
+ bio_put(bio);
+
return 0;
}
+
EXPORT_SYMBOL(mpage_readpages);

/*
* This isn't called much at all
*/
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_readpage(struct page *page, get_block_t *get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
+ struct bio *bio = mpage_alloc(page->mapping, 1, READ);
+
+ bio = mpage_xferpage(bio, page, get_block);
+
+ if (bio->bi_vcnt)
+ submit_bio(READ, bio);
+ else
+ bio_put(bio);

- bio = do_mpage_readpage(bio, page, 1,
- &last_block_in_bio, get_block);
- if (bio)
- mpage_bio_submit(READ, bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);

-/*
- * Writing is not so simple.
- *
- * If the page has buffers then they will be used for obtaining the disk
- * mapping. We only support pages which are fully mapped-and-dirty, with a
- * special case for pages which are unmapped at the end: end-of-file.
- *
- * If the page has no buffers (preferred) then the page is mapped here.
- *
- * If all blocks are found to be contiguous then the page can go into the
- * BIO. Otherwise fall back to the mapping's writepage().
- *
- * FIXME: This code wants an estimate of how many pages are still to be
- * written, so it can intelligently allocate a suitably-sized BIO. For now,
- * just allocate full-size (16-page) BIOs.
- */
-static inline struct bio *
-mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
- sector_t *last_block_in_bio, int *ret)
-{
- struct inode *inode = page->mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- unsigned long end_index;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
- struct bio_vec *bvec;
- sector_t last_block;
- sector_t block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
- unsigned page_block;
- unsigned first_unmapped = blocks_per_page;
- struct block_device *bdev = NULL;
- int boundary = 0;
-
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
-
- /* If they're all mapped and dirty, do it */
- page_block = 0;
- do {
- BUG_ON(buffer_locked(bh));
- if (!buffer_mapped(bh)) {
- /*
- * unmapped dirty buffers are created by
- * __set_page_dirty_buffers -> mmapped data
- */
- if (buffer_dirty(bh))
- goto confused;
- if (first_unmapped == blocks_per_page)
- first_unmapped = page_block;
- continue;
- }
-
- if (first_unmapped != blocks_per_page)
- goto confused; /* hole -> non-hole */
-
- if (!buffer_dirty(bh) || !buffer_uptodate(bh))
- goto confused;
- if (page_block) {
- if (bh->b_blocknr != blocks[page_block-1] + 1)
- goto confused;
- }
- blocks[page_block++] = bh->b_blocknr;
- boundary = buffer_boundary(bh);
- bdev = bh->b_bdev;
- } while ((bh = bh->b_this_page) != head);
-
- if (first_unmapped)
- goto page_is_mapped;
-
- /*
- * Page has buffers, but they are all unmapped. The page was
- * created by pagein or read over a hole which was handled by
- * block_read_full_page(). If this address_space is also
- * using mpage_readpages then this can rarely happen.
- */
- goto confused;
- }

- /*
- * The page has no buffers: map it to disk
- */
- BUG_ON(!PageUptodate(page));
- block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
- last_block = (inode->i_size - 1) >> blkbits;
- for (page_block = 0; page_block < blocks_per_page; ) {
- struct buffer_head map_bh;
-
- map_bh.b_state = 0;
- if (get_block(inode, block_in_file, &map_bh, 1))
- goto confused;
- if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
- if (page_block) {
- if (map_bh.b_blocknr != blocks[page_block-1] + 1)
- goto confused;
- }
- blocks[page_block++] = map_bh.b_blocknr;
- boundary = buffer_boundary(&map_bh);
- bdev = map_bh.b_bdev;
- if (block_in_file == last_block)
- break;
- block_in_file++;
- }
- if (page_block == 0)
- buffer_error();
-
- first_unmapped = page_block;
-
- end_index = inode->i_size >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
- unsigned offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
-
- if (page->index > end_index || !offset)
- goto confused;
- memset(kmap(page) + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap(page);
- }
-
-page_is_mapped:
-
- /*
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && (bio->bi_idx == bio->bi_vcnt ||
- *last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(WRITE, bio);
-
- if (bio == NULL) {
- unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
-
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- nr_bvecs, GFP_NOFS);
- if (bio == NULL)
- goto confused;
- }
-
- /*
- * OK, we have our BIO, so we can now mark the buffers clean. Make
- * sure to only clean buffers which we know we'll be writing.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
- unsigned buffer_counter = 0;
-
- do {
- if (buffer_counter++ == first_unmapped)
- break;
- clear_buffer_dirty(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
-
- bvec = &bio->bi_io_vec[bio->bi_idx++];
- bvec->bv_page = page;
- bvec->bv_len = (first_unmapped << blkbits);
- bvec->bv_offset = 0;
- bio->bi_size += bvec->bv_len;
- BUG_ON(PageWriteback(page));
- SetPageWriteback(page);
- unlock_page(page);
- if (boundary || (first_unmapped != blocks_per_page))
- bio = mpage_bio_submit(WRITE, bio);
- else
- *last_block_in_bio = blocks[blocks_per_page - 1];
- goto out;
-
-confused:
- if (bio)
- bio = mpage_bio_submit(WRITE, bio);
- *ret = page->mapping->a_ops->writepage(page);
-out:
- return bio;
+static inline int list_len(struct list_head *list)
+{
+ list_t *tmp;
+ int count = 0;
+ list_for_each(tmp, list)
+ count++;
+ return count;
}

/*
* This is a cut-n-paste of generic_writepages(). We _could_
* generalise that function. It'd get a bit messy. We'll see.
*/
+
int
mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
- int ret = 0;
+ struct bio *bio;
int done = 0;
+ int pages_to_write;

write_lock(&mapping->page_lock);
+ pages_to_write =
+ nr_to_write ? *nr_to_write : list_len(&mapping->io_pages);
+
+ /*
+ * FIXME. The old code would actually do one iteration if
+ * pages_to_write was 0. Was that necessary?
+ */
+ if (pages_to_write <= 0)
+ pages_to_write = 1;
+
+#if 0
+ if (pages_to_write == 0) {
+ write_unlock(&mapping->page_lock);
+ return 0;
+ }
+#endif
+
+ bio = mpage_alloc(mapping, pages_to_write, WRITE);

list_splice(&mapping->dirty_pages, &mapping->io_pages);
INIT_LIST_HEAD(&mapping->dirty_pages);
@@ -498,11 +511,9 @@
struct page, list);
list_del(&page->list);
if (PageWriteback(page)) {
- if (PageDirty(page)) {
- list_add(&page->list, &mapping->dirty_pages);
- continue;
- }
- list_add(&page->list, &mapping->locked_pages);
+ list_add(&page->list, PageDirty(page) ?
+ &mapping->dirty_pages :
+ &mapping->locked_pages);
continue;
}
if (!PageDirty(page)) {
@@ -527,9 +538,8 @@
}
spin_unlock(&pagemap_lru_lock);
}
- bio = mpage_writepage(bio, page, get_block,
- &last_block_in_bio, &ret);
- if (ret || (nr_to_write && --(*nr_to_write) <= 0))
+ bio = mpage_xferpage(bio, page, get_block);
+ if (nr_to_write && --(*nr_to_write) <= 0)
done = 1;
} else {
unlock_page(page);
@@ -546,8 +556,50 @@
INIT_LIST_HEAD(&mapping->io_pages);
}
write_unlock(&mapping->page_lock);
- if (bio)
- mpage_bio_submit(WRITE, bio);
- return ret;
+ if (bio->bi_vcnt)
+ submit_bio(WRITE, bio);
+ else
+ bio_put(bio);
+
+ return 0;
}
+
EXPORT_SYMBOL(mpage_writepages);
+
+
+/* Below is copied from bio.c, so I added Jens to the copyright notice.
+ -Adam */
+
+static void *slab_pool_alloc(int gfp_mask, void *data)
+{
+ return kmem_cache_alloc(data, gfp_mask);
+}
+
+static void slab_pool_free(void *ptr, void *data)
+{
+ kmem_cache_free(data, ptr);
+}
+
+static int __init init_mpage(void)
+{
+ mpage_priv_slab = kmem_cache_create("mpage_priv",
+ sizeof(struct mpage_private), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!mpage_priv_slab)
+ panic("mpage: can't create slab cache\n");
+
+ mpage_priv_pool = mempool_create(MPAGE_PRIV_POOL_SIZE,
+ slab_pool_alloc, slab_pool_free,
+ mpage_priv_slab);
+ if (!mpage_priv_pool)
+ panic("mpage: can't create mempool\n");
+
+ printk("MPAGE_PRIV: pool of %d setup, %ZuKb (%Zd bytes/mpage_priv)\n",
+ MPAGE_PRIV_POOL_SIZE,
+ MPAGE_PRIV_POOL_SIZE * sizeof(struct mpage_private) >> 10,
+ sizeof(struct mpage_private));
+
+ return 0;
+}
+
+module_init(init_mpage);


2002-07-01 22:43:15

by Andrew Morton

Subject: Re: bio_append patch

"Adam J. Richter" wrote:
>
> Here is a first draft of my bio_append patch. It includes
> a rewrite of most of mpage.c.
>

I basically gave up on trying to get the mpage code to
go 100% BIO. Impressed.

Some random thoughtlets:

Removing the buffer-layer fallback is a cleanliness/architectural
thing rather than a performance thing. The number of pages which
fall back to the buffer functions is utterly tiny.

Really, I don't think it's a desirable thing to do - those
buffers were added by the buffer layer, they're owned by the
buffer layer and it should be up to the buffer layer to handle
the I/O, without making mpage peer into buffer state internals
any more than necessary.

I don't think your implementation correctly handles the case
where a page has some dirty buffers and some non-uptodate
buffers. Looks like the readpage function will read old data
on top of the dirty buffers?


wrt integrating the readpages and writepages code: now is not the
time - mpage_writepages() and mpage_readpages() are very simple
functions which really only support the simplest of filesystems: ext2.
Changes have been identified on both the read and write side for
reiserfs, and on the write side for XFS. Could be that integrating
the read and write paths will make those changes more painful. It
would be best to wait until the mpage code has been fleshed out to
support other filesystems and then take a look. read and write are
rather different things...


Pointing all the BIOs at a common completion structure is cunning.
But we really don't want to have to perform another memory allocation
on that path. Frankly, given that this is the most important and
the most frequent bio-assembly code in the kernel, I'd be inclined
to just stick an extra field in struct bio for it.
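
For concreteness, a rough sketch of that direction (the field names
are invented, not from any posted patch): the first bio of a group
would carry the completion counters itself, e.g.

        /* hypothetical fields added to struct bio, standing in for the
         * separately allocated struct mpage_private: */
        atomic_t        bi_group_count;         /* bvecs still outstanding  */
        atomic_t        bi_group_uptodate;      /* cleared if any bio fails */

mpage_end_io() would then just decrement bi_group_count in the lead
bio of the group instead of doing mempool_alloc()/mempool_free() on
the submit and completion paths for every multi-bvec page.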

-