Message-ID: <3CFFDCD1.8000305@us.ibm.com>
Date: Thu, 06 Jun 2002 15:06:09 -0700
From: Matthew Dobson <colpatch@us.ibm.com>
Reply-To: colpatch@us.ibm.com
Organization: IBM LTC
User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko/20020513
MIME-Version: 1.0
To: "Adam J. Richter" <adam@yggdrasil.com>
CC: akpm@zip.com.au, axboe@suse.de, linux-kernel@vger.kernel.org
Subject: Re: Patch??: linux-2.5.20/fs/bio.c - ll_rw_kio could generate bio's
 bigger than queue could handle
In-Reply-To: <200206061331.GAA00287@baldur.yggdrasil.com>
Content-Type: text/plain; charset=us-ascii; format=flowed
Content-Transfer-Encoding: 7bit
Sender: linux-kernel-owner@vger.kernel.org

Adam,
	Your patch version 3 did the trick for me.  It booted past the panic, but 
it's now hung right after init loads.  After init spits out "INIT: " it 
locks up..  I've put in kdb, and all the cpus seem to be spinning in the 
TIF_NEED_RESCHED loop in poll_idle.  I dunno if this is in any way 
related to the block stuff or not...

But the panic stopped happening!

Cheers!

-Matt


Adam J. Richter wrote:
> Here is version 3 of my changes to ll_rw_kio in fs/bio.c and fs/mpage.c.
> The differences from the previous version are that do_mpage_readpage
> and mpage_writepage now precompute bdev differently, and I have
> tried to factor out a little bit of common code between bio.c and
> mpage.c in the form of a routine bio_append().
> 
> I now know that I am actually running the mpage.c code, because I
> screwed up one of my changes and got to see the resulting kernel
> panic.  On the other hand, I do not think that I have caused ll_rw_kio
> to be executed.  So, I beileve my ll_rw_kio changes are completely
> untested.
> 
> For what it's worth, I am composing this email on a machine running
> the patch that I have attached below.
> 
> Adam J. Richter     __     ______________   575 Oroville Road
> adam@yggdrasil.com     \ /                  Milpitas, California 95035
> +1 408 309-6081         | g g d r a s i l   United States of America
>                          "Free Software For The Rest Of Us."
> 
> 
> --- linux-2.5.20/include/linux/bio.h	2002-06-02 18:44:52.000000000 -0700
> +++ linux/include/linux/bio.h	2002-06-06 06:30:04.000000000 -0700
> @@ -34,9 +34,6 @@
>  #define BIO_BUG_ON
>  #endif
>  
> -#define BIO_MAX_SECTORS	128
> -#define BIO_MAX_SIZE	(BIO_MAX_SECTORS << 9)
> -
>  /*
>   * was unsigned short, but we might as well be ready for > 64kB I/O pages
>   */
> @@ -201,6 +198,8 @@
>  extern struct bio *bio_clone(struct bio *, int);
>  extern struct bio *bio_copy(struct bio *, int, int);
>  
> +extern int bio_append(struct bio **bio_p,
> +		      struct page *page, int len, int offset);
>  extern inline void bio_init(struct bio *);
>  
>  extern int bio_ioctl(kdev_t, unsigned int, unsigned long);
> --- linux-2.5.20/include/linux/blkdev.h	2002-06-02 18:44:44.000000000 -0700
> +++ linux/include/linux/blkdev.h	2002-06-06 04:40:25.000000000 -0700
> @@ -191,9 +191,9 @@
>  	 * queue settings
>  	 */
>  	unsigned short		max_sectors;
> -	unsigned short		max_phys_segments;
> -	unsigned short		max_hw_segments;
> +	unsigned short		max_phys_segments; 	/* <= max_sectors */
> +	unsigned short		max_hw_segments; 	/* <= max_sectors */
>  	unsigned short		hardsect_size;
>  	unsigned int		max_segment_size;
>  
> @@ -418,5 +428,7 @@
>  	page_cache_release(p.v);
>  }
>  
> +extern int bio_max_iovecs(request_queue_t *q, int *iovec_size);
> +
>  #endif
> --- linux-2.5.20/fs/bio.c	2002-06-02 18:44:40.000000000 -0700
> +++ linux/fs/bio.c	2002-06-06 06:13:52.000000000 -0700
> @@ -316,6 +316,64 @@
>  	return NULL;
>  }
>  
> +int bio_max_iovecs(request_queue_t *q, int *iovec_size)
> +{
> +	const int max_bytes = q->max_sectors << 9;
> +	int max_iovecs;
> +
> +	if (*iovec_size > max_bytes) {
> +		*iovec_size = max_bytes;
> +		return 1;
> +	}
> +	max_iovecs = max_bytes / *iovec_size;
> +	if (max_iovecs > q->max_phys_segments)
> +		max_iovecs = q->max_phys_segments;
> +	if (max_iovecs > q->max_hw_segments)
> +		max_iovecs = q->max_hw_segments;
> +	return max_iovecs;
> +}
> +
> +/* bio_append appends an IO vector to a bio.  bio_append expects to be
> +   called with bio->bi_idx indicating the maximum number of IO vectors
> +   that this bio can hold, and with bio->bi_vcnt indicating the number
> +   of IO vectors already loaded.  If the provided bio is already full,
> +   bio_append will submit the current bio and allocate a new one. */
> +   
> +int bio_append(struct bio **bio_p, struct page *page, int len, int offset)
> +{
> +	struct bio *bio = *bio_p;
> +	struct bio_vec *vec;
> +	if (bio->bi_idx == bio->bi_vcnt) {
> +		struct bio *old = bio; 
> +		*bio_p = bio = bio_alloc(GFP_KERNEL, old->bi_vcnt);
> +		if (bio != NULL) {
> +			bio->bi_sector =
> +				old->bi_sector + (old->bi_size >> 9);
> +
> +#define COPY(field)	bio->bi_ ## field = old->bi_ ## field
> +			COPY(bdev);
> +			COPY(flags);
> +			COPY(idx);
> +			COPY(rw);
> +			COPY(end_io);
> +			COPY(private);
> +#undef COPY
> +		}
> +		old->bi_idx = 0;
> +		submit_bio(old->bi_rw, old);
> +		if (bio == NULL)
> +			return -ENOMEM;
> +	}
> +	vec = &bio->bi_io_vec[bio->bi_vcnt++];
> +	vec->bv_page = page;
> +	vec->bv_len = len;
> +	vec->bv_offset = offset;
> +
> +	bio->bi_size += len;
> +	return 0;
> +}
> +
> +
>  static void bio_end_io_kio(struct bio *bio)
>  {
>  	struct kiobuf *kio = (struct kiobuf *) bio->bi_private;
> @@ -338,10 +396,10 @@
>   **/
>  void ll_rw_kio(int rw, struct kiobuf *kio, struct block_device *bdev, sector_t sector)
>  {
> -	int i, offset, size, err, map_i, total_nr_pages, nr_pages;
> -	struct bio_vec *bvec;
> +	int offset, size, err, map_i, total_nr_pages, nr_bvecs;
>  	struct bio *bio;
>  	kdev_t dev = to_kdev_t(bdev->bd_dev);
> +	int bytes_per_iovec, max_pages;
>  
>  	err = 0;
>  	if ((rw & WRITE) && is_read_only(dev)) {
> @@ -367,63 +425,58 @@
>  
>  	map_i = 0;
>  
> -next_chunk:
> -	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - 9);
> -	if (nr_pages > total_nr_pages)
> -		nr_pages = total_nr_pages;
> +	bytes_per_iovec = PAGE_SIZE;
> +	max_pages = bio_max_iovecs(bdev->bd_queue, &bytes_per_iovec);
> +
> +	nr_bvecs = max_pages;
> +	if (nr_bvecs > total_nr_pages)
> +		nr_bvecs = total_nr_pages;
>  
>  	atomic_inc(&kio->io_count);
>  
>  	/*
>  	 * allocate bio and do initial setup
>  	 */
> -	if ((bio = bio_alloc(GFP_NOIO, nr_pages)) == NULL) {
> +	if ((bio = bio_alloc(GFP_NOIO, nr_bvecs)) == NULL) {
>  		err = -ENOMEM;
>  		goto out;
>  	}
>  
> +	bio->bi_idx = nr_bvecs;		/* for bio_append */
> +	bio->bi_rw = rw;
> +
>  	bio->bi_sector = sector;
>  	bio->bi_bdev = bdev;
> -	bio->bi_idx = 0;
>  	bio->bi_end_io = bio_end_io_kio;
>  	bio->bi_private = kio;
>  
> -	bvec = bio->bi_io_vec;
> -	for (i = 0; i < nr_pages; i++, bvec++, map_i++) {
> +	while (total_nr_pages > 0) {
>  		int nbytes = PAGE_SIZE - offset;
>  
>  		if (nbytes > size)
>  			nbytes = size;
> +		if (nbytes > bytes_per_iovec)
> +			nbytes = bytes_per_iovec;
>  
>  		BUG_ON(kio->maplist[map_i] == NULL);
>  
> -		if (bio->bi_size + nbytes > (BIO_MAX_SECTORS << 9))
> -			goto queue_io;
> -
> -		bio->bi_vcnt++;
> -		bio->bi_size += nbytes;
> +		err = bio_append(&bio, kio->maplist[map_i], nbytes, offset);
> +		if (err)
> +			goto out;
> +
> +		offset = (offset + nbytes) & PAGE_MASK;
> +		if (offset == 0) {
> +			total_nr_pages--;
> +			map_i++;
> +		}
>  
> -		bvec->bv_page = kio->maplist[map_i];
> -		bvec->bv_len = nbytes;
> -		bvec->bv_offset = offset;
> -
> -		/*
> -		 * kiobuf only has an offset into the first page
> -		 */
> -		offset = 0;
> -
> -		sector += nbytes >> 9;
>  		size -= nbytes;
> -		total_nr_pages--;
>  		kio->offset += nbytes;
>  	}
>  
> -queue_io:
> +	bio->bi_idx = 0;
>  	submit_bio(rw, bio);
>  
> -	if (total_nr_pages)
> -		goto next_chunk;
> -
>  	if (size) {
>  		printk("ll_rw_kio: size %d left (kio %d)\n", size, kio->length);
>  		BUG();
> --- linux-2.5.20/fs/mpage.c	2002-06-02 18:44:44.000000000 -0700
> +++ linux/fs/mpage.c	2002-06-06 06:14:13.000000000 -0700
> @@ -21,12 +21,6 @@
>  #include <linux/mpage.h>
>  
>  /*
> - * The largest-sized BIO which this code will assemble, in bytes.  Set this
> - * to PAGE_CACHE_SIZE if your drivers are broken.
> - */
> -#define MPAGE_BIO_MAX_SIZE BIO_MAX_SIZE
> -
> -/*
>   * I/O completion handler for multipage BIOs.
>   *
>   * The mpage code never puts partial pages into a BIO (except for end-of-file).
> @@ -78,19 +72,15 @@
>  	bio_put(bio);
>  }
>  
> -struct bio *mpage_bio_submit(int rw, struct bio *bio)
> +struct bio *mpage_bio_submit(struct bio *bio)
>  {
> -	bio->bi_vcnt = bio->bi_idx;
>  	bio->bi_idx = 0;
> -	bio->bi_end_io = mpage_end_io_read;
> -	if (rw == WRITE)
> -		bio->bi_end_io = mpage_end_io_write;
> -	submit_bio(rw, bio);
> +	submit_bio(bio->bi_rw, bio);
>  	return NULL;
>  }
>  
>  static struct bio *
> -mpage_alloc(struct block_device *bdev,
> +mpage_alloc(struct block_device *bdev, int rw,
>  		sector_t first_sector, int nr_vecs, int gfp_flags)
>  {
>  	struct bio *bio;
> @@ -98,11 +88,14 @@
>  	bio = bio_alloc(gfp_flags, nr_vecs);
>  	if (bio) {
>  		bio->bi_bdev = bdev;
> -		bio->bi_vcnt = nr_vecs;
> -		bio->bi_idx = 0;
> +		bio->bi_vcnt = 0;
> +		bio->bi_idx = nr_vecs;
>  		bio->bi_size = 0;
>  		bio->bi_sector = first_sector;
> -		bio->bi_io_vec[0].bv_page = NULL;
> +		bio->bi_rw = rw;
> +		bio->bi_end_io = mpage_end_io_read;
> +		if (rw == WRITE)
> +			bio->bi_end_io = mpage_end_io_write;
>  	}
>  	return bio;
>  }
> @@ -161,14 +154,19 @@
>  	const unsigned blkbits = inode->i_blkbits;
>  	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
>  	const unsigned blocksize = 1 << blkbits;
> -	struct bio_vec *bvec;
>  	sector_t block_in_file;
>  	sector_t last_block;
>  	sector_t blocks[MAX_BUF_PER_PAGE];
>  	unsigned page_block;
>  	unsigned first_hole = blocks_per_page;
> -	struct block_device *bdev = NULL;
> +	struct block_device *bdev =
> +		S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
>  	struct buffer_head bh;
> +	int bvec_size = PAGE_SIZE;
> +	const unsigned max_bvecs = bio_max_iovecs(bdev->bd_queue, &bvec_size);
> +
> +	if (bvec_size != PAGE_SIZE)
> +		goto confused;
>  
>  	if (page_has_buffers(page))
>  		goto confused;
> @@ -197,7 +195,6 @@
>  		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
>  			goto confused;
>  		blocks[page_block] = bh.b_blocknr;
> -		bdev = bh.b_bdev;
>  	}
>  
>  	if (first_hole != blocks_per_page) {
> @@ -215,28 +212,28 @@
>  	/*
>  	 * This page will go to BIO.  Do we need to send this BIO off first?
>  	 */
> -	if (bio && (bio->bi_idx == bio->bi_vcnt ||
> -			*last_block_in_bio != blocks[0] - 1))
> -		bio = mpage_bio_submit(READ, bio);
> -
>  	if (bio == NULL) {
> -		unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
> +		unsigned int nr_bvecs = max_bvecs;
>  
>  		if (nr_bvecs > nr_pages)
>  			nr_bvecs = nr_pages;
> -		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
> +		bio = mpage_alloc(bdev, READ, blocks[0] << (blkbits - 9),
>  					nr_bvecs, GFP_KERNEL);
>  		if (bio == NULL)
>  			goto confused;
>  	}
> +	else if (*last_block_in_bio != blocks[0] - 1)
> +		bio->bi_idx = bio->bi_vcnt; /* force bio_append to
> +					       allocate a new bio. */
> +
> +	if (bio_append(&bio, page, first_hole << blkbits, 0) != 0)
> +		goto confused;
> +
> +	if (bio->bi_vcnt == 1)
> +		bio->bi_sector = blocks[0] << (blkbits - 9);
>  
> -	bvec = &bio->bi_io_vec[bio->bi_idx++];
> -	bvec->bv_page = page;
> -	bvec->bv_len = (first_hole << blkbits);
> -	bvec->bv_offset = 0;
> -	bio->bi_size += bvec->bv_len;
>  	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
> -		bio = mpage_bio_submit(READ, bio);
> +		bio->bi_idx = bio->bi_vcnt;
>  	else
>  		*last_block_in_bio = blocks[blocks_per_page - 1];
>  out:
> @@ -244,7 +241,7 @@
>  
>  confused:
>  	if (bio)
> -		bio = mpage_bio_submit(READ, bio);
> +		bio = mpage_bio_submit(bio);
>  	block_read_full_page(page, get_block);
>  	goto out;
>  }
> @@ -270,7 +267,7 @@
>  	}
>  	BUG_ON(!list_empty(pages));
>  	if (bio)
> -		mpage_bio_submit(READ, bio);
> +		mpage_bio_submit(bio);
>  	return 0;
>  }
>  EXPORT_SYMBOL(mpage_readpages);
> @@ -286,7 +283,7 @@
>  	bio = do_mpage_readpage(bio, page, 1,
>  			&last_block_in_bio, get_block);
>  	if (bio)
> -		mpage_bio_submit(READ, bio);
> +		mpage_bio_submit(bio);
>  	return 0;
>  }
>  EXPORT_SYMBOL(mpage_readpage);
> @@ -315,14 +312,19 @@
>  	const unsigned blkbits = inode->i_blkbits;
>  	unsigned long end_index;
>  	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
> -	struct bio_vec *bvec;
>  	sector_t last_block;
>  	sector_t block_in_file;
>  	sector_t blocks[MAX_BUF_PER_PAGE];
>  	unsigned page_block;
>  	unsigned first_unmapped = blocks_per_page;
> -	struct block_device *bdev = NULL;
> +	struct block_device *bdev =
> +		S_ISBLK(inode->i_mode) ? inode->i_bdev : inode->i_sb->s_bdev;
>  	int boundary = 0;
> +	int bvec_size = PAGE_SIZE;
> +	const unsigned max_bvecs = bio_max_iovecs(bdev->bd_queue, &bvec_size);
> +
> +	if (bvec_size != PAGE_SIZE)
> +		goto confused;
>  
>  	if (page_has_buffers(page)) {
>  		struct buffer_head *head = page_buffers(page);
> @@ -355,7 +357,6 @@
>  			}
>  			blocks[page_block++] = bh->b_blocknr;
>  			boundary = buffer_boundary(bh);
> -			bdev = bh->b_bdev;
>  		} while ((bh = bh->b_this_page) != head);
>  
>  		if (first_unmapped)
> @@ -391,7 +392,6 @@
>  		}
>  		blocks[page_block++] = map_bh.b_blocknr;
>  		boundary = buffer_boundary(&map_bh);
> -		bdev = map_bh.b_bdev;
>  		if (block_in_file == last_block)
>  			break;
>  		block_in_file++;
> @@ -417,18 +417,14 @@
>  	/*
>  	 * This page will go to BIO.  Do we need to send this BIO off first?
>  	 */
> -	if (bio && (bio->bi_idx == bio->bi_vcnt ||
> -				*last_block_in_bio != blocks[0] - 1))
> -		bio = mpage_bio_submit(WRITE, bio);
> -
>  	if (bio == NULL) {
> -		unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
> -
> -		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
> -					nr_bvecs, GFP_NOFS);
> +		bio = mpage_alloc(bdev, WRITE, blocks[0] << (blkbits - 9),
> +					max_bvecs, GFP_NOFS);
>  		if (bio == NULL)
>  			goto confused;
>  	}
> +	else if (*last_block_in_bio != blocks[0] - 1)
> +		bio->bi_idx = bio->bi_vcnt;
>  
>  	/*
>  	 * OK, we have our BIO, so we can now mark the buffers clean.  Make
> @@ -447,23 +443,21 @@
>  		} while (bh != head);
>  	}
>  
> -	bvec = &bio->bi_io_vec[bio->bi_idx++];
> -	bvec->bv_page = page;
> -	bvec->bv_len = (first_unmapped << blkbits);
> -	bvec->bv_offset = 0;
> -	bio->bi_size += bvec->bv_len;
> +	bio_append(&bio, page, first_unmapped << blkbits, 0);
> +	if (bio->bi_vcnt == 1)
> +		bio->bi_sector = blocks[0] << (blkbits - 9);
>  	BUG_ON(PageWriteback(page));
>  	SetPageWriteback(page);
>  	unlock_page(page);
>  	if (boundary || (first_unmapped != blocks_per_page))
> -		bio = mpage_bio_submit(WRITE, bio);
> +		bio = mpage_bio_submit(bio);
>  	else
>  		*last_block_in_bio = blocks[blocks_per_page - 1];
>  	goto out;
>  
>  confused:
>  	if (bio)
> -		bio = mpage_bio_submit(WRITE, bio);
> +		bio = mpage_bio_submit(bio);
>  	*ret = block_write_full_page(page, get_block);
>  out:
>  	return bio;
> @@ -541,7 +535,7 @@
>  	}
>  	write_unlock(&mapping->page_lock);
>  	if (bio)
> -		mpage_bio_submit(WRITE, bio);
> +		mpage_bio_submit(bio);
>  	return ret;
>  }
>  EXPORT_SYMBOL(mpage_writepages);
> 


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/