2002-12-10 22:42:34

by Kevin Corry

Subject: [PATCH] dm.c - device-mapper I/O path fixes

Joe, Linus,

This patch fixes problems with the device-mapper I/O path in 2.5.51. The
existing code does not properly split requests when necessary, and can
cause segfaults and/or data corruption. This can easily manifest itself
when running XFS on striped LVM volumes.

Notes:
- New bio's must be alloc'd instead of clone'd, since the bio vector may need
to be adjusted if the end of the split request does not fill a page.
- Fix reference counting of md->pending. This should only be incremented once
for each incoming bio, not for each split bio that is resubmitted.
- Copy the correct bvec when splitting the tail-end of a page.


--
Kevin Corry
[email protected]
http://evms.sourceforge.net/


--- linux-2.5.51a/drivers/md/dm.c Tue Dec 10 11:01:13 2002
+++ linux-2.5.51b/drivers/md/dm.c Tue Dec 10 11:03:55 2002
@@ -242,17 +242,18 @@
static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;

- spin_lock_irqsave(&_uptodate_lock, flags);
- if (error)
+ if (error) {
+ spin_lock_irqsave(&_uptodate_lock, flags);
io->error = error;
- spin_unlock_irqrestore(&_uptodate_lock, flags);
+ spin_unlock_irqrestore(&_uptodate_lock, flags);
+ }

if (atomic_dec_and_test(&io->io_count)) {
if (atomic_dec_and_test(&io->md->pending))
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);

- bio_endio(io->bio, io->error ? 0 : io->bio->bi_size, io->error);
+ bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io);
}
}
@@ -261,15 +262,15 @@
{
struct dm_io *io = bio->bi_private;

- /*
- * Only call dec_pending if the clone has completely
- * finished. If a partial io errors I'm assuming it won't
- * be requeued. FIXME: check this.
- */
- if (error || !bio->bi_size) {
- dec_pending(io, error);
- bio_put(bio);
+ if (bio->bi_size)
+ return 1;
+
+ if (error) {
+ struct gendisk *disk = dm_disk(io->md);
+ DMWARN("I/O error (%d) on device %s\n", error, disk->disk_name);
}
+ dec_pending(io, error);
+ bio_put(bio);

return 0;
}
@@ -313,7 +314,6 @@
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&io->md->pending);
atomic_inc(&io->io_count);
r = ti->type->map(ti, clone);
if (r > 0)
@@ -341,9 +341,7 @@
{
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
struct bio *clone, *bio = ci->bio;
- struct bio_vec *bv = bio->bi_io_vec + (bio->bi_vcnt - 1);
-
- DMWARN("splitting page");
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;

if (len > ci->sector_count)
len = ci->sector_count;
@@ -353,11 +351,13 @@

clone->bi_sector = ci->sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_flags = bio->bi_flags | (1 << BIO_SEG_VALID);
clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = 1;
clone->bi_size = len << SECTOR_SHIFT;
clone->bi_end_io = clone_endio;
clone->bi_private = ci->io;
+ clone->bi_io_vec->bv_offset = clone->bi_io_vec->bv_len - clone->bi_size;
+ clone->bi_io_vec->bv_len = clone->bi_size;

ci->sector += len;
ci->sector_count -= len;
@@ -369,24 +369,48 @@
{
struct bio *clone, *bio = ci->bio;
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
- sector_t len = max_io_len(ci->md, bio->bi_sector, ti);
+ sector_t bv_len, len = max_io_len(ci->md, ci->sector, ti);
+ struct bio_vec *bv;
+ int i, vcnt = bio->bi_vcnt - ci->idx;

/* shorter than current target ? */
if (ci->sector_count < len)
len = ci->sector_count;

/* create the clone */
- clone = bio_clone(ci->bio, GFP_NOIO);
+ clone = bio_alloc(GFP_NOIO, vcnt);
+ if (!clone) {
+ dec_pending(ci->io, -ENOMEM);
+ return;
+ }
clone->bi_sector = ci->sector;
- clone->bi_idx = ci->idx;
+ clone->bi_bdev = bio->bi_bdev;
+ clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = vcnt;
clone->bi_size = len << SECTOR_SHIFT;
clone->bi_end_io = clone_endio;
clone->bi_private = ci->io;

+ /* copy the original vector and adjust if necessary. */
+ memcpy(clone->bi_io_vec, bio->bi_io_vec + ci->idx,
+ vcnt * sizeof(*clone->bi_io_vec));
+ bv_len = len << SECTOR_SHIFT;
+ bio_for_each_segment(bv, clone, i) {
+ if (bv_len >= bv->bv_len) {
+ bv_len -= bv->bv_len;
+ } else {
+ bv->bv_len = bv_len;
+ clone->bi_vcnt = i + 1;
+ break;
+ }
+ }
+
+ /* submit this io */
+ __map_bio(ti, clone);
+
/* adjust the remaining io */
ci->sector += len;
ci->sector_count -= len;
- __map_bio(ti, clone);

/*
* If we are not performing all remaining io in this
@@ -395,8 +419,8 @@
*/
if (ci->sector_count) {
while (len) {
- struct bio_vec *bv = clone->bi_io_vec + ci->idx;
- sector_t bv_len = bv->bv_len >> SECTOR_SHIFT;
+ bv = bio->bi_io_vec + ci->idx;
+ bv_len = bv->bv_len >> SECTOR_SHIFT;
if (bv_len <= len)
len -= bv_len;

@@ -427,6 +451,8 @@
ci.sector_count = bio_sectors(bio);
ci.idx = 0;

+ atomic_inc(&ci.io->md->pending);
+
while (ci.sector_count)
__clone_and_map(&ci);

@@ -457,13 +483,13 @@
up_read(&md->lock);

if (bio_rw(bio) == READA) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;
}

r = queue_io(md, bio);
if (r < 0) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;

} else if (r == 0)


2002-12-11 12:10:33

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

Kevin,

On Tue, Dec 10, 2002 at 04:03:47PM -0600, Kevin Corry wrote:
> Joe, Linus,
>
> This patch fixes problems with the device-mapper I/O path in 2.5.51. The
> existing code does not properly split requests when necessary, and can
> cause segfaults and/or data corruption. This can easily manifest itself
> when running XFS on striped LVM volumes.

Many thanks for doing this work, but _please_ split your patches up more.
There are several changes rolled in here.

I've split the patch up, and will post the ones I'm accepting as
replies to this current mail.

The full set of changes for 2.5.51 is available here:
http://people.sistina.com/~thornber/patches/2.5-stable/2.5.51/

This works for me with xfs and stripes (limited testing).


These are the bits of your patch that I have queries about:

--- linux-2.5.51a/drivers/md/dm.c Tue Dec 10 11:01:13 2002
+++ linux-2.5.51b/drivers/md/dm.c Tue Dec 10 11:03:55 2002
@@ -242,4 +242,4 @@
- bio_endio(io->bio, io->error ? 0 : io->bio->bi_size, io->error);
+ bio_endio(io->bio, io->bio->bi_size, io->error);

You seem to be assuming that io->bio->bi_size will always be zero if
an error occurs. I was not aware that this was the case.


@@ -261,15 +262,15 @@
{
struct dm_io *io = bio->bi_private;

- /*
- * Only call dec_pending if the clone has completely
- * finished. If a partial io errors I'm assuming it won't
- * be requeued. FIXME: check this.
- */
- if (error || !bio->bi_size) {
- dec_pending(io, error);
- bio_put(bio);
+ if (bio->bi_size)
+ return 1;
+
+ if (error) {
+ struct gendisk *disk = dm_disk(io->md);
+ DMWARN("I/O error (%d) on device %s\n", error, disk->disk_name);
}
+ dec_pending(io, error);
+ bio_put(bio);

return 0;
}


All you're doing here is adding a warning (which is nice), and making
the same assumption about bio->bi_size in the case of an error.



@@ -457,13 +483,13 @@
up_read(&md->lock);

if (bio_rw(bio) == READA) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;
}

r = queue_io(md, bio);
if (r < 0) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;

} else if (r == 0)

Why is it better to say that all the io was 'done' rather than none?
It did fail after all.


@@ -369,24 +369,48 @@
{
struct bio *clone, *bio = ci->bio;
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
- sector_t len = max_io_len(ci->md, bio->bi_sector, ti);
+ sector_t bv_len, len = max_io_len(ci->md, ci->sector, ti);
+ struct bio_vec *bv;
+ int i, vcnt = bio->bi_vcnt - ci->idx;

/* shorter than current target ? */
if (ci->sector_count < len)
len = ci->sector_count;

/* create the clone */
- clone = bio_clone(ci->bio, GFP_NOIO);
+ clone = bio_alloc(GFP_NOIO, vcnt);
+ if (!clone) {
+ dec_pending(ci->io, -ENOMEM);
+ return;
+ }
clone->bi_sector = ci->sector;
- clone->bi_idx = ci->idx;
+ clone->bi_bdev = bio->bi_bdev;
+ clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = vcnt;
clone->bi_size = len << SECTOR_SHIFT;
clone->bi_end_io = clone_endio;
clone->bi_private = ci->io;

+ /* copy the original vector and adjust if necessary. */
+ memcpy(clone->bi_io_vec, bio->bi_io_vec + ci->idx,
+ vcnt * sizeof(*clone->bi_io_vec));
+ bv_len = len << SECTOR_SHIFT;
+ bio_for_each_segment(bv, clone, i) {
+ if (bv_len >= bv->bv_len) {
+ bv_len -= bv->bv_len;
+ } else {
+ bv->bv_len = bv_len;
+ clone->bi_vcnt = i + 1;
+ break;
+ }
+ }
+
+ /* submit this io */
+ __map_bio(ti, clone);
+
/* adjust the remaining io */
ci->sector += len;
ci->sector_count -= len;
- __map_bio(ti, clone);

/*
* If we are not performing all remaining io in this
@@ -395,9 +419,9 @@
*/
if (ci->sector_count) {
while (len) {
- struct bio_vec *bv = clone->bi_io_vec + ci->idx;
- sector_t bv_len = bv->bv_len >> SECTOR_SHIFT;
+ bv = bio->bi_io_vec + ci->idx;
+ bv_len = bv->bv_len >> SECTOR_SHIFT;
if (bv_len <= len)
len -= bv_len;


There is no need to use bio_alloc in preference to bio_clone, we're
not changing the bvec in any way. All of the above code is redundant.

- Joe

2002-12-11 12:12:15

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

md->pending was being incremented for each clone rather than just
once. [Kevin Corry]

--- diff/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-11 12:00:39.000000000 +0000
@@ -310,7 +310,6 @@
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&io->md->pending);
atomic_inc(&io->io_count);
r = ti->type->map(ti, clone);
if (r > 0)
@@ -424,6 +423,7 @@
ci.sector_count = bio_sectors(bio);
ci.idx = 0;

+ atomic_inc(&md->pending);
while (ci.sector_count)
__clone_and_map(&ci);

2002-12-11 12:13:02

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

Some fields in the duplicated bio weren't being set up properly in
__split_page(). [Kevin Corry]

--- diff/drivers/md/dm.c 2002-12-11 12:00:39.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-11 12:00:44.000000000 +0000
@@ -337,7 +337,7 @@
{
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
struct bio *clone, *bio = ci->bio;
- struct bio_vec *bv = bio->bi_io_vec + (bio->bi_vcnt - 1);
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;

DMWARN("splitting page");

@@ -349,11 +349,13 @@

clone->bi_sector = ci->sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_flags = bio->bi_flags | (1 << BIO_SEG_VALID);
clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = 1;
clone->bi_size = len << SECTOR_SHIFT;
clone->bi_end_io = clone_endio;
clone->bi_private = ci->io;
+ clone->bi_io_vec->bv_offset = bv->bv_len - clone->bi_size;
+ clone->bi_io_vec->bv_len = clone->bi_size;

ci->sector += len;
ci->sector_count -= len;

2002-12-11 12:14:21

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

Remove some paranoia in highmem.c, need to check this with Jens Axboe.

--- diff/mm/highmem.c 2002-11-11 11:09:43.000000000 +0000
+++ source/mm/highmem.c 2002-12-11 12:00:48.000000000 +0000
@@ -452,8 +452,6 @@
mempool_t *pool;
int bio_gfp;

- BUG_ON((*bio_orig)->bi_idx);
-
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,

2002-12-11 12:11:34

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

dec_pending(): only bother spin locking if io->error is going to be
updated. [Kevin Corry]

--- diff/drivers/md/dm.c 2002-12-11 12:00:29.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
@@ -238,10 +238,11 @@
static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;

- spin_lock_irqsave(&_uptodate_lock, flags);
- if (error)
+ if (error) {
+ spin_lock_irqsave(&_uptodate_lock, flags);
io->error = error;
- spin_unlock_irqrestore(&_uptodate_lock, flags);
+ spin_unlock_irqrestore(&_uptodate_lock, flags);
+ }

if (atomic_dec_and_test(&io->io_count)) {
if (atomic_dec_and_test(&io->md->pending))

2002-12-11 13:35:16

by Denis Vlasenko

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On 11 December 2002 10:19, Joe Thornber wrote:
> dec_pending(): only bother spin locking if io->error is going to be
> updated. [Kevin Corry]
>
> --- diff/drivers/md/dm.c 2002-12-11 12:00:29.000000000 +0000
> +++ source/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
> @@ -238,10 +238,11 @@
> static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
> unsigned long flags;
>
> - spin_lock_irqsave(&_uptodate_lock, flags);
> - if (error)
> + if (error) {
> + spin_lock_irqsave(&_uptodate_lock, flags);
> io->error = error;
> - spin_unlock_irqrestore(&_uptodate_lock, flags);
> + spin_unlock_irqrestore(&_uptodate_lock, flags);
> + }
>
> if (atomic_dec_and_test(&io->io_count)) {
> if (atomic_dec_and_test(&io->md->pending))

This seems pointless, end result:

spin_lock_irqsave(&_uptodate_lock, flags);
io->error = error;
spin_unlock_irqrestore(&_uptodate_lock, flags);


BTW, the function we are looking at:

static inline void dec_pending(struct dm_io *io, int error)
^^^^^^
is too big for inlining.
--
vda

2002-12-11 13:31:23

by Kevin Corry

Subject: Re: [lvm-devel] Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wednesday 11 December 2002 06:17, Joe Thornber wrote:
> Kevin,
>
> On Tue, Dec 10, 2002 at 04:03:47PM -0600, Kevin Corry wrote:
> > Joe, Linus,
> >
> > This patch fixes problems with the device-mapper I/O path in 2.5.51. The
> > existing code does not properly split requests when necessary, and can
> > cause segfaults and/or data corruption. This can easily manifest itself
> > when running XFS on striped LVM volumes.
>
> Many thanks for doing this work, but _please_ split your patches up more.
> There are several changes rolled in here.

Sorry if I included too many changes in one post. I guess I didn't think the
patch was really that big.

> I've split the patch up, and will post the ones I'm accepting as
> replies to this current mail.
>
> The full set of changes for 2.5.51 is available here:
> http://people.sistina.com/~thornber/patches/2.5-stable/2.5.51/
>
> This works for me with xfs and stripes (limited testing).
>
>
> These are the bits of your patch that I have queries about:
>
> --- linux-2.5.51a/drivers/md/dm.c Tue Dec 10 11:01:13 2002
> +++ linux-2.5.51b/drivers/md/dm.c Tue Dec 10 11:03:55 2002
> @@ -242,4 +242,4 @@
> - bio_endio(io->bio, io->error ? 0 : io->bio->bi_size, io->error);
> + bio_endio(io->bio, io->bio->bi_size, io->error);
>
> You seem to be assuming that io->bio->bi_size will always be zero if
> an error occurs. I was not aware that this was the case.

I'm simply going by the convention used by the other kernel drivers. Do a
grep for bio_endio and bio_io_error in drivers/. Most drivers (e.g. md, loop,
rd, umem) call bio_endio() and bio_io_error() with the current bi_size of the
bio they're completing.


> @@ -261,15 +262,15 @@
> {
> struct dm_io *io = bio->bi_private;
>
> - /*
> - * Only call dec_pending if the clone has completely
> - * finished. If a partial io errors I'm assuming it won't
> - * be requeued. FIXME: check this.
> - */
> - if (error || !bio->bi_size) {
> - dec_pending(io, error);
> - bio_put(bio);
> + if (bio->bi_size)
> + return 1;
> +
> + if (error) {
> + struct gendisk *disk = dm_disk(io->md);
> + DMWARN("I/O error (%d) on device %s\n", error, disk->disk_name);
> }
> + dec_pending(io, error);
> + bio_put(bio);
>
> return 0;
> }
>
>
> All you're doing here is adding a warning (which is nice), and making
> the same assumption about bio->bi_size in the case of an error.

Again, I changed this based on conventions used by other drivers. Take a look
at loop_end_io_transfer() in drivers/block/loop.c, or end_request() and
end_sync_write() in drivers/md/raid1.c. If a driver doesn't want to bother
with partial bio completions (and DM shouldn't), it should do a
if (bio->bi_size) return 1;
statement at the top of its callback. Check out the original comments
regarding this in the BK tree at:
http://linux.bkbits.net:8080/linux-2.5/[email protected]?nav=index.html|ChangeSet@-1y

> @@ -457,13 +483,13 @@
> up_read(&md->lock);
>
> if (bio_rw(bio) == READA) {
> - bio_io_error(bio, 0);
> + bio_io_error(bio, bio->bi_size);
> return 0;
> }
>
> r = queue_io(md, bio);
> if (r < 0) {
> - bio_io_error(bio, 0);
> + bio_io_error(bio, bio->bi_size);
> return 0;
>
> } else if (r == 0)
>
> Why is it better to say that all the io was 'done' rather than none?
> It did fail after all.

See comments above.

> @@ -369,24 +369,48 @@
> {
> struct bio *clone, *bio = ci->bio;
> struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
> - sector_t len = max_io_len(ci->md, bio->bi_sector, ti);
> + sector_t bv_len, len = max_io_len(ci->md, ci->sector, ti);
> + struct bio_vec *bv;
> + int i, vcnt = bio->bi_vcnt - ci->idx;
>
> /* shorter than current target ? */
> if (ci->sector_count < len)
> len = ci->sector_count;
>
> /* create the clone */
> - clone = bio_clone(ci->bio, GFP_NOIO);
> + clone = bio_alloc(GFP_NOIO, vcnt);
> + if (!clone) {
> + dec_pending(ci->io, -ENOMEM);
> + return;
> + }
> clone->bi_sector = ci->sector;
> - clone->bi_idx = ci->idx;
> + clone->bi_bdev = bio->bi_bdev;
> + clone->bi_rw = bio->bi_rw;
> + clone->bi_vcnt = vcnt;
> clone->bi_size = len << SECTOR_SHIFT;
> clone->bi_end_io = clone_endio;
> clone->bi_private = ci->io;
>
> + /* copy the original vector and adjust if necessary. */
> + memcpy(clone->bi_io_vec, bio->bi_io_vec + ci->idx,
> + vcnt * sizeof(*clone->bi_io_vec));
> + bv_len = len << SECTOR_SHIFT;
> + bio_for_each_segment(bv, clone, i) {
> + if (bv_len >= bv->bv_len) {
> + bv_len -= bv->bv_len;
> + } else {
> + bv->bv_len = bv_len;
> + clone->bi_vcnt = i + 1;
> + break;
> + }
> + }
> +
> + /* submit this io */
> + __map_bio(ti, clone);
> +
> /* adjust the remaining io */
> ci->sector += len;
> ci->sector_count -= len;
> - __map_bio(ti, clone);
>
> /*
> * If we are not performing all remaining io in this
> @@ -395,9 +419,9 @@
> */
> if (ci->sector_count) {
> while (len) {
> - struct bio_vec *bv = clone->bi_io_vec + ci->idx;
> - sector_t bv_len = bv->bv_len >> SECTOR_SHIFT;
> + bv = bio->bi_io_vec + ci->idx;
> + bv_len = bv->bv_len >> SECTOR_SHIFT;
> if (bv_len <= len)
> len -= bv_len;
>
>
> There is no need to use bio_alloc in preference to bio_clone, we're
> not changing the bvec in any way. All of the above code is redundant.

Yes, we *are* going to have to change the bvec (as I implied in my original
post). If you split a bio, and that split occurs in the middle of one of the
bvecs (i.e. in the middle of a page), then the bv_len field in that bvec
*must* be adjusted accordingly (which is what the bio_for_each_segment loop
above does). Otherwise, when blk_rq_map_sg() in drivers/block/ll_rw_blk.c
processes that bio, it will think that bvec contains a full page. Since that
page obviously spans a stripe or PE boundary, this is going to cause data
corruption.
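
To make that concrete, here is a minimal sketch of the adjustment (hypothetical
helper name and signature, not the actual dm.c code): given a bvec array that
has been memcpy'd from the parent bio, shrink it so it describes exactly the
bytes belonging to the split.

#include <linux/bio.h>	/* struct bio_vec */

/*
 * Trim a copied bvec array down to "bytes" of data.  Without this,
 * blk_rq_map_sg() would treat the final segment as a full page.
 */
static void trim_bvecs(struct bio_vec *bvec, unsigned short *vcnt,
                       unsigned int bytes)
{
        unsigned short i;

        for (i = 0; i < *vcnt && bytes; i++) {
                if (bytes >= bvec[i].bv_len) {
                        /* this segment is used in full */
                        bytes -= bvec[i].bv_len;
                } else {
                        /* the split lands inside this segment: shrink it */
                        bvec[i].bv_len = bytes;
                        bytes = 0;
                }
        }

        /* drop any segments beyond the split point */
        *vcnt = i;
}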

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2002-12-11 13:55:35

by Kevin Corry

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wednesday 11 December 2002 12:19, Denis Vlasenko wrote:
> On 11 December 2002 10:19, Joe Thornber wrote:
> > dec_pending(): only bother spin locking if io->error is going to be
> > updated. [Kevin Corry]
> >
> > --- diff/drivers/md/dm.c 2002-12-11 12:00:29.000000000 +0000
> > +++ source/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
> > @@ -238,10 +238,11 @@
> > static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
> > unsigned long flags;
> >
> > - spin_lock_irqsave(&_uptodate_lock, flags);
> > - if (error)
> > + if (error) {
> > + spin_lock_irqsave(&_uptodate_lock, flags);
> > io->error = error;
> > - spin_unlock_irqrestore(&_uptodate_lock, flags);
> > + spin_unlock_irqrestore(&_uptodate_lock, flags);
> > + }
> >
> > if (atomic_dec_and_test(&io->io_count)) {
> > if (atomic_dec_and_test(&io->md->pending))
>
> This seems pointless, end result:
>
> spin_lock_irqsave(&_uptodate_lock, flags);
> io->error = error;
> spin_unlock_irqrestore(&_uptodate_lock, flags);

Are you saying the "if (error)" part is pointless? If so, I have to disagree.
A bio may be split into several sub-bio's. When each of those split bio's
completes, they are going to call this function. But if only one of those
split bio's has an error, then the error might get lost without that "if"
statement.

However, it might be a good idea to consider how bio's keep track of errors.
When a bio is created, it is marked UPTODATE. Then, if any part of a bio
takes an error, the UPTODATE flag is turned off. When the whole bio
completes, if the UPTODATE flag is still on, there were no errors during the
i/o. Perhaps the "error" field in "struct dm_io" could be modified to use
this method of error tracking? Then we could change dec_pending() to be
something like:

if (error)
clear_bit(DM_IO_UPTODATE, &io->error);

with a "set_bit(DM_IO_UPTODATE, &ci.io->error);" in __bio_split().

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2002-12-11 14:10:50

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wed, Dec 11, 2002 at 07:16:53AM -0600, Kevin Corry wrote:
> However, it might be a good idea to consider how bio's keep track of errors.
> When a bio is created, it is marked UPTODATE. Then, if any part of a bio
> takes an error, the UPTODATE flag is turned off. When the whole bio
> completes, if the UPTODATE flag is still on, there were no errors during the
> i/o. Perhaps the "error" field in "struct dm_io" could be modified to use
> this method of error tracking? Then we could change dec_pending() to be
> something like:
>
> if (error)
> clear_bit(DM_IO_UPTODATE, &io->error);
>
> with a "set_bit(DM_IO_UPTODATE, &ci.io->error);" in __bio_split().

The problem with this is you don't keep track of the specific error to
later pass to bio_endio(io->bio...). I guess it all comes down to
just how expensive that spin lock is; and since locking only occurs
when there's an error I'm happy with things as they are.

- Joe

2002-12-11 14:31:07

by Denis Vlasenko

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On 11 December 2002 11:16, Kevin Corry wrote:
> > > --- diff/drivers/md/dm.c 2002-12-11 12:00:29.000000000 +0000
> > > +++ source/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
> > > @@ -238,10 +238,11 @@
> > > static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
> > > unsigned long flags;
> > >
> > > - spin_lock_irqsave(&_uptodate_lock, flags);
> > > - if (error)
> > > + if (error) {
> > > + spin_lock_irqsave(&_uptodate_lock, flags);
> > > io->error = error;
> > > - spin_unlock_irqrestore(&_uptodate_lock, flags);
> > > + spin_unlock_irqrestore(&_uptodate_lock, flags);
> > > + }
> > >
> > > if (atomic_dec_and_test(&io->io_count)) {
> > > if (atomic_dec_and_test(&io->md->pending))
> >
> > This seems pointless, end result:
> >
> > spin_lock_irqsave(&_uptodate_lock, flags);
> > io->error = error;
> > spin_unlock_irqrestore(&_uptodate_lock, flags);
>
> Are you saying the "if (error)" part is pointless? If so, I have to

No. Locking is pointless. What exactly are you trying to protect here?

> disagree. A bio may be split into several sub-bio's. When each of
> those split bio's completes, they are going to call this function.
> But if only one of those split bio's has an error, then the error
> might get lost without that "if" statement.
>
> However, it might be a good idea to consider how bio's keep track of
> errors. When a bio is created, it is marked UPTODATE. Then, if any
> part of a bio takes an error, the UPTODATE flag is turned off. When
> the whole bio completes, if the UPTODATE flag is still on, there were
> no errors during the i/o. Perhaps the "error" field in "struct dm_io"
> could be modified to use this method of error tracking? Then we could
> change dec_pending() to be something like:
>
> if (error)
> clear_bit(DM_IO_UPTODATE, &io->error);
>
> with a "set_bit(DM_IO_UPTODATE, &ci.io->error);" in __bio_split().

You seem to overestimate my knowledge of this part of the kernel. :(
--
vda

2002-12-11 14:36:35

by Denis Vlasenko

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On 11 December 2002 12:18, Joe Thornber wrote:
> On Wed, Dec 11, 2002 at 07:16:53AM -0600, Kevin Corry wrote:
> > However, it might be a good idea to consider how bio's keep track
> > of errors. When a bio is created, it is marked UPTODATE. Then, if
> > any part of a bio takes an error, the UPTODATE flag is turned off.
> > When the whole bio completes, if the UPTODATE flag is still on,
> > there were no errors during the i/o. Perhaps the "error" field in
> > "struct dm_io" could be modified to use this method of error
> > tracking? Then we could change dec_pending() to be something like:
> >
> > if (error)
> > clear_bit(DM_IO_UPTODATE, &io->error);
> >
> > with a "set_bit(DM_IO_UPTODATE, &ci.io->error);" in __bio_split().
>
> The problem with this is you don't keep track of the specific error
> to later pass to bio_endio(io->bio...). I guess it all comes down to
> just how expensive that spin lock is; and since locking only occurs
> when there's an error I'm happy with things as they are.

lock();
a = b;
unlock();

Store of ints is atomic anyway. You need locking if a is a larger entity,
say, a struct.
--
vda

2002-12-11 14:41:24

by Kevin Corry

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wednesday 11 December 2002 13:19, Denis Vlasenko wrote:
> On 11 December 2002 11:16, Kevin Corry wrote:
> > > > --- diff/drivers/md/dm.c 2002-12-11 12:00:29.000000000 +0000
> > > > +++ source/drivers/md/dm.c 2002-12-11 12:00:34.000000000 +0000
> > > > @@ -238,10 +238,11 @@
> > > > static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
> > > > unsigned long flags;
> > > >
> > > > - spin_lock_irqsave(&_uptodate_lock, flags);
> > > > - if (error)
> > > > + if (error) {
> > > > + spin_lock_irqsave(&_uptodate_lock, flags);
> > > > io->error = error;
> > > > - spin_unlock_irqrestore(&_uptodate_lock, flags);
> > > > + spin_unlock_irqrestore(&_uptodate_lock, flags);
> > > > + }
> > > >
> > > > if (atomic_dec_and_test(&io->io_count)) {
> > > > if (atomic_dec_and_test(&io->md->pending))
> > >
> > > This seems pointless, end result:
> > >
> > > spin_lock_irqsave(&_uptodate_lock, flags);
> > > io->error = error;
> > > spin_unlock_irqrestore(&_uptodate_lock, flags);
> >
> > Are you saying the "if (error)" part is pointless? If so, I have to
>
> No. Locking is pointless. What exactly are you trying to protect here?

The "struct dm_io *io" that is passed to dec_pending() can be accessed by
multiple threads at the same time, thus some form of locking is required.

I had been thinking about whether the "error" field could be an atomic_t,
which would remove the requirement for the spinlock in dec_pending().
However, I don't know how atomic_t's behave with negative values. I know
atomic_t's are only guaranteed to have 24-bits of precision, yet all arch's
define atomic_t with a signed integer. Can anyone enlighten me on this?

Perhaps we could make "error" an atomic_t, and store the absolute value of
the error code, and always return -error in the bio_endio() call. Or is that
just too ugly?
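
As a purely illustrative sketch of that alternative (assuming io->error becomes
an atomic_t initialised to 0 when the io is set up; -Exxx values are small, so
the magnitude fits comfortably in atomic_t's guaranteed 24 bits):

static void dec_pending(struct dm_io *io, int error)
{
        if (error)
                /* error arrives as e.g. -EIO; store its magnitude.
                 * Concurrent setters simply overwrite each other, the
                 * same "last error wins" behaviour as the spinlock. */
                atomic_set(&io->error, -error);

        if (atomic_dec_and_test(&io->io_count)) {
                if (atomic_dec_and_test(&io->md->pending))
                        /* nudge anyone waiting on suspend queue */
                        wake_up(&io->md->wait);

                /* negate again so the caller sees 0 or a normal -Exxx */
                bio_endio(io->bio, io->bio->bi_size,
                          -atomic_read(&io->error));
                free_io(io);
        }
}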

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2002-12-11 14:44:46

by Kevin Corry

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wednesday 11 December 2002 13:24, Denis Vlasenko wrote:
> On 11 December 2002 12:18, Joe Thornber wrote:
> > On Wed, Dec 11, 2002 at 07:16:53AM -0600, Kevin Corry wrote:
> > > However, it might be a good idea to consider how bio's keep track
> > > of errors. When a bio is created, it is marked UPTODATE. Then, if
> > > any part of a bio takes an error, the UPTODATE flag is turned off.
> > > When the whole bio completes, if the UPTODATE flag is still on,
> > > there were no errors during the i/o. Perhaps the "error" field in
> > > "struct dm_io" could be modified to use this method of error
> > > tracking? Then we could change dec_pending() to be something like:
> > >
> > > if (error)
> > > clear_bit(DM_IO_UPTODATE, &io->error);
> > >
> > > with a "set_bit(DM_IO_UPTODATE, &ci.io->error);" in __bio_split().
> >
> > The problem with this is you don't keep track of the specific error
> > to later pass to bio_endio(io->bio...). I guess it all comes down to
> > just how expensive that spin lock is; and since locking only occurs
> > when there's an error I'm happy with things as they are.
>
> lock();
> a = b;
> unlock();
>
> Store of ints is atomic anyway. You need locking if a is a larger entity,
> say, a struct.

Storing an int is *not* atomic unless it is declared as atomic_t and you use
the appropriate macros (see include/asm-*/atomic.h). Remember, we are talking
about a field in a data structure that can be accessed from multiple threads
on multiple CPUs.

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2002-12-11 14:50:18

by Joe Thornber

Subject: Re: [lvm-devel] Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wed, Dec 11, 2002 at 05:19:33PM -0200, Denis Vlasenko wrote:
> > Are you saying the "if (error)" part is pointless? If so, I have to
>
> No. Locking is pointless. What exactly are you trying to protect here?

io->error from being updated concurrently.

2002-12-11 15:04:18

by Joe Thornber

Subject: Re: [lvm-devel] Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wed, Dec 11, 2002 at 08:02:24AM -0600, Kevin Corry wrote:
> Perhaps we could make "error" and atomic_t, and store the absolute-value of
> the error code, and always return -error in the bio_endio() call. Or is that
> just too ugly?

Too ugly :)

2002-12-11 21:04:47

by Paul Mackerras

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

Kevin Corry writes:

> Storing an int is *not* atomic unless it is declared as atomic_t and you use
> the appropriate macros (see include/asm-*/atomic.h). Remember, we are talking
> about a field in a data structure that can be accessed from multiple threads
> on multiple CPUs.

As a practical matter, I believe that storing an int to an int-aligned
address _is_ actually atomic on any CPU that can run Linux. The
PowerPC architecture spec requires that single-word (i.e. 32-bit)
aligned stores are atomic, for instance, and I think that would be the
case on any other sane architecture as well.

The language lawyers would probably agree with you, though.

Regards,
Paul.

2002-12-12 13:10:41

by Kevin Corry

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

On Wednesday 11 December 2002 15:12, Paul Mackerras wrote:
> Kevin Corry writes:
> > Storing an int is *not* atomic unless it is declared as atomic_t and you
> > use the appropriate macros (see include/asm-*/atomic.h). Remember, we are
> > talking about a field in a data structure that can be accessed from
> > multiple threads on multiple CPUs.
>
> As a practical matter, I believe that storing an int to an int-aligned
> address _is_ actually atomic on any CPU that can run Linux. The
> PowerPC architecture spec requires that single-word (i.e. 32-bit)
> aligned stores are atomic, for instance, and I think that would be the
> case on any other sane architecture as well.

Given the constraints of having properly aligned data on an SMP machine with
the correct cache-coherency hardware, then yes, I will agree that such stores
should be atomic. However, it has been my understanding that these conditions
cannot be guaranteed on every architecture. Thus we're stuck with atomic_t's
so everyone can play nicely together.

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2002-12-16 00:41:15

by Linus Torvalds

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes


Guys, can you please re-send the split-up version of these patches, I'm
currently ignoring them simply because I don't know what the end result of
the discussion really ended up being..

Linus

2002-12-16 09:58:04

by Joe Thornber

Subject: 1/19

Four constants:
DM_DIR,
DM_MAX_TYPE_NAME,
DM_NAME_LEN,
DM_UUID_LEN

were being declared in device-mapper.h. These are all specific to
the ioctl interface, so they've been moved to dm-ioctl.h. Nobody
in userland should ever include <linux/device-mapper.h>, so remove
the #ifdef __KERNEL__ guards.
--- diff/include/linux/device-mapper.h 2002-11-11 11:09:38.000000000 +0000
+++ source/include/linux/device-mapper.h 2002-12-16 09:40:25.000000000 +0000
@@ -7,13 +7,6 @@
#ifndef _LINUX_DEVICE_MAPPER_H
#define _LINUX_DEVICE_MAPPER_H

-#define DM_DIR "mapper" /* Slashes not supported */
-#define DM_MAX_TYPE_NAME 16
-#define DM_NAME_LEN 128
-#define DM_UUID_LEN 129
-
-#ifdef __KERNEL__
-
struct dm_target;
struct dm_table;
struct dm_dev;
@@ -101,6 +94,4 @@
int dm_register_target(struct target_type *t);
int dm_unregister_target(struct target_type *t);

-#endif /* __KERNEL__ */
-
#endif /* _LINUX_DEVICE_MAPPER_H */
--- diff/include/linux/dm-ioctl.h 2002-11-11 11:09:38.000000000 +0000
+++ source/include/linux/dm-ioctl.h 2002-12-16 09:40:25.000000000 +0000
@@ -7,9 +7,13 @@
#ifndef _LINUX_DM_IOCTL_H
#define _LINUX_DM_IOCTL_H

-#include <linux/device-mapper.h>
#include <linux/types.h>

+#define DM_DIR "mapper" /* Slashes not supported */
+#define DM_MAX_TYPE_NAME 16
+#define DM_NAME_LEN 128
+#define DM_UUID_LEN 129
+
/*
* Implements a traditional ioctl interface to the device mapper.
*/

2002-12-16 09:56:53

by Joe Thornber

Subject: Re: [PATCH] dm.c - device-mapper I/O path fixes

Linus,

On Sun, Dec 15, 2002 at 04:50:16PM -0800, Linus Torvalds wrote:
>
> Guys, can you please re-send the split-up version of these patches, I'm
> currently ignoring them simply because I don't know what the end result of
> the discussion really ended up being..

Here is the latest dm patchset (2.5.51-dm-3) that people have been
testing successfully over the weekend. Please apply.

- Joe

2002-12-16 10:05:12

by Joe Thornber

Subject: 10/19

dm_suspend(): Stop holding the read lock around the while loop that
waits for pending io to complete.
--- diff/drivers/md/dm.c 2002-12-16 09:41:03.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:08.000000000 +0000
@@ -715,15 +715,13 @@
}

set_bit(DMF_BLOCK_IO, &md->flags);
+ add_wait_queue(&md->wait, &wait);
up_write(&md->lock);

/*
* Then we wait for the already mapped ios to
* complete.
*/
- down_read(&md->lock);
-
- add_wait_queue(&md->wait, &wait);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);

@@ -734,11 +732,11 @@
}

current->state = TASK_RUNNING;
- remove_wait_queue(&md->wait, &wait);
- up_read(&md->lock);

- /* set_bit is atomic */
+ down_write(&md->lock);
+ remove_wait_queue(&md->wait, &wait);
set_bit(DMF_SUSPENDED, &md->flags);
+ up_write(&md->lock);

return 0;
}

2002-12-16 09:59:29

by Joe Thornber

Subject: 2/19

An error value was not being checked correctly in open_dev().
[Kevin Corry]
--- diff/drivers/md/dm-table.c 2002-11-28 11:30:39.000000000 +0000
+++ source/drivers/md/dm-table.c 2002-12-16 09:40:30.000000000 +0000
@@ -356,7 +356,7 @@
return -ENOMEM;

r = blkdev_get(d->bdev, d->mode, 0, BDEV_RAW);
- if (!r)
+ if (r)
return r;

r = bd_claim(d->bdev, _claim_ptr);

2002-12-16 10:00:13

by Joe Thornber

Subject: 3/19

Return -ENOTBLK if lookup_device() finds the inode, but it
is not a block device. [Christoph Hellwig]

--- diff/drivers/md/dm-table.c 2002-12-16 09:40:30.000000000 +0000
+++ source/drivers/md/dm-table.c 2002-12-16 09:40:34.000000000 +0000
@@ -312,7 +312,7 @@
}

if (!S_ISBLK(inode->i_mode)) {
- r = -EINVAL;
+ r = -ENOTBLK;
goto out;
}

2002-12-16 10:04:06

by Joe Thornber

Subject: 6/19

Minor change for dm-stripe.c: it now tests for a correct chunk size before
allocating the stripe context. [Heinz Mauelshagen]

--- diff/drivers/md/dm-stripe.c 2002-11-18 10:11:54.000000000 +0000
+++ source/drivers/md/dm-stripe.c 2002-12-16 09:40:48.000000000 +0000
@@ -117,6 +117,14 @@
return -EINVAL;
}

+ /*
+ * chunk_size is a power of two
+ */
+ if (!chunk_size || (chunk_size & (chunk_size - 1))) {
+ ti->error = "dm-stripe: Invalid chunk size";
+ return -EINVAL;
+ }
+
if (!multiple(ti->len, stripes, &width)) {
ti->error = "dm-stripe: Target length not divisable by "
"number of stripes";
@@ -134,15 +142,6 @@
sc->stripe_width = width;
ti->split_io = chunk_size;

- /*
- * chunk_size is a power of two
- */
- if (!chunk_size || (chunk_size & (chunk_size - 1))) {
- ti->error = "dm-stripe: Invalid chunk size";
- kfree(sc);
- return -EINVAL;
- }
-
sc->chunk_mask = ((sector_t) chunk_size) - 1;
for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
chunk_size >>= 1;

2002-12-16 10:05:01

by Joe Thornber

Subject: 9/19

queue_io() was checking the DMF_SUSPENDED flag rather than the new
DMF_BLOCK_IO flag. This meant suspend could deadlock under load.
--- diff/drivers/md/dm.c 2002-12-16 09:40:58.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:03.000000000 +0000
@@ -206,7 +206,7 @@

down_write(&md->lock);

- if (!test_bit(DMF_SUSPENDED, &md->flags)) {
+ if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
up_write(&md->lock);
free_deferred(di);
return 1;

2002-12-16 10:02:21

by Joe Thornber

Subject: 4/19

No need to validate the parameters if we are doing a
REMOVE_ALL command.
--- diff/drivers/md/dm-ioctl.c 2002-12-11 11:50:25.000000000 +0000
+++ source/drivers/md/dm-ioctl.c 2002-12-16 09:40:39.000000000 +0000
@@ -986,6 +986,10 @@

static int validate_params(uint cmd, struct dm_ioctl *param)
{
+ /* Ignores parameters */
+ if (cmd == DM_REMOVE_ALL_CMD)
+ return 0;
+
/* Unless creating, either name of uuid but not both */
if (cmd != DM_DEV_CREATE_CMD) {
if ((!*param->uuid && !*param->name) ||

2002-12-16 10:01:41

by Joe Thornber

Subject: 5/19

check_device_area() was comparing bytes with sectors.
[Stefan Lauterbach]
--- diff/drivers/md/dm-table.c 2002-12-16 09:40:34.000000000 +0000
+++ source/drivers/md/dm-table.c 2002-12-16 09:40:44.000000000 +0000
@@ -388,7 +388,7 @@
static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
{
sector_t dev_size;
- dev_size = dd->bdev->bd_inode->i_size;
+ dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
return ((start < dev_size) && (len <= (dev_size - start)));
}

--- diff/drivers/md/dm.c 2002-11-28 11:30:39.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:40:44.000000000 +0000
@@ -16,7 +16,6 @@

static const char *_name = DM_NAME;
#define MAX_DEVICES (1 << KDEV_MINOR_BITS)
-#define SECTOR_SHIFT 9

static int major = 0;
static int _major = 0;
--- diff/drivers/md/dm.h 2002-11-18 10:11:54.000000000 +0000
+++ source/drivers/md/dm.h 2002-12-16 09:40:44.000000000 +0000
@@ -29,6 +29,8 @@
#define SECTOR_FORMAT "%lu"
#endif

+#define SECTOR_SHIFT 9
+
extern struct block_device_operations dm_blk_dops;

/*

2002-12-16 10:06:43

by Joe Thornber

Subject: 11/19

Add a blk_run_queues() call to encourage pending io to flush
when we're doing a dm_suspend().
--- diff/drivers/md/dm.c 2002-12-16 09:41:08.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:12.000000000 +0000
@@ -722,6 +722,7 @@
* Then we wait for the already mapped ios to
* complete.
*/
+ blk_run_queues();
while (1) {
set_current_state(TASK_INTERRUPTIBLE);

2002-12-16 10:09:14

by Joe Thornber

Subject: 13/19

md->pending was being incremented for each clone rather than just
once. [Kevin Corry]
--- diff/drivers/md/dm.c 2002-12-16 09:41:16.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:21.000000000 +0000
@@ -310,7 +310,6 @@
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&io->md->pending);
atomic_inc(&io->io_count);
r = ti->type->map(ti, clone);
if (r > 0)
@@ -424,6 +423,7 @@
ci.sector_count = bio_sectors(bio);
ci.idx = 0;

+ atomic_inc(&md->pending);
while (ci.sector_count)
__clone_and_map(&ci);

2002-12-16 10:12:40

by Joe Thornber

Subject: 18/19

The block layer does not honour bio->bi_size when issuing io; instead
it performs io to the complete bvecs. This means we have to change
the bio splitting code slightly.

Given a bio we repeatedly apply one of the following three operations
until there is no more io left in the bio:

1) The remaining io does not cross an io/target boundary, so just
create a clone and issue all of the io.

2) There are some bvecs at the start of the bio that are not split by
a target boundary. Create a clone for these bvecs only.

3) The first bvec needs splitting, use bio_alloc() to create *two*
bios, one for the first half of the bvec, the other for the second
half. A bvec can never contain more than one boundary.
--- diff/drivers/md/dm.c 2002-12-16 09:41:39.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:42:31.000000000 +0000
@@ -228,6 +228,15 @@
* interests of getting something for people to use I give
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
+static inline sector_t to_sector(unsigned int bytes)
+{
+ return bytes >> SECTOR_SHIFT;
+}
+
+static inline unsigned int to_bytes(sector_t sector)
+{
+ return sector << SECTOR_SHIFT;
+}

/*
* Decrements the number of outstanding ios that a bio has been
@@ -270,16 +279,17 @@
static sector_t max_io_len(struct mapped_device *md,
sector_t sector, struct dm_target *ti)
{
- sector_t len = ti->len;
+ sector_t offset = sector - ti->begin;
+ sector_t len = ti->len - offset;

/* FIXME: obey io_restrictions ! */

+
/*
* Does the target need to split even further ?
*/
if (ti->split_io) {
sector_t boundary;
- sector_t offset = sector - ti->begin;
boundary = dm_round_up(offset + 1, ti->split_io) - offset;

if (len > boundary)
@@ -289,16 +299,17 @@
return len;
}

-static void __map_bio(struct dm_target *ti, struct bio *clone)
+static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io)
{
- struct dm_io *io = clone->bi_private;
int r;

/*
* Sanity checks.
*/
- if (!clone->bi_size)
- BUG();
+ BUG_ON(!clone->bi_size);
+
+ clone->bi_end_io = clone_endio;
+ clone->bi_private = io;

/*
* Map the clone. If r == 0 we don't need to do
@@ -326,77 +337,125 @@
};

/*
- * Issues a little bio that just does the back end of a split page.
+ * Creates a little bio that just does part of a bvec.
*/
-static void __split_page(struct clone_info *ci, unsigned int len)
+static struct bio *split_bvec(struct bio *bio, sector_t sector,
+ unsigned short idx, unsigned int offset,
+ unsigned int len)
{
- struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
- struct bio *clone, *bio = ci->bio;
- struct bio_vec *bv = bio->bi_io_vec + ci->idx;
-
- if (len > ci->sector_count)
- len = ci->sector_count;
+ struct bio *clone;
+ struct bio_vec *bv = bio->bi_io_vec + idx;

clone = bio_alloc(GFP_NOIO, 1);
- memcpy(clone->bi_io_vec, bv, sizeof(*bv));

- clone->bi_sector = ci->sector;
- clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
- clone->bi_vcnt = 1;
- clone->bi_size = len << SECTOR_SHIFT;
- clone->bi_end_io = clone_endio;
- clone->bi_private = ci->io;
- clone->bi_io_vec->bv_offset = bv->bv_len - clone->bi_size;
- clone->bi_io_vec->bv_len = clone->bi_size;
+ if (clone) {
+ memcpy(clone->bi_io_vec, bv, sizeof(*bv));

- ci->sector += len;
- ci->sector_count -= len;
+ clone->bi_sector = sector;
+ clone->bi_bdev = bio->bi_bdev;
+ clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = 1;
+ clone->bi_size = to_bytes(len);
+ clone->bi_io_vec->bv_offset = offset;
+ clone->bi_io_vec->bv_len = clone->bi_size;
+ }

- __map_bio(ti, clone);
+ return clone;
+}
+
+/*
+ * Creates a bio that consists of a range of complete bvecs.
+ */
+static struct bio *clone_bio(struct bio *bio, sector_t sector,
+ unsigned short idx, unsigned short bv_count,
+ unsigned int len)
+{
+ struct bio *clone;
+
+ clone = bio_clone(bio, GFP_NOIO);
+ clone->bi_sector = sector;
+ clone->bi_idx = idx;
+ clone->bi_vcnt = idx + bv_count;
+ clone->bi_size = to_bytes(len);
+
+ return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
struct bio *clone, *bio = ci->bio;
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
- sector_t len = max_io_len(ci->md, bio->bi_sector, ti);
+ sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);

- /* shorter than current target ? */
- if (ci->sector_count < len)
- len = ci->sector_count;
+ if (ci->sector_count <= max) {
+ /*
+ * Optimise for the simple case where we can do all of
+ * the remaining io with a single clone.
+ */
+ clone = clone_bio(bio, ci->sector, ci->idx,
+ bio->bi_vcnt - ci->idx, ci->sector_count);
+ __map_bio(ti, clone, ci->io);
+ ci->sector_count = 0;

- /* create the clone */
- clone = bio_clone(ci->bio, GFP_NOIO);
- clone->bi_sector = ci->sector;
- clone->bi_idx = ci->idx;
- clone->bi_size = len << SECTOR_SHIFT;
- clone->bi_end_io = clone_endio;
- clone->bi_private = ci->io;
+ } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
+ /*
+ * There are some bvecs that don't span targets.
+ * Do as many of these as possible.
+ */
+ int i;
+ sector_t remaining = max;
+ sector_t bv_len;

- /* adjust the remaining io */
- ci->sector += len;
- ci->sector_count -= len;
- __map_bio(ti, clone);
+ for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
+ bv_len = to_sector(bio->bi_io_vec[i].bv_len);

- /*
- * If we are not performing all remaining io in this
- * clone then we need to calculate ci->idx for the next
- * time round.
- */
- if (ci->sector_count) {
- while (len) {
- struct bio_vec *bv = clone->bi_io_vec + ci->idx;
- sector_t bv_len = bv->bv_len >> SECTOR_SHIFT;
- if (bv_len <= len)
- len -= bv_len;
+ if (bv_len > remaining)
+ break;

- else {
- __split_page(ci, bv_len - len);
- len = 0;
- }
- ci->idx++;
+ remaining -= bv_len;
+ len += bv_len;
}
+
+ clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
+ __map_bio(ti, clone, ci->io);
+
+ ci->sector += len;
+ ci->sector_count -= len;
+ ci->idx = i;
+
+ } else {
+ /*
+ * Create two copy bios to deal with io that has
+ * been split across a target.
+ */
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;
+
+ clone = split_bvec(bio, ci->sector, ci->idx,
+ bv->bv_offset, max);
+ if (!clone) {
+ dec_pending(ci->io, -ENOMEM);
+ return;
+ }
+
+ __map_bio(ti, clone, ci->io);
+
+ ci->sector += max;
+ ci->sector_count -= max;
+ ti = dm_table_find_target(ci->md->map, ci->sector);
+
+ len = to_sector(bv->bv_len) - max;
+ clone = split_bvec(bio, ci->sector, ci->idx,
+ bv->bv_offset + to_bytes(max), len);
+ if (!clone) {
+ dec_pending(ci->io, -ENOMEM);
+ return;
+ }
+
+ __map_bio(ti, clone, ci->io);
+
+ ci->sector += len;
+ ci->sector_count -= len;
+ ci->idx++;
}
}

2002-12-16 10:12:41

by Joe Thornber

Subject: 19/19

The linear target was getting the start sector wrong when doing a
dm_get_device(). [Kevin Corry]
--- diff/drivers/md/dm-linear.c 2002-11-18 10:11:54.000000000 +0000
+++ source/drivers/md/dm-linear.c 2002-12-16 09:43:39.000000000 +0000
@@ -43,7 +43,7 @@
goto bad;
}

- if (dm_get_device(ti, argv[0], ti->begin, ti->len,
+ if (dm_get_device(ti, argv[0], lc->start, ti->len,
dm_table_get_mode(ti->table), &lc->dev)) {
ti->error = "dm-linear: Device lookup failed";
goto bad;

2002-12-16 10:07:45

by Joe Thornber

Subject: 12/19

dec_pending(): only bother spin locking if io->error is going to be
updated. [Kevin Corry]
--- diff/drivers/md/dm.c 2002-12-16 09:41:12.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:16.000000000 +0000
@@ -238,10 +238,11 @@
static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
unsigned long flags;

- spin_lock_irqsave(&_uptodate_lock, flags);
- if (error)
+ if (error) {
+ spin_lock_irqsave(&_uptodate_lock, flags);
io->error = error;
- spin_unlock_irqrestore(&_uptodate_lock, flags);
+ spin_unlock_irqrestore(&_uptodate_lock, flags);
+ }

if (atomic_dec_and_test(&io->io_count)) {
if (atomic_dec_and_test(&io->md->pending))

2002-12-16 10:04:06

by Joe Thornber

Subject: 7/19

There's a bug in the dm-stripe.c constructor: it was failing to check if enough
destinations are handed in. [Heinz Mauelshagen]
--- diff/drivers/md/dm-stripe.c 2002-12-16 09:40:48.000000000 +0000
+++ source/drivers/md/dm-stripe.c 2002-12-16 09:40:53.000000000 +0000
@@ -131,6 +131,15 @@
return -EINVAL;
}

+ /*
+ * Do we have enough arguments for that many stripes ?
+ */
+ if (argc != (2 + 2 * stripes)) {
+ ti->error = "dm-stripe: Not enough destinations "
+ "specified";
+ return -EINVAL;
+ }
+
sc = alloc_context(stripes);
if (!sc) {
ti->error = "dm-stripe: Memory allocation for striped context "
@@ -151,13 +160,6 @@
* Get the stripe destinations.
*/
for (i = 0; i < stripes; i++) {
- if (argc < 2) {
- ti->error = "dm-stripe: Not enough destinations "
- "specified";
- kfree(sc);
- return -EINVAL;
- }
-
argv += 2;

r = get_stripe(ti, sc, i, argv);

2002-12-16 10:11:33

by Joe Thornber

[permalink] [raw]
Subject: 17/19

o If there's an error you still need to call bio_endio with bio->bi_size
as the 'done' param.

o Simplify clone_endio.

[Kevin Corry]
--- diff/drivers/md/dm.c 2002-12-16 09:41:34.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:39.000000000 +0000
@@ -249,7 +249,7 @@
/* nudge anyone waiting on suspend queue */
wake_up(&io->md->wait);

- bio_endio(io->bio, io->error ? 0 : io->bio->bi_size, io->error);
+ bio_endio(io->bio, io->bio->bi_size, io->error);
free_io(io->md, io);
}
}
@@ -258,16 +258,11 @@
{
struct dm_io *io = bio->bi_private;

- /*
- * Only call dec_pending if the clone has completely
- * finished. If a partial io errors I'm assuming it won't
- * be requeued. FIXME: check this.
- */
- if (error || !bio->bi_size) {
- dec_pending(io, error);
- bio_put(bio);
- }
+ if (bio->bi_size)
+ return 1;

+ dec_pending(io, error);
+ bio_put(bio);
return 0;
}

@@ -454,13 +449,13 @@
up_read(&md->lock);

if (bio_rw(bio) == READA) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;
}

r = queue_io(md, bio);
if (r < 0) {
- bio_io_error(bio, 0);
+ bio_io_error(bio, bio->bi_size);
return 0;

} else if (r == 0)

2002-12-16 10:10:13

by Joe Thornber

Subject: 16/19

Remove verbose debug message 'Splitting page'.
--- diff/drivers/md/dm.c 2002-12-16 09:41:25.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:34.000000000 +0000
@@ -339,8 +339,6 @@
struct bio *clone, *bio = ci->bio;
struct bio_vec *bv = bio->bi_io_vec + ci->idx;

- DMWARN("splitting page");
-
if (len > ci->sector_count)
len = ci->sector_count;

2002-12-16 10:05:01

by Joe Thornber

Subject: 8/19

Give each device its own io mempool to avoid a potential
deadlock with stacked devices. [HM + EJT]
--- diff/drivers/md/dm.c 2002-12-16 09:40:44.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:40:58.000000000 +0000
@@ -58,11 +58,15 @@
* The current mapping.
*/
struct dm_table *map;
+
+ /*
+ * io objects are allocated from here.
+ */
+ mempool_t *io_pool;
};

#define MIN_IOS 256
static kmem_cache_t *_io_cache;
-static mempool_t *_io_pool;

static __init int local_init(void)
{
@@ -74,18 +78,10 @@
if (!_io_cache)
return -ENOMEM;

- _io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
- mempool_free_slab, _io_cache);
- if (!_io_pool) {
- kmem_cache_destroy(_io_cache);
- return -ENOMEM;
- }
-
_major = major;
r = register_blkdev(_major, _name, &dm_blk_dops);
if (r < 0) {
DMERR("register_blkdev failed");
- mempool_destroy(_io_pool);
kmem_cache_destroy(_io_cache);
return r;
}
@@ -98,7 +94,6 @@

static void local_exit(void)
{
- mempool_destroy(_io_pool);
kmem_cache_destroy(_io_cache);

if (unregister_blkdev(_major, _name) < 0)
@@ -178,14 +173,14 @@
return 0;
}

-static inline struct dm_io *alloc_io(void)
+static inline struct dm_io *alloc_io(struct mapped_device *md)
{
- return mempool_alloc(_io_pool, GFP_NOIO);
+ return mempool_alloc(md->io_pool, GFP_NOIO);
}

-static inline void free_io(struct dm_io *io)
+static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
- mempool_free(io, _io_pool);
+ mempool_free(io, md->io_pool);
}

static inline struct deferred_io *alloc_deferred(void)
@@ -254,7 +249,7 @@
wake_up(&io->md->wait);

bio_endio(io->bio, io->error ? 0 : io->bio->bi_size, io->error);
- free_io(io);
+ free_io(io->md, io);
}
}

@@ -419,7 +414,7 @@

ci.md = md;
ci.bio = bio;
- ci.io = alloc_io();
+ ci.io = alloc_io(md);
ci.io->error = 0;
atomic_set(&ci.io->io_count, 1);
ci.io->bio = bio;
@@ -558,8 +553,17 @@
md->queue.queuedata = md;
blk_queue_make_request(&md->queue, dm_request);

+ md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
+ mempool_free_slab, _io_cache);
+ if (!md->io_pool) {
+ free_minor(md->disk->first_minor);
+ kfree(md);
+ return NULL;
+ }
+
md->disk = alloc_disk(1);
if (!md->disk) {
+ mempool_destroy(md->io_pool);
free_minor(md->disk->first_minor);
kfree(md);
return NULL;
@@ -581,6 +585,7 @@
static void free_dev(struct mapped_device *md)
{
free_minor(md->disk->first_minor);
+ mempool_destroy(md->io_pool);
del_gendisk(md->disk);
put_disk(md->disk);
kfree(md);

2002-12-16 10:10:12

by Joe Thornber

Subject: 15/19

Remove some paranoia in highmem.c
--- diff/mm/highmem.c 2002-11-11 11:09:43.000000000 +0000
+++ source/mm/highmem.c 2002-12-16 09:41:30.000000000 +0000
@@ -452,8 +452,6 @@
mempool_t *pool;
int bio_gfp;

- BUG_ON((*bio_orig)->bi_idx);
-
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,

2002-12-16 10:09:16

by Joe Thornber

Subject: 14/19

Some fields in the duplicated bio weren't being set up properly in
__split_page(). [Kevin Corry]
--- diff/drivers/md/dm.c 2002-12-16 09:41:21.000000000 +0000
+++ source/drivers/md/dm.c 2002-12-16 09:41:25.000000000 +0000
@@ -337,7 +337,7 @@
{
struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
struct bio *clone, *bio = ci->bio;
- struct bio_vec *bv = bio->bi_io_vec + (bio->bi_vcnt - 1);
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;

DMWARN("splitting page");

@@ -349,11 +349,13 @@

clone->bi_sector = ci->sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_flags = bio->bi_flags | (1 << BIO_SEG_VALID);
clone->bi_rw = bio->bi_rw;
+ clone->bi_vcnt = 1;
clone->bi_size = len << SECTOR_SHIFT;
clone->bi_end_io = clone_endio;
clone->bi_private = ci->io;
+ clone->bi_io_vec->bv_offset = bv->bv_len - clone->bi_size;
+ clone->bi_io_vec->bv_len = clone->bi_size;

ci->sector += len;
ci->sector_count -= len;

2002-12-16 10:30:22

by Tomas Szepe

Subject: Re: 6/19

> > + /*
> > + * chunk_size is a power of two
> > + */
> > + if (!chunk_size || (chunk_size & (chunk_size - 1))) {
> > + ti->error = "dm-stripe: Invalid chunk size";
> > + return -EINVAL;
> > + }
>
> Is 1 a valid chunksize then? [It certainly is not a power of two. ;)]

Umm, 2 ^ 0 = 1. Sorry.

--
Tomas Szepe <[email protected]>

2002-12-16 10:28:21

by Tomas Szepe

Subject: Re: 6/19

> + /*
> + * chunk_size is a power of two
> + */
> + if (!chunk_size || (chunk_size & (chunk_size - 1))) {
> + ti->error = "dm-stripe: Invalid chunk size";
> + return -EINVAL;
> + }

Is 1 a valid chunksize then? [It certainly is not a power of two. ;)]
If not, you need

if (chunk_size < 2 || (chunk_size & (chunk_size - 1))) { ... }

--
Tomas Szepe <[email protected]>
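
For reference, the chunk_size test being discussed does accept 1 (since
1 & 0 == 0), which is consistent with 2 ^ 0 = 1 above. A quick user-space
check of the test, for illustration only:

#include <stdio.h>

/* Same expression as in the dm-stripe.c patch, phrased as a predicate. */
static int is_power_of_two(unsigned int x)
{
        return x && !(x & (x - 1));
}

int main(void)
{
        unsigned int v[] = { 0, 1, 2, 3, 4, 6, 64 };
        unsigned int i;

        for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
                printf("%u: %s\n", v[i],
                       is_power_of_two(v[i]) ? "power of two" : "rejected");
        return 0;	/* prints: 0 rejected, 1/2/4/64 power of two, 3/6 rejected */
}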

2002-12-16 16:58:17

by Linus Torvalds

Subject: Re: 1/19


Thanks for doing the split, great work.

For future reference, it would help if the subject line also had a short
description, something more like

Subject: [PATCH 1/19] dm: move ioctl numbers to sane place

but that's just a small technical detail and I'll make up my own "short
description" based on your longer in-email ones.

Linus