Hi,
Updated to work with 2.6.1-rc2. Changes since last version:
- Don't include jbd or xfs support, just ide + block layer bits. Makes
it easier to include, as no actual barriers will be generated.
- Failure case in ide should _clear_ barrier support, not reset it.
Bart, would you care to review the ide bits for sanity?
===== drivers/block/ll_rw_blk.c 1.223 vs edited =====
--- 1.223/drivers/block/ll_rw_blk.c Thu Jan 1 11:44:04 2004
+++ edited/drivers/block/ll_rw_blk.c Wed Jan 7 14:39:11 2004
@@ -245,6 +245,28 @@
EXPORT_SYMBOL(blk_queue_make_request);
/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q: the request queue
+ * @flag: see below
+ *
+ * Description:
+ * For journalled file systems, doing ordered writes on a commit
+ * block instead of explicitly doing wait_on_buffer (which is bad
+ * for performance) can be a big win. Block drivers supporting this
+ * feature should call this function and indicate so.
+ *
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+ if (flag)
+ set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+ else
+ clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags);
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
* @dma_addr: bus address limit
@@ -1831,6 +1853,8 @@
if (unlikely(!q))
return;
+
+ WARN_ON(!req->ref_count);
if (unlikely(--req->ref_count))
return;
@@ -1997,7 +2021,7 @@
static int __make_request(request_queue_t *q, struct bio *bio)
{
struct request *req, *freereq = NULL;
- int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+ int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra, err;
sector_t sector;
sector = bio->bi_sector;
@@ -2015,7 +2039,11 @@
spin_lock_prefetch(q->queue_lock);
- barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
+ barrier = bio_barrier(bio);
+ if (barrier && !test_bit(QUEUE_FLAG_ORDERED, &q->queue_flags)) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
@@ -2097,6 +2125,7 @@
/*
* READA bit set
*/
+ err = -EWOULDBLOCK;
if (ra)
goto end_io;
@@ -2152,7 +2181,7 @@
return 0;
end_io:
- bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+ bio_endio(bio, nr_sectors << 9, err);
return 0;
}
===== drivers/ide/ide-disk.c 1.62 vs edited =====
--- 1.62/drivers/ide/ide-disk.c Fri Sep 5 14:36:36 2003
+++ edited/drivers/ide/ide-disk.c Wed Jan 7 14:40:06 2004
@@ -1371,6 +1371,7 @@
static int write_cache (ide_drive_t *drive, int arg)
{
ide_task_t args;
+ int err;
if (!(drive->id->cfs_enable_2 & 0x3000))
return 1;
@@ -1380,7 +1381,10 @@
SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE;
args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES;
args.command_type = ide_cmd_type_parser(&args);
- (void) ide_raw_taskfile(drive, &args, NULL);
+
+ err = ide_raw_taskfile(drive, &args, NULL);
+ if (err)
+ return err;
drive->wcache = arg;
return 0;
@@ -1755,7 +1759,7 @@
drive->wcache = 0;
/* Cache enabled ? */
if (drive->id->csfo & 1)
- drive->wcache = 1;
+ drive->wcache = 1;
/* Cache command set available ? */
if (drive->id->cfs_enable_1 & (1<<5))
drive->wcache = 1;
===== drivers/ide/ide-io.c 1.21 vs edited =====
--- 1.21/drivers/ide/ide-io.c Thu Jan 1 11:49:12 2004
+++ edited/drivers/ide/ide-io.c Wed Jan 7 14:42:26 2004
@@ -54,30 +54,63 @@
#include <asm/io.h>
#include <asm/bitops.h>
-/**
- * ide_end_request - complete an IDE I/O
- * @drive: IDE device for the I/O
- * @uptodate:
- * @nr_sectors: number of sectors completed
- *
- * This is our end_request wrapper function. We complete the I/O
- * update random number input and dequeue the request, which if
- * it was tagged may be out of order.
+/*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
*/
-
-int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
+ struct request *rq, int post)
{
- struct request *rq;
- unsigned long flags;
- int ret = 1;
+ struct request *flush_rq = &HWGROUP(drive)->wrq;
- spin_lock_irqsave(&ide_lock, flags);
- rq = HWGROUP(drive)->rq;
+ /*
+ * write cache disabled, just return barrier write immediately
+ */
+ if (!drive->wcache)
+ return rq;
- BUG_ON(!(rq->flags & REQ_STARTED));
+ /*
+ * if last rq issued was the post-flush, we can skip the pre-flush
+ */
+ if (drive->last_rq_flush) {
+ rq->flags |= REQ_BAR_PREFLUSH;
+ return rq;
+ }
- if (!nr_sectors)
- nr_sectors = rq->hard_cur_sectors;
+ blkdev_dequeue_request(rq);
+
+ memset(drive->special_buf, 0, sizeof(drive->special_buf));
+
+ ide_init_drive_cmd(flush_rq);
+
+ flush_rq->flags = REQ_DRIVE_TASK;
+ flush_rq->buffer = drive->special_buf;
+ flush_rq->special = rq;
+ flush_rq->buffer[0] = WIN_FLUSH_CACHE;
+ flush_rq->nr_sectors = rq->nr_sectors;
+
+ if (drive->id->cfs_enable_2 & 0x2400)
+ flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
+
+ if (!post) {
+ drive->doing_barrier = 1;
+ flush_rq->flags |= REQ_BAR_PREFLUSH;
+ } else
+ flush_rq->flags |= REQ_BAR_POSTFLUSH;
+
+ flush_rq->flags |= REQ_STARTED;
+ flush_rq->timeout = jiffies;
+ list_add(&flush_rq->queuelist, &drive->queue->queue_head);
+ HWGROUP(drive)->rq = NULL;
+ return flush_rq;
+}
+
+static int __ide_end_request(ide_drive_t *drive, struct request *rq,
+ int uptodate, int nr_sectors)
+{
+ int ret = 1;
+
+ BUG_ON(!(rq->flags & REQ_STARTED));
/*
* if failfast is set on a request, override number of sectors and
@@ -97,14 +130,55 @@
if (!end_that_request_first(rq, uptodate, nr_sectors)) {
add_disk_randomness(rq->rq_disk);
- if (!blk_rq_tagged(rq))
- blkdev_dequeue_request(rq);
- else
+
+ if (blk_rq_tagged(rq))
blk_queue_end_tag(drive->queue, rq);
- HWGROUP(drive)->rq = NULL;
+ else if (!blk_barrier_rq(rq))
+ blkdev_dequeue_request(rq);
+
end_that_request_last(rq);
+ HWGROUP(drive)->rq = NULL;
ret = 0;
}
+
+ return ret;
+}
+
+/**
+ * ide_end_request - complete an IDE I/O
+ * @drive: IDE device for the I/O
+ * @uptodate:
+ * @nr_sectors: number of sectors completed
+ *
+ * This is our end_request wrapper function. We complete the I/O
+ * update random number input and dequeue the request, which if
+ * it was tagged may be out of order.
+ */
+
+int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
+{
+ struct request *rq;
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&ide_lock, flags);
+ rq = HWGROUP(drive)->rq;
+
+ if (!nr_sectors)
+ nr_sectors = rq->hard_cur_sectors;
+
+ if (!blk_barrier_rq(rq))
+ ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
+ else {
+ struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+ flush_rq->nr_sectors -= nr_sectors;
+ if (!flush_rq->nr_sectors) {
+ ide_queue_flush_cmd(drive, rq, 1);
+ ret = 0;
+ }
+ }
+
spin_unlock_irqrestore(&ide_lock, flags);
return ret;
}
@@ -140,6 +214,97 @@
spin_unlock_irqrestore(&ide_lock, flags);
}
+/*
+ * FIXME: probably move this somewhere else, name is bad too :)
+ */
+static sector_t ide_get_error_location(ide_drive_t *drive, char *args)
+{
+ u32 high, low;
+ u8 hcyl, lcyl, sect;
+ sector_t sector;
+
+ high = 0;
+ hcyl = args[5];
+ lcyl = args[4];
+ sect = args[3];
+
+ if (drive->id->cfs_enable_2 & 0x2400) {
+ low = (hcyl << 16) | (lcyl << 8) | sect;
+ HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG);
+ high = ide_read_24(drive);
+ } else {
+ u8 cur = HWIF(drive)->INB(IDE_SELECT_REG);
+ if (cur & 0x40)
+ low = (hcyl << 16) | (lcyl << 8) | sect;
+ else {
+ low = hcyl * drive->head * drive->sect;
+ low += lcyl * drive->sect;
+ low += sect - 1;
+ }
+ }
+
+ sector = ((sector_t) high << 24) | low;
+ return sector;
+}
+
+static void ide_complete_barrier(ide_drive_t *drive, struct request *rq,
+ int error)
+{
+ struct request *real_rq = rq->special;
+ int good_sectors, bad_sectors;
+ sector_t sector;
+
+ if (!error) {
+ if (blk_barrier_postflush(rq)) {
+ /*
+ * this completes the barrier write
+ */
+ __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors);
+ drive->doing_barrier = 0;
+ drive->last_rq_flush = 1;
+ } else {
+ /*
+ * just indicate that we did the pre flush
+ */
+ real_rq->flags |= REQ_BAR_PREFLUSH;
+ __elv_add_request(drive->queue, real_rq, ELEVATOR_INSERT_FRONT, 0);
+ }
+
+#ifdef IDE_DUMP_FLUSH_TIMINGS
+ printk("%s: %sflush took %lu jiffies\n", drive->name, blk_barrier_postflush(rq) ? "post" : "pre", jiffies - rq->timeout);
+#endif
+
+ /*
+ * all is fine, return
+ */
+ return;
+ }
+
+ /*
+ * bummer, flush failed. if it was the pre-flush, fail the barrier.
+ * if it was the post-flush, complete the succesful part of the request
+ * and fail the rest
+ */
+ good_sectors = 0;
+ if (blk_barrier_postflush(rq)) {
+ sector = ide_get_error_location(drive, rq->buffer);
+
+ if ((sector >= real_rq->hard_sector) &&
+ (sector < real_rq->hard_sector + real_rq->hard_nr_sectors))
+ good_sectors = sector - real_rq->hard_sector;
+ } else
+ sector = real_rq->hard_sector;
+
+ bad_sectors = real_rq->hard_nr_sectors - good_sectors;
+ if (good_sectors)
+ __ide_end_request(drive, real_rq, 1, good_sectors);
+ if (bad_sectors)
+ __ide_end_request(drive, real_rq, 0, bad_sectors);
+
+ printk(KERN_ERR "%s: failed barrier write: sector=%Lx(good=%d/bad=%d)\n", drive->name, sector, good_sectors, bad_sectors);
+ blk_queue_ordered(drive->queue, 0);
+}
+
/**
* ide_end_drive_cmd - end an explicit drive command
* @drive: command
@@ -229,6 +394,10 @@
spin_lock_irqsave(&ide_lock, flags);
blkdev_dequeue_request(rq);
+
+ if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq))
+ ide_complete_barrier(drive, rq, err);
+
HWGROUP(drive)->rq = NULL;
end_that_request_last(rq);
spin_unlock_irqrestore(&ide_lock, flags);
@@ -717,6 +886,15 @@
repeat:
best = NULL;
drive = hwgroup->drive;
+
+ /*
+ * drive is doing pre-flush, ordered write, post-flush sequence. even
+ * though that is 3 requests, it must be seen as a single transaction.
+ * we must not preempt this drive until that is complete
+ */
+ if (drive->doing_barrier)
+ return drive;
+
do {
if ((!drive->sleep || time_after_eq(jiffies, drive->sleep))
&& !elv_queue_empty(drive->queue)) {
@@ -884,6 +1062,15 @@
}
/*
+ * if rq is a barrier write, issue pre cache flush if not
+ * already done
+ */
+ if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq))
+ rq = ide_queue_flush_cmd(drive, rq, 0);
+
+ drive->last_rq_flush = 0;
+
+ /*
* Sanity: don't accept a request that isn't a PM request
* if we are currently power managed. This is very important as
* blk_stop_queue() doesn't prevent the elv_next_request()
@@ -902,6 +1089,10 @@
break;
}
+ /*
+ * we can only queue read-write requests, so let the drive
+ * queue drain before continuing with this command.
+ */
if (!rq->bio && ata_pending_commands(drive))
break;
@@ -1307,6 +1498,7 @@
{
memset(rq, 0, sizeof(*rq));
rq->flags = REQ_DRIVE_CMD;
+ rq->ref_count = 1;
}
EXPORT_SYMBOL(ide_init_drive_cmd);
===== drivers/md/raid1.c 1.75 vs edited =====
--- 1.75/drivers/md/raid1.c Tue Dec 30 09:43:50 2003
+++ edited/drivers/md/raid1.c Wed Jan 7 14:38:34 2004
@@ -889,7 +889,7 @@
conf = mddev_to_conf(mddev);
bio = r1_bio->master_bio;
switch(r1_bio->cmd) {
- case SPECIAL:
+ case WRITESYNC:
sync_request_write(mddev, r1_bio);
break;
case READ:
@@ -999,7 +999,7 @@
r1_bio->mddev = mddev;
r1_bio->sector = sector_nr;
- r1_bio->cmd = SPECIAL;
+ r1_bio->cmd = WRITESYNC;
r1_bio->read_disk = disk;
bio = r1_bio->master_bio;
===== fs/buffer.c 1.216 vs edited =====
--- 1.216/fs/buffer.c Tue Dec 30 09:41:48 2003
+++ edited/fs/buffer.c Wed Jan 7 14:38:34 2004
@@ -2665,6 +2665,9 @@
if (rw == READ && buffer_dirty(bh))
buffer_error();
+ if (buffer_ordered(bh) && (rw == WRITE))
+ rw = WRITESYNC;
+
/* Only clear out a write error when rewriting */
if (test_set_buffer_req(bh) && rw == WRITE)
clear_buffer_write_io_error(bh);
===== include/linux/blkdev.h 1.130 vs edited =====
--- 1.130/include/linux/blkdev.h Thu Jan 1 11:44:04 2004
+++ edited/include/linux/blkdev.h Wed Jan 7 14:38:34 2004
@@ -195,6 +195,8 @@
__REQ_PM_SUSPEND, /* suspend request */
__REQ_PM_RESUME, /* resume request */
__REQ_PM_SHUTDOWN, /* shutdown request */
+ __REQ_BAR_PREFLUSH, /* barrier pre-flush done */
+ __REQ_BAR_POSTFLUSH, /* barrier post-flush */
__REQ_NR_BITS, /* stops here */
};
@@ -220,6 +222,8 @@
#define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND)
#define REQ_PM_RESUME (1 << __REQ_PM_RESUME)
#define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN)
+#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH)
+#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH)
/*
* State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -369,6 +373,7 @@
#define QUEUE_FLAG_READFULL 3 /* write queue has been filled */
#define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */
#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
+#define QUEUE_FLAG_ORDERED 6 /* supports ordered writes */
#define blk_queue_plugged(q) !list_empty(&(q)->plug_list)
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -381,6 +386,10 @@
#define blk_pm_request(rq) \
((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME))
+#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER)
+#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH)
+#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH)
+
#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
#define rq_data_dir(rq) ((rq)->flags & 1)
@@ -563,6 +572,7 @@
extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
extern void blk_queue_dma_alignment(request_queue_t *, int);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern void blk_queue_ordered(request_queue_t *, int);
extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
===== include/linux/buffer_head.h 1.44 vs edited =====
--- 1.44/include/linux/buffer_head.h Tue Aug 19 07:30:30 2003
+++ edited/include/linux/buffer_head.h Wed Jan 7 14:38:34 2004
@@ -26,6 +26,7 @@
BH_Delay, /* Buffer is not yet allocated on disk */
BH_Boundary, /* Block is followed by a discontiguity */
BH_Write_EIO, /* I/O error on write */
+ BH_Ordered, /* ordered write */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -117,7 +118,8 @@
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
#define touch_buffer(bh) mark_page_accessed(bh->b_page)
===== include/linux/fs.h 1.276 vs edited =====
--- 1.276/include/linux/fs.h Mon Dec 29 22:37:20 2003
+++ edited/include/linux/fs.h Wed Jan 7 14:38:34 2004
@@ -81,7 +81,7 @@
#define READ 0
#define WRITE 1
#define READA 2 /* read-ahead - don't block if no resources */
-#define SPECIAL 4 /* For non-blockdevice requests in request queue */
+#define WRITESYNC ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
#define SEL_IN 1
#define SEL_OUT 2
===== include/linux/ide.h 1.79 vs edited =====
--- 1.79/include/linux/ide.h Thu Jan 1 11:49:12 2004
+++ edited/include/linux/ide.h Wed Jan 7 14:38:35 2004
@@ -754,6 +754,8 @@
u8 bios_head; /* BIOS/fdisk/LILO number of heads */
u8 bios_sect; /* BIOS/fdisk/LILO sectors per track */
u8 queue_depth; /* max queue depth */
+ u8 doing_barrier; /* state, 1=currently doing flush */
+ u8 last_rq_flush; /* last rq was a flush */
unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */
unsigned int cyl; /* "real" number of cyls */
@@ -768,6 +770,7 @@
int forced_lun; /* if hdxlun was given at boot */
int lun; /* logical unit */
int crc_count; /* crc counter to reduce drive speed */
+ char special_buf[8]; /* private command buffer */
struct list_head list;
struct device gendev;
struct semaphore gendev_rel_sem; /* to deal with device release() */
--
Jens Axboe
On Wednesday 07 of January 2004 14:43, Jens Axboe wrote:
> Bart, would you care to review the ide bits for sanity?
Yep, here is just a first sight...
> + struct request *flush_rq = &HWGROUP(drive)->wrq;
I want to remove drive->wrq in the future.
> + memset(drive->special_buf, 0, sizeof(drive->special_buf));
> +
> + ide_init_drive_cmd(flush_rq);
> +
> + flush_rq->flags = REQ_DRIVE_TASK;
> + flush_rq->buffer = drive->special_buf;
> + flush_rq->special = rq;
> + flush_rq->buffer[0] = WIN_FLUSH_CACHE;
> + flush_rq->nr_sectors = rq->nr_sectors;
I think you should try use REQ_DRIVE_TASKFILE,
instead of adding drive->special_buf.
> +/*
> + * FIXME: probably move this somewhere else, name is bad too :)
> + */
> +static sector_t ide_get_error_location(ide_drive_t *drive, char *args)
This is probably useful in few other places.
--bart
On Wed, Jan 07 2004, Bartlomiej Zolnierkiewicz wrote:
> On Wednesday 07 of January 2004 14:43, Jens Axboe wrote:
> > Bart, would you care to review the ide bits for sanity?
>
> Yep, here is just a first sight...
>
> > + struct request *flush_rq = &HWGROUP(drive)->wrq;
>
> I want to remove drive->wrq in the future.
Yes I've wanted that, too, but only because of the ugly (and often racy)
multimode crap. I'm just considering wrq a 'reserve rq' to be used where
you cannot reliably get a new request atomically.
I'm open to any better ideas you have for this...
> > + memset(drive->special_buf, 0, sizeof(drive->special_buf));
> > +
> > + ide_init_drive_cmd(flush_rq);
> > +
> > + flush_rq->flags = REQ_DRIVE_TASK;
> > + flush_rq->buffer = drive->special_buf;
> > + flush_rq->special = rq;
> > + flush_rq->buffer[0] = WIN_FLUSH_CACHE;
> > + flush_rq->nr_sectors = rq->nr_sectors;
>
> I think you should try use REQ_DRIVE_TASKFILE,
> instead of adding drive->special_buf.
How does that change anything? I still need a command buffer, if I use
REQ_DRIVE_TASKFILE I need an even bigger one.
> > +/*
> > + * FIXME: probably move this somewhere else, name is bad too :)
> > + */
> > +static sector_t ide_get_error_location(ide_drive_t *drive, char *args)
>
> This is probably useful in few other places.
Yeah, as the comment states I know it's not really in the right place.
Where do you want it? And any suggestions for a better name? :)
--
Jens Axboe
> + char special_buf[8]; /* private command buffer */
Why not put that in struct drive instead ? You must be a bit more
careful on the lifetime, but it's less bloat, there are much less
instances of struct drive than struct request :)
Ben.
On Fri, Jan 09 2004, Benjamin Herrenschmidt wrote:
>
> > + char special_buf[8]; /* private command buffer */
>
> Why not put that in struct drive instead ? You must be a bit more
> careful on the lifetime, but it's less bloat, there are much less
> instances of struct drive than struct request :)
Hmm? It _is_ in struct drive :-). It could be put in hwgroup actually,
but I felt it was cleaner in drive (and easier to manage).
--
Jens Axboe
> Hmm? It _is_ in struct drive :-). It could be put in hwgroup actually,
> but I felt it was cleaner in drive (and easier to manage).
Oops :) I misread the patch, or was on crack or something... sorry :)
Ben.