2004-03-10 12:46:43

by Jens Axboe

Subject: [PATCH] backing dev unplugging

Hi,

Here's a first cut at killing global plugging of block devices to reduce
the nasty contention blk_plug_lock caused. This introduces per-queue
plugging, controlled by the backing_dev_info. Observations:

- Most uses of blk_run_queues() without a specific context were bogus.
Usually the act of kicking the targets in question should be (and
already is) performed by other activities, such as making the vm
flushing run to free memory.

- Some uses of blk_run_queues() really just want to kick the final queue
where the bio goes. I've added bio_sync() (BIO_RW_SYNC) to manage
those; if the queue needs unplugging, we'll do it while already
holding the queue's lock.

- The dm bit needs careful examination and checking by Joe. It could be
more clever and maintain plug state for each target; I just added a
dm_table unplug-all function.

Patch is against 2.6.4-rc2-mm1.
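
For illustration, here's a minimal sketch (not part of the patch) of the
bio_sync() idiom that replaces the old submit_bio() + blk_run_queues()
pair; bi_complete() is an assumed end_io helper, like the one used in the
md.c hunk below:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

/*
 * Sketch only: submit a bio marked BIO_RW_SYNC and wait for it.
 * The sync bit makes __make_request() unplug the target queue at
 * submit time, so no global blk_run_queues() call is needed.
 */
static int submit_bio_and_wait(struct bio *bio, int rw)
{
	struct completion event;

	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;	/* assumed: completes &event */

	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
	wait_for_completion(&event);

	return test_bit(BIO_UPTODATE, &bio->bi_flags);
}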

diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/block/ll_rw_blk.c linux-2.6.4-rc2-mm1-plug/drivers/block/ll_rw_blk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/block/ll_rw_blk.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/block/ll_rw_blk.c 2004-03-10 13:30:51.438072138 +0100
@@ -42,12 +42,6 @@
*/
static kmem_cache_t *request_cachep;

-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
static wait_queue_head_t congestion_wqh[2];

/*
@@ -247,8 +241,6 @@
*/
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

- INIT_LIST_HEAD(&q->plug_list);
-
blk_queue_activity_fn(q, NULL, NULL);
}

@@ -1100,13 +1092,11 @@
* don't plug a stopped queue, it must be paired with blk_start_queue()
* which will restart the queueing
*/
- if (!blk_queue_plugged(q)
- && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
- spin_lock(&blk_plug_lock);
- list_add_tail(&q->plug_list, &blk_plug_list);
+ if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+ return;
+
+ if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
- spin_unlock(&blk_plug_lock);
- }
}

EXPORT_SYMBOL(blk_plug_device);
@@ -1118,15 +1108,12 @@
int blk_remove_plug(request_queue_t *q)
{
WARN_ON(!irqs_disabled());
- if (blk_queue_plugged(q)) {
- spin_lock(&blk_plug_lock);
- list_del_init(&q->plug_list);
- del_timer(&q->unplug_timer);
- spin_unlock(&blk_plug_lock);
- return 1;
- }

- return 0;
+ if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+ return 0;
+
+ del_timer(&q->unplug_timer);
+ return 1;
}

EXPORT_SYMBOL(blk_remove_plug);
@@ -1157,14 +1144,11 @@
* Linux uses plugging to build bigger requests queues before letting
* the device have at them. If a queue is plugged, the I/O scheduler
* is still adding and merging requests on the queue. Once the queue
- * gets unplugged (either by manually calling this function, or by
- * calling blk_run_queues()), the request_fn defined for the
- * queue is invoked and transfers started.
+ * gets unplugged, the request_fn defined for the queue is invoked and
+ * transfers started.
**/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
{
- request_queue_t *q = data;
-
spin_lock_irq(q->queue_lock);
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
@@ -1172,9 +1156,23 @@

EXPORT_SYMBOL(generic_unplug_device);

+void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+ request_queue_t *q = bdi->unplug_io_data;
+
+ /*
+ * devices don't necessarily have an ->unplug_fn defined
+ */
+ if (q->unplug_fn)
+ q->unplug_fn(q);
+}
+
+EXPORT_SYMBOL(blk_backing_dev_unplug);
+
static void blk_unplug_work(void *data)
{
request_queue_t *q = data;
+
q->unplug_fn(q);
}

@@ -1241,41 +1239,13 @@

EXPORT_SYMBOL(blk_run_queue);

-/**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- * Start I/O on all plugged queues known to the block layer. Queues that
- * are currently stopped are ignored. This is equivalent to the older
- * tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
+void blk_run_backing_dev(struct backing_dev_info *bdi)
{
- LIST_HEAD(local_plug_list);
-
- spin_lock_irq(&blk_plug_lock);
-
- /*
- * this will happen fairly often
- */
- if (list_empty(&blk_plug_list))
- goto out;
-
- list_splice_init(&blk_plug_list, &local_plug_list);
-
- while (!list_empty(&local_plug_list)) {
- request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
- spin_unlock_irq(&blk_plug_lock);
- q->unplug_fn(q);
- spin_lock_irq(&blk_plug_lock);
- }
-out:
- spin_unlock_irq(&blk_plug_lock);
+ if (bdi)
+ bdi->unplug_io_fn(bdi);
}

-EXPORT_SYMBOL(blk_run_queues);
+EXPORT_SYMBOL(blk_run_backing_dev);

/**
* blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
@@ -1382,6 +1352,10 @@
memset(q, 0, sizeof(*q));
init_timer(&q->unplug_timer);
atomic_set(&q->refcnt, 1);
+
+ q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+ q->backing_dev_info.unplug_io_data = q;
+
return q;
}

@@ -2036,7 +2010,6 @@
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[rw];

- blk_run_queues();
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
@@ -2294,7 +2267,7 @@
if (blk_queue_plugged(q)) {
int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];

- if (nr_queued == q->unplug_thresh)
+ if (nr_queued == q->unplug_thresh || bio_sync(bio))
__generic_unplug_device(q);
}
spin_unlock_irq(q->queue_lock);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm.c 2004-03-10 11:26:35.000000000 +0100
@@ -560,6 +560,13 @@
return 0;
}

+static void dm_unplug_all(request_queue_t *q)
+{
+ struct mapped_device *md = q->queuedata;
+
+ dm_table_unplug_all(md->map);
+}
+
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r;
@@ -657,6 +664,7 @@
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
+ md->queue->unplug_fn = dm_unplug_all;

md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
mempool_free_slab, _io_cache);
@@ -880,7 +888,7 @@
* Then we wait for the already mapped ios to
* complete.
*/
- blk_run_queues();
+ dm_table_unplug_all(md->map);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);

@@ -921,7 +929,7 @@
__flush_deferred_io(md, def);
up_write(&md->lock);

- blk_run_queues();
+ dm_table_unplug_all(md->map);

return 0;
}
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c 2004-03-09 15:27:36.000000000 +0100
@@ -668,7 +668,7 @@

/* out of memory -> run queues */
if (remaining)
- blk_run_queues();
+ blk_congestion_wait(bio_data_dir(clone), HZ/100);
}

/* drop reference, clones could have returned before we reach this */
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.h linux-2.6.4-rc2-mm1-plug/drivers/md/dm.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm.h 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm.h 2004-03-10 11:18:48.000000000 +0100
@@ -116,6 +116,7 @@
void dm_table_suspend_targets(struct dm_table *t);
void dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);

/*-----------------------------------------------------------------
* A registry of target types.
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-table.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-table.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-table.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-table.c 2004-03-10 13:42:26.244595842 +0100
@@ -882,6 +882,22 @@
return r;
}

+void dm_table_unplug_all(struct dm_table *t)
+{
+ struct list_head *d, *devices;
+
+ devices = dm_table_get_devices(t);
+ for (d = devices->next; d != devices; d = d->next) {
+ struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+ request_queue_t *q = bdev_get_queue(dd->bdev);
+
+ if (q->unplug_fn) {
+ set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+ q->unplug_fn(q);
+ }
+ }
+}
+
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/md.c linux-2.6.4-rc2-mm1-plug/drivers/md/md.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/md.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/md.c 2004-03-10 13:27:17.273517490 +0100
@@ -335,6 +335,8 @@
struct bio_vec vec;
struct completion event;

+ rw |= (1 << BIO_RW_SYNC);
+
bio_init(&bio);
bio.bi_io_vec = &vec;
vec.bv_page = page;
@@ -349,7 +351,6 @@
bio.bi_private = &event;
bio.bi_end_io = bi_complete;
submit_bio(rw, &bio);
- blk_run_queues();
wait_for_completion(&event);

return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -2718,7 +2719,7 @@
run = thread->run;
if (run) {
run(thread->mddev);
- blk_run_queues();
+ blk_run_queue(thread->mddev->queue);
}
if (signal_pending(current))
flush_signals(current);
@@ -3286,7 +3287,7 @@
test_bit(MD_RECOVERY_ERR, &mddev->recovery))
break;

- blk_run_queues();
+ blk_run_queue(mddev->queue);

repeat:
if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid1.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid1.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid1.c 2004-03-09 13:08:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid1.c 2004-03-10 12:14:46.000000000 +0100
@@ -451,6 +451,7 @@

static void device_barrier(conf_t *conf, sector_t sect)
{
+ blk_run_queue(conf->mddev->queue);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);

@@ -478,6 +479,7 @@
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
+ blk_run_queue(conf->mddev->queue);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
conf->nr_pending++;
@@ -644,6 +646,7 @@

static void close_sync(conf_t *conf)
{
+ blk_run_queue(conf->mddev->queue);
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid5.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid5.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid5.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid5.c 2004-03-10 12:15:19.000000000 +0100
@@ -249,6 +249,7 @@
break;
if (!sh) {
conf->inactive_blocked = 1;
+ blk_run_queue(conf->mddev->queue);
wait_event_lock_irq(conf->wait_for_stripe,
!list_empty(&conf->inactive_list) &&
(atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@
}
}
}
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
{
- request_queue_t *q = data;
mddev_t *mddev = q->queuedata;
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long flags;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid6main.c linux-2.6.4-rc2-mm1-plug/drivers/md/raid6main.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/raid6main.c 2004-03-09 13:08:48.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/md/raid6main.c 2004-03-10 11:20:20.000000000 +0100
@@ -1454,9 +1454,8 @@
}
}
}
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
{
- request_queue_t *q = data;
mddev_t *mddev = q->queuedata;
raid6_conf_t *conf = mddev_to_conf(mddev);
unsigned long flags;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/mtd/devices/blkmtd.c linux-2.6.4-rc2-mm1-plug/drivers/mtd/devices/blkmtd.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/mtd/devices/blkmtd.c 2004-03-09 13:08:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/drivers/mtd/devices/blkmtd.c 2004-03-10 13:03:22.000000000 +0100
@@ -147,8 +147,7 @@
bio->bi_private = &event;
bio->bi_end_io = bi_read_complete;
if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
- submit_bio(READ, bio);
- blk_run_queues();
+ submit_bio(READ_SYNC, bio);
wait_for_completion(&event);
err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
bio_put(bio);
@@ -179,8 +178,7 @@
init_completion(&event);
bio->bi_private = &event;
bio->bi_end_io = bi_write_complete;
- submit_bio(WRITE, bio);
- blk_run_queues();
+ submit_bio(WRITE_SYNC, bio);
wait_for_completion(&event);
DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/buffer.c linux-2.6.4-rc2-mm1-plug/fs/buffer.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/buffer.c 2004-03-09 15:24:26.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/buffer.c 2004-03-10 11:15:33.000000000 +0100
@@ -132,7 +132,7 @@
do {
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
if (buffer_locked(bh)) {
- blk_run_queues();
+ blk_run_address_space(bh->b_bdev->bd_inode->i_mapping);
io_schedule();
}
} while (buffer_locked(bh));
@@ -491,7 +491,6 @@
pg_data_t *pgdat;

wakeup_bdflush(1024);
- blk_run_queues();
yield();

for_each_pgdat(pgdat) {
@@ -2928,7 +2927,7 @@

int block_sync_page(struct page *page)
{
- blk_run_queues();
+ blk_run_address_space(page->mapping);
return 0;
}

diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/direct-io.c linux-2.6.4-rc2-mm1-plug/fs/direct-io.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/direct-io.c 2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/direct-io.c 2004-03-10 10:50:48.000000000 +0100
@@ -364,7 +364,7 @@
if (dio->bio_list == NULL) {
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- blk_run_queues();
+ blk_run_address_space(dio->inode->i_mapping);
io_schedule();
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -1035,7 +1035,7 @@
if (ret == 0)
ret = dio->result;
finished_one_bio(dio); /* This can free the dio */
- blk_run_queues();
+ blk_run_address_space(inode->i_mapping);
if (should_wait) {
unsigned long flags;
/*
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/jfs/jfs_logmgr.c linux-2.6.4-rc2-mm1-plug/fs/jfs/jfs_logmgr.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/jfs/jfs_logmgr.c 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/jfs/jfs_logmgr.c 2004-03-10 13:03:47.000000000 +0100
@@ -1971,8 +1971,7 @@

bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- submit_bio(READ, bio);
- blk_run_queues();
+ submit_bio(READ_SYNC, bio);

wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));

@@ -2116,9 +2115,8 @@

/* check if journaling to disk has been disabled */
if (!log->no_integrity) {
- submit_bio(WRITE, bio);
+ submit_bio(WRITE_SYNC, bio);
INCREMENT(lmStat.submitted);
- blk_run_queues();
}
else {
bio->bi_size = 0;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/ntfs/compress.c linux-2.6.4-rc2-mm1-plug/fs/ntfs/compress.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/ntfs/compress.c 2004-02-18 04:57:58.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/ntfs/compress.c 2004-03-10 12:10:38.000000000 +0100
@@ -668,7 +668,7 @@
"uptodate! Unplugging the disk queue "
"and rescheduling.");
get_bh(tbh);
- blk_run_queues();
+ blk_run_address_space(mapping);
schedule();
put_bh(tbh);
if (unlikely(!buffer_uptodate(tbh)))
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/ufs/truncate.c linux-2.6.4-rc2-mm1-plug/fs/ufs/truncate.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/ufs/truncate.c 2004-02-18 04:57:57.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/ufs/truncate.c 2004-03-10 12:11:06.000000000 +0100
@@ -456,7 +456,7 @@
break;
if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
ufs_sync_inode (inode);
- blk_run_queues();
+ blk_run_address_space(inode->i_mapping);
yield();
}
offset = inode->i_size & uspi->s_fshift;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c 2004-03-10 13:13:49.000000000 +0100
@@ -1013,7 +1013,7 @@
{
PB_TRACE(pb, "lock", 0);
if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
+ blk_run_address_space(pb->pb_target->pbr_mapping);
down(&pb->pb_sema);
PB_SET_OWNER(pb);
PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@
if (atomic_read(&pb->pb_pin_count) == 0)
break;
if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
+ blk_run_address_space(pb->pb_target->pbr_mapping);
schedule();
}
remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@
if (pb->pb_flags & PBF_RUN_QUEUES) {
pb->pb_flags &= ~PBF_RUN_QUEUES;
if (atomic_read(&pb->pb_io_remaining) > 1)
- blk_run_queues();
+ blk_run_address_space(pb->pb_target->pbr_mapping);
}
}

@@ -1471,7 +1471,7 @@
{
PB_TRACE(pb, "iowait", 0);
if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
+ blk_run_address_space(pb->pb_target->pbr_mapping);
down(&pb->pb_iodonesema);
PB_TRACE(pb, "iowaited", (long)pb->pb_error);
return pb->pb_error;
@@ -1617,7 +1617,6 @@
pagebuf_daemon(
void *data)
{
- int count;
page_buf_t *pb;
struct list_head *curr, *next, tmp;

@@ -1640,7 +1639,6 @@

spin_lock(&pbd_delwrite_lock);

- count = 0;
list_for_each_safe(curr, next, &pbd_delwrite_queue) {
pb = list_entry(curr, page_buf_t, pb_list);

@@ -1657,7 +1655,7 @@
pb->pb_flags &= ~PBF_DELWRI;
pb->pb_flags |= PBF_WRITE;
list_move(&pb->pb_list, &tmp);
- count++;
+ blk_run_address_space(pb->pb_target->pbr_mapping);
}
}

@@ -1671,8 +1669,6 @@

if (as_list_len > 0)
purge_addresses();
- if (count)
- blk_run_queues();

force_flush = 0;
} while (pagebuf_daemon_active);
@@ -1734,13 +1730,11 @@
pagebuf_lock(pb);
pagebuf_iostrategy(pb);
if (++flush_cnt > 32) {
- blk_run_queues();
+ blk_run_address_space(pb->pb_target->pbr_mapping);
flush_cnt = 0;
}
}

- blk_run_queues();
-
while (!list_empty(&tmp)) {
pb = list_entry(tmp.next, page_buf_t, pb_list);

diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/backing-dev.h linux-2.6.4-rc2-mm1-plug/include/linux/backing-dev.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/backing-dev.h 2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/backing-dev.h 2004-03-10 11:06:45.000000000 +0100
@@ -28,6 +28,8 @@
int memory_backed; /* Cannot clean pages with writepage */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
+ void (*unplug_io_fn)(struct backing_dev_info *);
+ void *unplug_io_data;
};

extern struct backing_dev_info default_backing_dev_info;
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/bio.h linux-2.6.4-rc2-mm1-plug/include/linux/bio.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/bio.h 2004-02-18 04:59:06.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/bio.h 2004-03-10 13:01:45.000000000 +0100
@@ -124,6 +124,7 @@
#define BIO_RW_AHEAD 1
#define BIO_RW_BARRIER 2
#define BIO_RW_FAILFAST 3
+#define BIO_RW_SYNC 4

/*
* various member access, note that bio_data should of course not be used
@@ -138,6 +139,7 @@
#define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9)
#define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio)))
#define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))

/*
* will die
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/blkdev.h linux-2.6.4-rc2-mm1-plug/include/linux/blkdev.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/blkdev.h 2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/blkdev.h 2004-03-10 13:27:27.761319763 +0100
@@ -243,7 +243,7 @@
typedef void (request_fn_proc) (request_queue_t *q);
typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);

struct bio_vec;
typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@
unsigned long bounce_pfn;
int bounce_gfp;

- struct list_head plug_list;
-
/*
* various queue flags, see QUEUE_* below
*/
@@ -369,8 +367,9 @@
#define QUEUE_FLAG_READFULL 3 /* write queue has been filled */
#define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */
#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
+#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */

-#define blk_queue_plugged(q) !list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)

@@ -514,7 +513,8 @@
extern void blk_start_queue(request_queue_t *q);
extern void blk_stop_queue(request_queue_t *q);
extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
+extern void blk_run_backing_dev(struct backing_dev_info *);
extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int, int);
@@ -525,6 +525,12 @@
return bdev->bd_disk->queue;
}

+static inline void blk_run_address_space(struct address_space *mapping)
+{
+ if (mapping)
+ blk_run_backing_dev(mapping->backing_dev_info);
+}
+
/*
* end_request() and friends. Must be called with the request queue spinlock
* acquired. All functions called within end_request() _must_be_ atomic.
@@ -571,7 +577,7 @@

extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
extern long nr_blockdev_pages(void);

int blk_get_queue(request_queue_t *);
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/fs.h linux-2.6.4-rc2-mm1-plug/include/linux/fs.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/fs.h 2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/fs.h 2004-03-10 13:03:04.000000000 +0100
@@ -82,6 +82,8 @@
#define WRITE 1
#define READA 2 /* read-ahead - don't block if no resources */
#define SPECIAL 4 /* For non-blockdevice requests in request queue */
+#define READ_SYNC (READ | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))

#define SEL_IN 1
#define SEL_OUT 2
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/raid/md_k.h linux-2.6.4-rc2-mm1-plug/include/linux/raid/md_k.h
--- /opt/kernel/linux-2.6.4-rc2-mm1/include/linux/raid/md_k.h 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/include/linux/raid/md_k.h 2004-03-10 12:15:43.000000000 +0100
@@ -326,7 +326,6 @@
if (condition) \
break; \
spin_unlock_irq(&lock); \
- blk_run_queues(); \
schedule(); \
spin_lock_irq(&lock); \
} \
@@ -341,30 +340,5 @@
__wait_event_lock_irq(wq, condition, lock); \
} while (0)

-
-#define __wait_disk_event(wq, condition) \
-do { \
- wait_queue_t __wait; \
- init_waitqueue_entry(&__wait, current); \
- \
- add_wait_queue(&wq, &__wait); \
- for (;;) { \
- set_current_state(TASK_UNINTERRUPTIBLE); \
- if (condition) \
- break; \
- blk_run_queues(); \
- schedule(); \
- } \
- current->state = TASK_RUNNING; \
- remove_wait_queue(&wq, &__wait); \
-} while (0)
-
-#define wait_disk_event(wq, condition) \
-do { \
- if (condition) \
- break; \
- __wait_disk_event(wq, condition); \
-} while (0)
-
#endif

diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/disk.c linux-2.6.4-rc2-mm1-plug/kernel/power/disk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/disk.c 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/disk.c 2004-03-10 12:17:32.000000000 +0100
@@ -84,7 +84,6 @@
while (shrink_all_memory(10000))
printk(".");
printk("|\n");
- blk_run_queues();
}


diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/pmdisk.c linux-2.6.4-rc2-mm1-plug/kernel/power/pmdisk.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/pmdisk.c 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/pmdisk.c 2004-03-10 13:16:16.000000000 +0100
@@ -859,7 +859,6 @@

static void wait_io(void)
{
- blk_run_queues();
while(atomic_read(&io_done))
io_schedule();
}
@@ -895,6 +894,7 @@
goto Done;
}

if (rw == WRITE)
bio_set_pages_dirty(bio);
+ rw |= (1 << BIO_RW_SYNC);
start_io();
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/swsusp.c linux-2.6.4-rc2-mm1-plug/kernel/power/swsusp.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/kernel/power/swsusp.c 2004-03-09 13:08:30.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/kernel/power/swsusp.c 2004-03-10 12:17:52.000000000 +0100
@@ -707,11 +707,6 @@

free_some_memory();

- /* No need to invalidate any vfsmnt list --
- * they will be valid after resume, anyway.
- */
- blk_run_queues();
-
/* Save state of all device drivers, and stop them. */
if ((res = device_suspend(4))==0)
/* If stopping device drivers worked, we proceed basically into
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/mm/mempool.c linux-2.6.4-rc2-mm1-plug/mm/mempool.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/mm/mempool.c 2004-02-18 04:58:32.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/mm/mempool.c 2004-03-09 15:24:39.000000000 +0100
@@ -233,8 +233,6 @@
if (!(gfp_mask & __GFP_WAIT))
return NULL;

- blk_run_queues();
-
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
if (!pool->curr_nr)
io_schedule();
diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/mm/readahead.c linux-2.6.4-rc2-mm1-plug/mm/readahead.c
--- /opt/kernel/linux-2.6.4-rc2-mm1/mm/readahead.c 2004-03-09 13:08:49.000000000 +0100
+++ linux-2.6.4-rc2-mm1-plug/mm/readahead.c 2004-03-10 11:04:49.000000000 +0100
@@ -15,9 +15,14 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>

+static void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+
struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
+ .unplug_io_fn = default_unplug_io_fn,
};

EXPORT_SYMBOL_GPL(default_backing_dev_info);

--
Jens Axboe


2004-03-10 19:53:47

by Andrew Morton

Subject: Re: [PATCH] backing dev unplugging

Jens Axboe <[email protected]> wrote:
>
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused. This introduces per-queue
> plugging, controlled by the backing_dev_info.

This is such an improvement over what we have now it isn't funny.

Ken, the next -mm is starting to look like linux-3.1.0 so I think it
would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.

2004-03-10 20:04:21

by Chen, Kenneth W

Subject: RE: [PATCH] backing dev unplugging

Andrew Morton wrote on Wednesday, March 10, 2004 11:56 AM
> Jens Axboe <[email protected]> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused. This introduces per-queue
> > plugging, controlled by the backing_dev_info.
>
> This is such an improvement over what we have now it isn't funny.
>
> Ken, the next -mm is starting to look like linux-3.1.0 so I think it
> would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.

I'm planning a couple of experiments: one is just Jens's change on top of
what we have, so we can validate the backing dev unplug. Then we will run
2.6.4-rc2-mm1 + Jens's patch.

- Ken


2004-03-10 20:16:00

by Jeff Garzik

Subject: Re: [PATCH] backing dev unplugging

Jens Axboe wrote:
> Hi,
>
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused. This introduces per-queue
> plugging, controlled by the backing_dev_info. Observations:
>
> - Most uses of blk_run_queues() without a specific context were bogus.
> Usually the act of kicking the targets in question should be (and
> already is) performed by other activities, such as making the vm
> flushing run to free memory.
>
> - Some uses of blk_run_queues() really just want to kick the final queue
> where the bio goes. I've added bio_sync() (BIO_RW_SYNC) to manage
> those; if the queue needs unplugging, we'll do it while already
> holding the queue's lock.
>
> - The dm bit needs careful examination and checking by Joe. It could be
> more clever and maintain plug state for each target; I just added a
> dm_table unplug-all function.
>
> Patch is against 2.6.4-rc2-mm1.

I like it a lot.

Any chance some of these newly-shortened functions can become static
inline as well?


> @@ -1100,13 +1092,11 @@
> * don't plug a stopped queue, it must be paired with blk_start_queue()
> * which will restart the queueing
> */
> - if (!blk_queue_plugged(q)
> - && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
> - spin_lock(&blk_plug_lock);
> - list_add_tail(&q->plug_list, &blk_plug_list);
> + if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
> + return;
> +
> + if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
> - spin_unlock(&blk_plug_lock);
> - }
> }
>
> EXPORT_SYMBOL(blk_plug_device);
> @@ -1118,15 +1108,12 @@
> int blk_remove_plug(request_queue_t *q)
> {
> WARN_ON(!irqs_disabled());
> - if (blk_queue_plugged(q)) {
> - spin_lock(&blk_plug_lock);
> - list_del_init(&q->plug_list);
> - del_timer(&q->unplug_timer);
> - spin_unlock(&blk_plug_lock);
> - return 1;
> - }
>
> - return 0;
> + if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> + return 0;
> +
> + del_timer(&q->unplug_timer);
> + return 1;
> }
>
> EXPORT_SYMBOL(blk_remove_plug);

I tend to like

if (test_and_clear_bit())
call_out_of_line_function()

style for small functions like this, and inline those.
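
Something like this sketch, say (the out-of-line __blk_remove_plug()
helper here is hypothetical):

/* sketch: inline the flag test, keep the timer work out of line */
static inline int blk_remove_plug(request_queue_t *q)
{
	WARN_ON(!irqs_disabled());

	if (test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
		return __blk_remove_plug(q);	/* hypothetical helper */

	return 0;
}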

Jeff



2004-03-10 20:17:48

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Andrew Morton wrote:
> Jens Axboe <[email protected]> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused. This introduces per-queue
> > plugging, controlled by the backing_dev_info.
>
> This is such an improvement over what we have now it isn't funny.

It is pretty scary, once I dove into it...

--
Jens Axboe

2004-03-10 20:21:09

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Kenneth Chen wrote:
> Andrew Morton wrote on Wednesday, March 10, 2004 11:56 AM
> > Jens Axboe <[email protected]> wrote:
> > >
> > > Here's a first cut at killing global plugging of block devices to reduce
> > > the nasty contention blk_plug_lock caused. This introduces per-queue
> > > plugging, controlled by the backing_dev_info.
> >
> > This is such an improvement over what we have now it isn't funny.
> >
> > Ken, the next -mm is starting to look like linux-3.1.0 so I think it
> > would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.
>
> I'm planning a couple of experiments: one is just Jens's change on top of
> what we have, so we can validate the backing dev unplug. Then we will run
> 2.6.4-rc2-mm1 + Jens's patch.

It'll be hard to apply the patch against anything but -mm, since it
builds on (or at least will conflict with) other changes in there. I
deliberately made an -mm version this time, though I usually make -linus
versions and adapt if necessary, so that Andrew can get some testing on this.

--
Jens Axboe

2004-03-10 20:22:27

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Jeff Garzik wrote:
> Jens Axboe wrote:
> >Hi,
> >
> >Here's a first cut at killing global plugging of block devices to reduce
> >the nasty contention blk_plug_lock caused. This introduces per-queue
> >plugging, controlled by the backing_dev_info. Observations:
> >
> >- Most uses of blk_run_queues() without a specific context were bogus.
> > Usually the act of kicking the targets in question should be (and
> > already is) performed by other activities, such as making the vm
> > flushing run to free memory.
> >
> >- Some uses of blk_run_queues() really just want to kick the final queue
> > where the bio goes. I've added bio_sync() (BIO_RW_SYNC) to manage
> > those; if the queue needs unplugging, we'll do it while already
> > holding the queue's lock.
> >
> >- The dm bit needs careful examination and checking by Joe. It could be
> > more clever and maintain plug state for each target; I just added a
> > dm_table unplug-all function.
> >
> >Patch is against 2.6.4-rc2-mm1.
>
> I like it a lot.

Thanks

> Any chance some of these newly-shortened functions can become static
> inline as well?

Yeah most likely. Remember this is just a first version, there's room
for a little cleanup here and there for sure.

> >@@ -1100,13 +1092,11 @@
> > * don't plug a stopped queue, it must be paired with blk_start_queue()
> > * which will restart the queueing
> > */
> >- if (!blk_queue_plugged(q)
> >- && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
> >- spin_lock(&blk_plug_lock);
> >- list_add_tail(&q->plug_list, &blk_plug_list);
> >+ if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
> >+ return;
> >+
> >+ if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> > mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
> >- spin_unlock(&blk_plug_lock);
> >- }
> > }
> >
> > EXPORT_SYMBOL(blk_plug_device);
> >@@ -1118,15 +1108,12 @@
> > int blk_remove_plug(request_queue_t *q)
> > {
> > WARN_ON(!irqs_disabled());
> >- if (blk_queue_plugged(q)) {
> >- spin_lock(&blk_plug_lock);
> >- list_del_init(&q->plug_list);
> >- del_timer(&q->unplug_timer);
> >- spin_unlock(&blk_plug_lock);
> >- return 1;
> >- }
> >
> >- return 0;
> >+ if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
> >+ return 0;
> >+
> >+ del_timer(&q->unplug_timer);
> >+ return 1;
> > }
> >
> > EXPORT_SYMBOL(blk_remove_plug);
>
> I tend to like
>
> if (test_and_clear_bit())
> call_out_of_line_function()
>
> style for small functions like this, and inline those.

I like to do

if (expected condition)
do_stuff

slow_path

--
Jens Axboe

2004-03-10 20:46:58

by Jesse Barnes

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10, 2004 at 09:20:25PM +0100, Jens Axboe wrote:
> It'll be hard to apply the patch against anything but -mm, since it
> builds on (or at least will conflict with) other changes in there. I
> deliberately made an -mm version this time, though I usually make -linus
> versions and adapt if necessary, so that Andrew can get some testing on this.

I tried applying it to a BK tree (fresh as of a couple of hours ago),
ignored the dm-related changes, and it seemed to work OK after I fixed
up rejects in direct-io.c and blkdev.h (which didn't seem hard, unless
I'm missing something):

10 qla2200 fc controllers, 64 cpus, 112 disks

stock BK tree: ~43945 I/Os per second
w/Jens' patch: ~47149 I/Os per second

Jesse

2004-03-10 20:49:46

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Jesse Barnes wrote:
> On Wed, Mar 10, 2004 at 09:20:25PM +0100, Jens Axboe wrote:
> > It'll be hard to apply the patch against anything but -mm, since it
> > builds on (or at least will conflict with) other changes in there. I
> > deliberately made an -mm version this time, though I usually make -linus
> > versions and adapt if necessary, so that Andrew can get some testing on this.
>
> I tried applying it to a BK tree (fresh as of a couple of hours ago),
> ignored the dm-related changes, and it seemed to work OK after I fixed
> up rejects in direct-io.c and blkdev.h (which didn't seem hard, unless
> I'm missing something):
>
> 10 qla2200 fc controllers, 64 cpus, 112 disks
>
> stock BK tree: ~43945 I/Os per second
> w/Jens' patch: ~47149 I/Os per second

Do you have a profile for both runs (what was the workload, btw)?

--
Jens Axboe

2004-03-10 21:00:33

by Andrew Morton

Subject: Re: [PATCH] backing dev unplugging

Jens Axboe <[email protected]> wrote:
>
> Here's a first cut at killing global plugging of block devices to reduce
> the nasty contention blk_plug_lock caused.

Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?

If so, what are the lock ranking issues here? The queue lock is not held
yet, so it seems pretty simple?

2004-03-10 21:04:39

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Andrew Morton wrote:
> Jens Axboe <[email protected]> wrote:
> >
> > Here's a first cut at killing global plugging of block devices to reduce
> > the nasty contention blk_plug_lock caused.
>
> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?

Ugh yes, we certainly should.

> If so, what are the lock ranking issues here? The queue lock is not
> held yet, so it seems pretty simple?

As far as I can tell, it's pretty straightforward. The unplug_fn() will
grab the queue lock for 'ordinary' devices; for dm on dm you'd nest the
map locks inside each other (which should be quite alright, as far as I
can tell, without pulling any nasty tricks).

--
Jens Axboe

2004-03-10 21:04:40

by Jens Axboe

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Jesse Barnes wrote:
> On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> > And what fs, if any? The XFS changes probably need someone knowledgeable
> > about XFS internals to verify them.
>
> Yeah, sorry. I forgot to post CPU time info too (I'm putting that
> together now). The benchmark is doing small, direct I/O requests
> directly to the block devices (e.g. /dev/sdd1).

Neat thanks, looking forward to seeing them.

--
Jens Axboe

2004-03-10 21:08:41

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> And what fs, if any? The XFS changes probably need someone knowledgeable
> about XFS internals to verify them.

Yeah, sorry. I forgot to post CPU time info too (I'm putting that
together now). The benchmark is doing small, direct I/O requests
directly to the block devices (e.g. /dev/sdd1).

Jesse

2004-03-10 21:36:28

by Jesse Barnes

Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10, 2004 at 10:02:50PM +0100, Jens Axboe wrote:
> On Wed, Mar 10 2004, Jesse Barnes wrote:
> > On Wed, Mar 10, 2004 at 09:52:37PM +0100, Jens Axboe wrote:
> > > And what fs, if any? The XFS changes probably need someone knowledgeable
> > > about XFS internals to verify them.
> >
> > Yeah, sorry. I forgot to post CPU time info too (I'm putting that
> > together now). The benchmark is doing small, direct I/O requests
> > directly to the block devices (e.g. /dev/sdd1).
>
> Neat thanks, looking forward to seeing them.

Here's a bit more info. I can probably do some more benchmarking later
if you need it (though sometimes getting time on that machine can be
hard).

The benchmark creates a thread for each block device. Each one opens
the device with O_DIRECT and starts doing I/O.
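
Roughly, each worker does something like this sketch (hypothetical; the
actual benchmark source, block size and device list aren't shown here):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* hypothetical worker: small direct reads against one block device */
static void *io_worker(void *arg)
{
	const char *dev = arg;		/* e.g. "/dev/sdd1" */
	int fd = open(dev, O_RDONLY | O_DIRECT);
	void *buf;

	/* O_DIRECT wants sector-aligned buffers, offsets and lengths */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return NULL;

	for (;;) {
		off_t blk = random() % (1 << 20);

		if (pread(fd, buf, 4096, blk * 4096) != 4096)
			break;
	}
	free(buf);
	close(fd);
	return NULL;
}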

10 qla2200 fc controllers, 64 cpus, 112 disks

-------------------------------------
stock BK tree: ~43945 I/Os per second

[root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
204038 default_idle 6376.1875
700889 snidle 1825.2318
234641 cpu_idle 488.8354
25954 blk_run_queues 45.0590
4475 dio_bio_end_io 11.6536
6137 scsi_end_request 11.2812
18002 scsi_request_fn 9.5350
447 ia64_spinlock_contention 6.9844
19326 __make_request 5.3446
3020 sn_dma_flush 4.4940
856 generic_unplug_device 2.4318
961 scsi_device_unbusy 2.3101
360 current_kernel_time 1.8750
547 put_io_context 1.7094
932 dio_await_one 1.6181
409 blk_queue_bounce 1.5977
3484 qla2x00_start_scsi 1.4140
147 pid_alive 1.1484
436 as_set_request 1.1354
283 kmem_cache_alloc 1.1055

CPU time from vmstat:
usr sys id wa
0 15 20 65
0 19 0 81
0 17 0 83
0 19 0 81
0 16 0 83
0 18 0 82
0 18 0 82
0 13 0 87
0 16 0 84
0 12 26 62

-------------------------------------
w/Jens' patch: ~47149 I/Os per second

[root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
181993 default_idle 5687.2812
624772 snidle 1627.0104
209129 cpu_idle 435.6854
4755 dio_bio_end_io 12.3828
6593 scsi_end_request 12.1195
435 ia64_spinlock_contention 6.7969
2959 sn_dma_flush 4.4033
7459 scsi_request_fn 3.9507
449 blk_run_backing_dev 3.5078
1452 scsi_device_unbusy 3.4904
250 kobject_put 2.6042
590 put_io_context 1.8438
995 dio_await_one 1.6365
313 current_kernel_time 1.6302
361 kmem_cache_alloc 1.4102
3435 qla2x00_start_scsi 1.3941
5032 __make_request 1.3674
480 as_set_request 1.2500
603 scsi_put_command 1.1085
6019 schedule 1.1064

CPU time from vmstat:
usr sys id wa
0 5 60 35
0 8 28 64
0 8 20 72
0 8 19 73
0 8 17 75
0 8 15 77
0 8 15 77
0 8 12 80
0 8 11 81
0 8 11 81

2004-03-10 21:43:31

by Miquel van Smoorenburg

Subject: Re: [PATCH] backing dev unplugging

In article <[email protected]>,
Jens Axboe <[email protected]> wrote:
>On Wed, Mar 10 2004, Andrew Morton wrote:
>> Jens Axboe <[email protected]> wrote:
>> >
>> > Here's a first cut at killing global plugging of block devices to reduce
>> > the nasty contention blk_plug_lock caused.
>>
>> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
>
>Ugh yes, we certainly should.

With the latest patches from Joe it would be more like

	map = dm_get_table(md);
	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}

No lock ranking issues, you just get a refcounted map (table, really).
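
With that, the patch's dm_unplug_all() would presumably end up looking
something like this sketch:

/* sketch: dm_unplug_all() on top of the refcounted table accessor */
static void dm_unplug_all(request_queue_t *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}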

Mike.

2004-03-10 22:23:47

by Nathan Scott

Subject: Re: [PATCH] backing dev unplugging

Hi Jens,

On Wed, Mar 10, 2004 at 01:45:07PM +0100, Jens Axboe wrote:
> ...[snip]...
> diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
> --- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c 2004-03-09 13:08:30.000000000 +0100
> +++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c 2004-03-10 13:13:49.000000000 +0100
> @@ -1013,7 +1013,7 @@
> {
> PB_TRACE(pb, "lock", 0);
> if (atomic_read(&pb->pb_io_remaining))
> - blk_run_queues();
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> down(&pb->pb_sema);
> PB_SET_OWNER(pb);
> PB_TRACE(pb, "locked", 0);
> @@ -1109,7 +1109,7 @@
> if (atomic_read(&pb->pb_pin_count) == 0)
> break;
> if (atomic_read(&pb->pb_io_remaining))
> - blk_run_queues();
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> schedule();
> }
> remove_wait_queue(&pb->pb_waiters, &wait);
> @@ -1407,7 +1407,7 @@
> if (pb->pb_flags & PBF_RUN_QUEUES) {
> pb->pb_flags &= ~PBF_RUN_QUEUES;
> if (atomic_read(&pb->pb_io_remaining) > 1)
> - blk_run_queues();
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> }
> }
>
> @@ -1471,7 +1471,7 @@
> {
> PB_TRACE(pb, "iowait", 0);
> if (atomic_read(&pb->pb_io_remaining))
> - blk_run_queues();
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> down(&pb->pb_iodonesema);
> PB_TRACE(pb, "iowaited", (long)pb->pb_error);
> return pb->pb_error;
> @@ -1617,7 +1617,6 @@
> pagebuf_daemon(
> void *data)
> {
> - int count;
> page_buf_t *pb;
> struct list_head *curr, *next, tmp;
>
> @@ -1640,7 +1639,6 @@
>
> spin_lock(&pbd_delwrite_lock);
>
> - count = 0;
> list_for_each_safe(curr, next, &pbd_delwrite_queue) {
> pb = list_entry(curr, page_buf_t, pb_list);
>
> @@ -1657,7 +1655,7 @@
> pb->pb_flags &= ~PBF_DELWRI;
> pb->pb_flags |= PBF_WRITE;
> list_move(&pb->pb_list, &tmp);
> - count++;
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> }

This moves the blk_run_address_space() call to before we submit the
I/O (this bit of code moves buffers off the delwri queue onto a
temporary queue; buffers on the temporary queue are then submitted a
little further down) - I suspect we need to move this new
blk_run_address_space() call down into the temp list processing, just
after pagebuf_iostrategy().
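
i.e. something along these lines (a sketch, assuming the temp list loop
keeps its current shape):

/* sketch: submit the buffer first, then kick its queue */
while (!list_empty(&tmp)) {
	pb = list_entry(tmp.next, page_buf_t, pb_list);
	list_del_init(&pb->pb_list);

	pagebuf_iostrategy(pb);
	blk_run_address_space(pb->pb_target->pbr_mapping);
}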

> }
>
> @@ -1671,8 +1669,6 @@
>
> if (as_list_len > 0)
> purge_addresses();
> - if (count)
> - blk_run_queues();
>
> force_flush = 0;
> } while (pagebuf_daemon_active);
> @@ -1734,13 +1730,11 @@
> pagebuf_lock(pb);
> pagebuf_iostrategy(pb);
> if (++flush_cnt > 32) {
> - blk_run_queues();
> + blk_run_address_space(pb->pb_target->pbr_mapping);
> flush_cnt = 0;
> }
> }
>
> - blk_run_queues();
> -
> while (!list_empty(&tmp)) {
> pb = list_entry(tmp.next, page_buf_t, pb_list);
>

For this second one, we probably just want to ditch the flush_cnt
there (this change does blk_run_address_space() on every 32nd buffer
target, and not the intervening ones). We will be doing a bunch more
blk_run_address_space() calls than we probably need to; not sure if
that's going to become an issue or not. Let me prod some of the other
XFS folks for more insight there...

thanks.

--
Nathan

2004-03-10 23:07:53

by Andrew Morton

Subject: Re: [PATCH] backing dev unplugging


(Please use reply-to-all)

"Miquel van Smoorenburg" <[email protected]> wrote:
>
> In article <[email protected]>,
> Jens Axboe <[email protected]> wrote:
> >On Wed, Mar 10 2004, Andrew Morton wrote:
> >> Jens Axboe <[email protected]> wrote:
> >> >
> >> > Here's a first cut at killing global plugging of block devices to reduce
> >> > the nasty contention blk_plug_lock caused.
> >>
> >> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
> >
> >Ugh yes, we certainly should.
>
> With the latest patches from Joe it would be more like
>
> 	map = dm_get_table(md);
> 	if (map) {
> 		dm_table_unplug_all(map);
> 		dm_table_put(map);
> 	}
>
> No lock ranking issues, you just get a refcounted map (table, really).

Ah, OK. Jens, you'll be needing this (on rc2-mm1):

dm.c: protect md->map with a rw spin lock rather than the md->lock
semaphore. Also ensure that everyone accesses md->map through
dm_get_table(), rather than directly.


---

25-akpm/drivers/md/dm-table.c | 3 +
25-akpm/drivers/md/dm.c | 88 +++++++++++++++++++++++++-----------------
2 files changed, 57 insertions(+), 34 deletions(-)

diff -puN drivers/md/dm.c~dm-map-rwlock-ng drivers/md/dm.c
--- 25/drivers/md/dm.c~dm-map-rwlock-ng Mon Mar 8 13:58:42 2004
+++ 25-akpm/drivers/md/dm.c Mon Mar 8 13:58:42 2004
@@ -49,7 +49,7 @@ struct target_io {

struct mapped_device {
struct rw_semaphore lock;
- rwlock_t maplock;
+ rwlock_t map_lock;
atomic_t holders;

unsigned long flags;
@@ -238,6 +238,24 @@ static int queue_io(struct mapped_device
return 0; /* deferred successfully */
}

+/*
+ * Everyone (including functions in this file), should use this
+ * function to access the md->map field, and make sure they call
+ * dm_table_put() when finished.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md)
+{
+ struct dm_table *t;
+
+ read_lock(&md->map_lock);
+ t = md->map;
+ if (t)
+ dm_table_get(t);
+ read_unlock(&md->map_lock);
+
+ return t;
+}
+
/*-----------------------------------------------------------------
* CRUD START:
* A more elegant soln is in the works that uses the queue
@@ -346,6 +364,7 @@ static void __map_bio(struct dm_target *

struct clone_info {
struct mapped_device *md;
+ struct dm_table *map;
struct bio *bio;
struct dm_io *io;
sector_t sector;
@@ -399,7 +418,7 @@ static struct bio *clone_bio(struct bio
static void __clone_and_map(struct clone_info *ci)
{
struct bio *clone, *bio = ci->bio;
- struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
+ struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
struct target_io *tio;

@@ -460,7 +479,7 @@ static void __clone_and_map(struct clone

ci->sector += max;
ci->sector_count -= max;
- ti = dm_table_find_target(ci->md->map, ci->sector);
+ ti = dm_table_find_target(ci->map, ci->sector);

len = to_sector(bv->bv_len) - max;
clone = split_bvec(bio, ci->sector, ci->idx,
@@ -485,6 +504,7 @@ static void __split_bio(struct mapped_de
struct clone_info ci;

ci.md = md;
+ ci.map = dm_get_table(md);
ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
@@ -501,6 +521,7 @@ static void __split_bio(struct mapped_de

/* drop the extra reference count */
dec_pending(ci.io, 0);
+ dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
* CRUD END
@@ -563,15 +584,16 @@ static int dm_request(request_queue_t *q
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r;
- struct mapped_device *md = congested_data;
+ struct mapped_device *md = (struct mapped_device *) congested_data;
+ struct dm_table *map = dm_get_table(md);

- read_lock(&md->maplock);
- if (md->map == NULL || test_bit(DMF_BLOCK_IO, &md->flags))
+ if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
+ /* FIXME: shouldn't suspended count a congested ? */
r = bdi_bits;
else
- r = dm_table_any_congested(md->map, bdi_bits);
- read_unlock(&md->maplock);
+ r = dm_table_any_congested(map, bdi_bits);

+ dm_table_put(map);
return r;
}

@@ -646,7 +668,7 @@ static struct mapped_device *alloc_dev(u

memset(md, 0, sizeof(*md));
init_rwsem(&md->lock);
- rwlock_init(&md->maplock);
+ rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);

md->queue = blk_alloc_queue(GFP_KERNEL);
@@ -746,12 +768,12 @@ static int __bind(struct mapped_device *
if (size == 0)
return 0;

- write_lock(&md->maplock);
+ write_lock(&md->map_lock);
md->map = t;
- write_unlock(&md->maplock);
- dm_table_event_callback(md->map, event_callback, md);
+ write_unlock(&md->map_lock);

dm_table_get(t);
+ dm_table_event_callback(md->map, event_callback, md);
dm_table_set_restrictions(t, q);
return 0;
}
@@ -764,9 +786,9 @@ static void __unbind(struct mapped_devic
return;

dm_table_event_callback(map, NULL, NULL);
- write_lock(&md->maplock);
+ write_lock(&md->map_lock);
md->map = NULL;
- write_unlock(&md->maplock);
+ write_unlock(&md->map_lock);
dm_table_put(map);
}

@@ -803,12 +825,16 @@ void dm_get(struct mapped_device *md)

void dm_put(struct mapped_device *md)
{
+ struct dm_table *map = dm_get_table(md);
+
if (atomic_dec_and_test(&md->holders)) {
- if (!test_bit(DMF_SUSPENDED, &md->flags) && md->map)
- dm_table_suspend_targets(md->map);
+ if (!test_bit(DMF_SUSPENDED, &md->flags) && map)
+ dm_table_suspend_targets(map);
__unbind(md);
free_dev(md);
}
+
+ dm_table_put(map);
}

/*
@@ -859,6 +885,7 @@ int dm_swap_table(struct mapped_device *
*/
int dm_suspend(struct mapped_device *md)
{
+ struct dm_table *map;
DECLARE_WAITQUEUE(wait, current);

down_write(&md->lock);
@@ -894,8 +921,11 @@ int dm_suspend(struct mapped_device *md)
down_write(&md->lock);
remove_wait_queue(&md->wait, &wait);
set_bit(DMF_SUSPENDED, &md->flags);
- if (md->map)
- dm_table_suspend_targets(md->map);
+
+ map = dm_get_table(md);
+ if (map)
+ dm_table_suspend_targets(map);
+ dm_table_put(map);
up_write(&md->lock);

return 0;
@@ -904,22 +934,25 @@ int dm_suspend(struct mapped_device *md)
int dm_resume(struct mapped_device *md)
{
struct bio *def;
+ struct dm_table *map = dm_get_table(md);

down_write(&md->lock);
- if (!md->map ||
+ if (!map ||
!test_bit(DMF_SUSPENDED, &md->flags) ||
- !dm_table_get_size(md->map)) {
+ !dm_table_get_size(map)) {
up_write(&md->lock);
+ dm_table_put(map);
return -EINVAL;
}

- dm_table_resume_targets(md->map);
+ dm_table_resume_targets(map);
clear_bit(DMF_SUSPENDED, &md->flags);
clear_bit(DMF_BLOCK_IO, &md->flags);

def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->lock);
+ dm_table_put(map);

blk_run_queues();

@@ -971,19 +1004,6 @@ struct gendisk *dm_disk(struct mapped_de
return md->disk;
}

-struct dm_table *dm_get_table(struct mapped_device *md)
-{
- struct dm_table *t;
-
- down_read(&md->lock);
- t = md->map;
- if (t)
- dm_table_get(t);
- up_read(&md->lock);
-
- return t;
-}
-
int dm_suspended(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED, &md->flags);
diff -puN drivers/md/dm-table.c~dm-map-rwlock-ng drivers/md/dm-table.c
--- 25/drivers/md/dm-table.c~dm-map-rwlock-ng Mon Mar 8 13:58:42 2004
+++ 25-akpm/drivers/md/dm-table.c Mon Mar 8 13:58:42 2004
@@ -279,6 +279,9 @@ void dm_table_get(struct dm_table *t)

void dm_table_put(struct dm_table *t)
{
+ if (!t)
+ return;
+
if (atomic_dec_and_test(&t->holders))
table_destroy(t);
}

_

2004-03-10 23:36:49

by Steve Lord

Subject: Re: [PATCH] backing dev unplugging

Nathan Scott wrote:
>
> For this second one, we probably just want to ditch the flush_cnt
> there (this change is doing blk_run_address_space on every 32nd
> buffer target, and not the intervening ones). We will be doing a
> bunch more blk_run_address_space calls than we probably need to,
> not sure if thats going to become an issue or not, let me prod
> some of the other XFS folks for more insight there...
>
> thanks.
>

The concept there was that we were just pushing things down into the
elevator in a batch, then unplugging it afterwards. The 'do it every
32 I/Os' part was added to avoid some request starvation issues - which
are probably historical now.

I was lazy and did not look at the context of the code, but there
are two paths in here. One is unmount flushing all the entries for
a specific filesystem; the other is background flushing. I think the
background flush activity can live without the blk_run_address_space()
call being there at all. If we need to grab the lock on the metadata
later, we will make the call then. The unmount case needs to call it,
but since that specifies a specific target, it can just make one
call.
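
In other words, for the unmount path, something like this sketch (the
'target' variable here is hypothetical):

/* sketch: batch-submit the buffers, then one kick for the one target */
list_for_each_safe(curr, next, &tmp) {
	pb = list_entry(curr, page_buf_t, pb_list);
	list_del_init(&pb->pb_list);
	pagebuf_iostrategy(pb);
}
blk_run_address_space(target->pbr_mapping);	/* single call */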

Steve

2004-03-10 23:52:36

by Andrew Morton

Subject: Re: [PATCH] backing dev unplugging

[email protected] (Jesse Barnes) wrote:
>
> -------------------------------------
> w/Jens' patch: ~47149 I/Os per second

Happier.

> [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
> 181993 default_idle 5687.2812
> 624772 snidle 1627.0104
> 209129 cpu_idle 435.6854
> 4755 dio_bio_end_io 12.3828
> 6593 scsi_end_request 12.1195
> 435 ia64_spinlock_contention 6.7969
> 2959 sn_dma_flush 4.4033

Do you know where that spinlock contention is coming from?

(We have a little patch for x86 which places the spinning code inline in the
caller of spin_lock() so it appears nicely in profiles.)

2004-03-11 00:04:07

by David Mosberger

Subject: Re: [PATCH] backing dev unplugging

>>>>> On Wed, 10 Mar 2004 15:54:19 -0800, Andrew Morton <[email protected]> said:

Andrew> [email protected] (Jesse Barnes) wrote:
>>
>> -------------------------------------
>> w/Jens' patch: ~47149 I/Os per second

Andrew> Happier.

>> [root@revenue sio]# readprofile -m /root/System.map | sort -nr +2 | head -20
>> 181993 default_idle 5687.2812
>> 624772 snidle 1627.0104
>> 209129 cpu_idle 435.6854
>> 4755 dio_bio_end_io 12.3828
>> 6593 scsi_end_request 12.1195
>> 435 ia64_spinlock_contention 6.7969
>> 2959 sn_dma_flush 4.4033

Andrew> Do you know where that spinlock contention is coming from?

Andrew> (We have a little patch for x86 which places the spinning
Andrew> code inline in the caller of spin_lock() so it appears
Andrew> nicely in profiles.)

And real men use profiling tools that provide a call-graph, so this
hack isn't necessary... ;-)

Jesse, if you want to try q-tools and need some help in getting the
per-CPU results merged, let me know (it's something that's planned for
a future release, but there are only so many hours in a day, so this
hasn't been done yet).

--david

2004-03-11 00:07:44

by Miquel van Smoorenburg

Subject: Re: [PATCH] backing dev unplugging

On Thu, 11 Mar 2004 00:05:42, Andrew Morton wrote:
> "Miquel van Smoorenburg" <[email protected]> wrote:
> >
> >
> > With the latest patches from Joe it would be more like
> >
> > 	map = dm_get_table(md);
> > 	if (map) {
> > 		dm_table_unplug_all(map);
> > 		dm_table_put(map);
> > 	}
> >
> > No lock ranking issues, you just get a refcounted map (table, really).
>
> Ah, OK. Jens, you'll be needing this (on rc2-mm1):
>
> dm.c: protect md->map with a rw spin lock rather than the md->lock
> semaphore. Also ensure that everyone accesses md->map through
> dm_get_table(), rather than directly.
>
> 25-akpm/drivers/md/dm-table.c | 3 +
> 25-akpm/drivers/md/dm.c | 88 +++++++++++++++++++++++++-----------------

.. and this final one on top of it, presumably.

See https://www.redhat.com/archives/dm-devel/2004-March/msg00036.html

dm.c: remove __dm_request (merge with previous patch).
--- diff/drivers/md/dm.c 2004-03-08 15:48:05.000000000 +0000
+++ source/drivers/md/dm.c 2004-03-09 09:40:37.000000000 +0000
@@ -506,8 +506,13 @@ static void __split_bio(struct mapped_de
{
struct clone_info ci;

- ci.md = md;
ci.map = dm_get_table(md);
+ if (!ci.map) {
+ bio_io_error(bio, bio->bi_size);
+ return;
+ }
+
+ ci.md = md;
ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
@@ -530,17 +535,6 @@ static void __split_bio(struct mapped_de
* CRUD END
*---------------------------------------------------------------*/

-
-static inline void __dm_request(struct mapped_device *md, struct bio *bio)
-{
- if (!md->map) {
- bio_io_error(bio, bio->bi_size);
- return;
- }
-
- __split_bio(md, bio);
-}
-
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
@@ -579,7 +573,7 @@ static int dm_request(request_queue_t *q
down_read(&md->lock);
}

- __dm_request(md, bio);
+ __split_bio(md, bio);
up_read(&md->lock);
return 0;
}
@@ -591,7 +585,6 @@ static int dm_any_congested(void *conges
struct dm_table *map = dm_get_table(md);

if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
- /* FIXME: shouldn't suspended count a congested ? */
r = bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
@@ -850,7 +843,7 @@ static void __flush_deferred_io(struct m
while (c) {
n = c->bi_next;
c->bi_next = NULL;
- __dm_request(md, c);
+ __split_bio(md, c);
c = n;
}
}


Mike.

2004-03-11 00:15:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

Miquel van Smoorenburg <[email protected]> wrote:
>
> > dm.c: protect md->map with a rw spin lock rather than the md->lock
> > semaphore. Also ensure that everyone accesses md->map through
> > dm_get_table(), rather than directly.
> >
> > 25-akpm/drivers/md/dm-table.c | 3 +
> > 25-akpm/drivers/md/dm.c | 88 +++++++++++++++++++++++++-----------------
>
> .. and this final one on top of it, presumably.
>
> See https://www.redhat.com/archives/dm-devel/2004-March/msg00036.html

Yup, thanks. Lots happening. I'll take a trot through the kernel
maternity ward, see if I can drag out another -mm later today.

2004-03-11 06:31:50

by Jesse Barnes

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10, 2004 at 04:03:55PM -0800, David Mosberger wrote:
> Jesse, if you want to try q-tools and need some help in getting the
> per-CPU results merged, let me know (it's something that's planned for
> a future release, but there is only so many hours in a day so this
> hasn't been done yet).

Yeah, I've been really wanting to try it out (esp. when you mentioned
that you'd be able to profile interrupt-off code :). I'll try to get
some more time on the machine and take a look.

Also, wli just informed me that I screwed up another aspect of the
profile too: the readprofile command I grabbed from the manpage (as my
time on the machine ran out) didn't output things in a very useful
order.

Thanks,
Jesse

2004-03-11 06:43:53

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004, Andrew Morton wrote:
>
> (Please use reply-to-all)
>
> "Miquel van Smoorenburg" <[email protected]> wrote:
> >
> > In article <[email protected]>,
> > Jens Axboe <[email protected]> wrote:
> > >On Wed, Mar 10 2004, Andrew Morton wrote:
> > >> Jens Axboe <[email protected]> wrote:
> > >> >
> > >> > Here's a first cut at killing global plugging of block devices to reduce
> > >> > the nasty contention blk_plug_lock caused.
> > >>
> > >> Shouldn't we take read_lock(&md->map_lock) in dm_table_unplug_all()?
> > >
> > >Ugh yes, we certainly should.
> >
> > With the latest patches from Joe it would be more like
> >
> > map = dm_get_table(md);
> > if (map) {
> > dm_table_unplug_all(map);
> > dm_table_put(map);
> > }
> >
> > No lock ranking issues, you just get a refcounted map (table, really).
>
> Ah, OK. Jens, you'll be needing this (on rc2-mm1):
>
> dm.c: protect md->map with a rw spin lock rather than the md->lock
> semaphore. Also ensure that everyone accesses md->map through
> dm_get_table(), rather than directly.

Neato, much better. I'll build on top of that.

--
Jens Axboe

2004-03-11 07:05:45

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Thu, Mar 11 2004, Nathan Scott wrote:
> Hi Jens,
>
> On Wed, Mar 10, 2004 at 01:45:07PM +0100, Jens Axboe wrote:
> > ...[snip]...
> > diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c
> > --- /opt/kernel/linux-2.6.4-rc2-mm1/fs/xfs/linux/xfs_buf.c 2004-03-09 13:08:30.000000000 +0100
> > +++ linux-2.6.4-rc2-mm1-plug/fs/xfs/linux/xfs_buf.c 2004-03-10 13:13:49.000000000 +0100
> > @@ -1013,7 +1013,7 @@
> > {
> > PB_TRACE(pb, "lock", 0);
> > if (atomic_read(&pb->pb_io_remaining))
> > - blk_run_queues();
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > down(&pb->pb_sema);
> > PB_SET_OWNER(pb);
> > PB_TRACE(pb, "locked", 0);
> > @@ -1109,7 +1109,7 @@
> > if (atomic_read(&pb->pb_pin_count) == 0)
> > break;
> > if (atomic_read(&pb->pb_io_remaining))
> > - blk_run_queues();
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > schedule();
> > }
> > remove_wait_queue(&pb->pb_waiters, &wait);
> > @@ -1407,7 +1407,7 @@
> > if (pb->pb_flags & PBF_RUN_QUEUES) {
> > pb->pb_flags &= ~PBF_RUN_QUEUES;
> > if (atomic_read(&pb->pb_io_remaining) > 1)
> > - blk_run_queues();
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > }
> > }
> >
> > @@ -1471,7 +1471,7 @@
> > {
> > PB_TRACE(pb, "iowait", 0);
> > if (atomic_read(&pb->pb_io_remaining))
> > - blk_run_queues();
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > down(&pb->pb_iodonesema);
> > PB_TRACE(pb, "iowaited", (long)pb->pb_error);
> > return pb->pb_error;
> > @@ -1617,7 +1617,6 @@
> > pagebuf_daemon(
> > void *data)
> > {
> > - int count;
> > page_buf_t *pb;
> > struct list_head *curr, *next, tmp;
> >
> > @@ -1640,7 +1639,6 @@
> >
> > spin_lock(&pbd_delwrite_lock);
> >
> > - count = 0;
> > list_for_each_safe(curr, next, &pbd_delwrite_queue) {
> > pb = list_entry(curr, page_buf_t, pb_list);
> >
> > @@ -1657,7 +1655,7 @@
> > pb->pb_flags &= ~PBF_DELWRI;
> > pb->pb_flags |= PBF_WRITE;
> > list_move(&pb->pb_list, &tmp);
> > - count++;
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > }
>
> This moves the blk_run_address_space to before we submit the
> I/O (this bit of code is moving buffers off the delwri queue
> onto a temporary queue; buffers on the temporary queue are
> then submitted a little further down). I suspect we need to
> move this new blk_run_address_space call down into the temp
> list processing, just after pagebuf_iostrategy.

I'm not surprised; the XFS 'conversion' was done quickly and is expected
to be half-assed :)

Thanks a lot for taking a look at this.

> > }
> >
> > @@ -1671,8 +1669,6 @@
> >
> > if (as_list_len > 0)
> > purge_addresses();
> > - if (count)
> > - blk_run_queues();
> >
> > force_flush = 0;
> > } while (pagebuf_daemon_active);
> > @@ -1734,13 +1730,11 @@
> > pagebuf_lock(pb);
> > pagebuf_iostrategy(pb);
> > if (++flush_cnt > 32) {
> > - blk_run_queues();
> > + blk_run_address_space(pb->pb_target->pbr_mapping);
> > flush_cnt = 0;
> > }
> > }
> >
> > - blk_run_queues();
> > -
> > while (!list_empty(&tmp)) {
> > pb = list_entry(tmp.next, page_buf_t, pb_list);
> >
>
> For this second one, we probably just want to ditch the flush_cnt
> there (this change is doing blk_run_address_space on every 32nd
> buffer target, and not the intervening ones). We will be doing a
> bunch more blk_run_address_space calls than we probably need to;
> not sure if that's going to become an issue or not. Let me prod
> some of the other XFS folks for more insight there...

If any of you could send me a replacement xfs_buf bit, I'd much
appreciate it.
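
Something along these lines, perhaps (sketch only, untested: flush_cnt
dropped and the unplug moved into the tmp-list loop right after
pagebuf_iostrategy, per Nathan's notes; loop variables as in the
quoted hunks):

	list_for_each_safe(curr, next, &tmp) {
		pb = list_entry(curr, page_buf_t, pb_list);
		pagebuf_lock(pb);
		pagebuf_iostrategy(pb);
		/* kick this buffer's own target now that its I/O is queued */
		blk_run_address_space(pb->pb_target->pbr_mapping);
	}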

--
Jens Axboe

2004-03-11 09:16:15

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Thu, Mar 11 2004, Joe Thornber wrote:
> Jens,
>
> Small locking changes to the dm bits. I've just seen that you've
> posted an updated version of your patch to lkml, so I'll post another
> version of this patch to that thread too.

Thanks Joe, you don't have to generate a new diff; I'll just adapt this
version and post an update.

--
Jens Axboe

2004-03-11 09:13:26

by Joe Thornber

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

Jens,

Small locking changes to the dm bits. I've just seen that you've
posted an updated version of your patch to lkml, so I'll post another
version of this patch to that thread too.

- Joe


Fix md->map protection in the global unplug removal patch.
--- diff/drivers/md/dm.c 2004-03-11 08:41:27.000000000 +0000
+++ source/drivers/md/dm.c 2004-03-11 08:57:14.000000000 +0000
@@ -578,8 +578,12 @@ static int dm_request(request_queue_t *q
static void dm_unplug_all(request_queue_t *q)
{
struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);

- dm_table_unplug_all(md->map);
+ if (map) {
+ dm_table_unplug_all(map);
+ dm_table_put(map);
+ }
}

static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -904,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
add_wait_queue(&md->wait, &wait);
up_write(&md->lock);

+ /* unplug */
+ map = dm_get_table(md);
+ if (map) {
+ dm_table_unplug_all(map);
+ dm_table_put(map);
+ }
+
/*
* Then we wait for the already mapped ios to
* complete.
*/
- dm_table_unplug_all(md->map);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);

@@ -953,10 +963,9 @@ int dm_resume(struct mapped_device *md)
def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->lock);
+ dm_table_unplug_all(map);
dm_table_put(map);

- dm_table_unplug_all(md->map);
-
return 0;
}

2004-03-11 12:17:36

by Christophe Saout

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Wed, Mar 10 2004 at 13:45, Jens Axboe wrote:

> diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
> --- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c 2004-03-09 13:08:48.000000000 +0100
> +++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c 2004-03-09 15:27:36.000000000 +0100
> @@ -668,7 +668,7 @@
>
> /* out of memory -> run queues */
> if (remaining)
> - blk_run_queues();
> + blk_congestion_wait(bio_data_dir(clone), HZ/100);

Why did you change this? It was the way I wanted it.

If we were out of memory, the buffers were allocated from a mempool and
I want to get them out as soon as possible. If we are OOM, the write
will most likely be the VM trying to free some memory, and it would be
counterproductive to wait. It is not unlikely that we are the only
writer to that disk, so there's a chance that the queue is not
congested.


2004-03-11 12:22:15

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Thu, Mar 11 2004, Christophe Saout wrote:
> On Wed, Mar 10 2004 at 13:45, Jens Axboe wrote:
>
> > diff -ur -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c
> > --- /opt/kernel/linux-2.6.4-rc2-mm1/drivers/md/dm-crypt.c 2004-03-09 13:08:48.000000000 +0100
> > +++ linux-2.6.4-rc2-mm1-plug/drivers/md/dm-crypt.c 2004-03-09 15:27:36.000000000 +0100
> > @@ -668,7 +668,7 @@
> >
> > /* out of memory -> run queues */
> > if (remaining)
> > - blk_run_queues();
> > + blk_congestion_wait(bio_data_dir(clone), HZ/100);
>
> Why did you change this? It was the way I wanted it.
>
> If we were out of memory, the buffers were allocated from a mempool and
> I want to get them out as soon as possible. If we are OOM, the write
> will most likely be the VM trying to free some memory, and it would be
> counterproductive to wait. It is not unlikely that we are the only
> writer to that disk, so there's a chance that the queue is not
> congested.

Because it wasn't right, like most of those blk_run_queues() calls. The
VM should get things going; you basically just want to take a little
nap and retry. The idea with a small wait is to allow I/O to make some
progress; you gain nothing by retrying right away (except wasting CPU
spinning). It might want to be 1 instead of HZ/100, though. I doubt it
would make much difference, since it's an OOM condition.
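
The pattern I'm after is basically this (a sketch, simplified from the
dm-crypt hunk above; crypt_alloc_clone() is a made-up stand-in for the
real mempool-backed clone path):

	while (remaining) {
		struct bio *clone = crypt_alloc_clone(cc, bio, &remaining);

		if (clone) {
			generic_make_request(clone);
			continue;
		}

		/*
		 * Mempool exhausted: nap briefly so in-flight I/O can
		 * complete and refill it, instead of kicking every queue
		 * in the system with blk_run_queues().
		 */
		blk_congestion_wait(bio_data_dir(bio), HZ/100);
	}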

--
Jens Axboe

2004-03-11 13:11:32

by Christophe Saout

[permalink] [raw]
Subject: Re: [PATCH] backing dev unplugging

On Thu, Mar 11 2004 at 13:22, Jens Axboe wrote:

> > If we were out of memory, the buffers were allocated from a mempool and
> > I want to get them out as soon as possible. If we are OOM, the write
> > will most likely be the VM trying to free some memory, and it would be
> > counterproductive to wait. It is not unlikely that we are the only
> > writer to that disk, so there's a chance that the queue is not
> > congested.
>
> Because it wasn't right, like most of those blk_run_queues() calls. The
> VM should get things going; you basically just want to take a little
> nap and retry. The idea with a small wait is to allow I/O to make some
> progress; you gain nothing by retrying right away (except wasting CPU
> spinning). It might want to be 1 instead of HZ/100, though. I doubt it
> would make much difference, since it's an OOM condition.

Well, okay. You're right then. I think the wait is unnecessary because
the next thing the code tries to do is allocate a new page with waiting
allowed, and the VM will run the queue and wait itself. So if you think
it's safe, you can even remove it.
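
(For reference, the allocation that follows is a blocking one, roughly
like the sketch below; cc->page_pool is dm-crypt's page mempool, and
GFP_NOIO includes __GFP_WAIT, so the VM may unplug and wait on our
behalf:)

	/* blocking allocation: the VM itself will run the queue and wait */
	struct page *page = mempool_alloc(cc->page_pool, GFP_NOIO);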

Thanks.


2004-03-15 05:54:23

by Chen, Kenneth W

[permalink] [raw]
Subject: RE: [PATCH] backing dev unplugging

>>>>>> Andrew Morton on Wed March 10, 2004 11:56 AM
>Jens Axboe <[email protected]> wrote:
>> Here's a first cut at killing global plugging of block devices to reduce
>> the nasty contention blk_plug_lock caused. This introduceds per-queue
>> plugging, controlled by the backing_dev_info.
>
>This is such an improvement over what we have now it isn't funny.
>
>Ken, the next -mm is starting to look like linux-3.1.0 so I think it
>would be best if you could benchmark Jens's patch against 2.6.4-rc2-mm1.


Our latest measurement on the 32P, 1000-disk setup indicates this patch
is working as expected performance-wise. We saw a 200% improvement in
throughput compared to the global blk_plug_list. (Compared to the
per-cpu blk_plug_list, performance is the same.)

- Ken