2006-11-16 03:16:22

by Edward Falk

[permalink] [raw]
Subject: [PATCH] Introduce block I/O performance histograms

diff -uprN linux-2.6.18.1/block/Kconfig block_histogram/block/Kconfig
--- linux-2.6.18.1/block/Kconfig 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/block/Kconfig 2006-11-15 18:08:07.000000000 -0800
@@ -24,6 +24,31 @@ config BLK_DEV_IO_TRACE

git://brick.kernel.dk/data/git/blktrace.git

+config BLOCK_HISTOGRAM
+ bool "Performance histogram data"
+ help
+ This option causes block devices to collect statistics on transfer
+ sizes and times. Useful for performance-tuning a system. Creates
+ histogram entries under /sys/block/<dev>/. Increases kernel size
+ by about 21k.
+
+ If you are unsure, say N here.
+
+config HISTO_SIZE_BUCKETS
+	int "Number of size buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	range 2 12
+	default "6"
+	help
+	  This option controls how many buckets are used to collect
+	  transfer size statistics. At least two buckets are required:
+	  the bucket lookup shifts by (buckets - 2), so smaller values
+	  would produce an invalid shift count.
+
+config HISTO_TIME_BUCKETS
+	int "Number of time buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	range 1 16
+	default "8"
+	help
+	  This option controls how many buckets are used to collect
+	  transfer time statistics. At least one bucket is required,
+	  otherwise the per-disk histogram tables would have zero width.
+
config LSF
bool "Support for Large Single Files"
depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
diff -uprN linux-2.6.18.1/block/genhd.c block_histogram/block/genhd.c
--- linux-2.6.18.1/block/genhd.c 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/block/genhd.c 2006-11-15 18:08:07.000000000 -0800
@@ -16,6 +16,13 @@
#include <linux/buffer_head.h>
#include <linux/mutex.h>

+#ifdef CONFIG_BLOCK_HISTOGRAM
+static ssize_t disk_read_request_histo_read(struct gendisk *, char *page);
+static ssize_t disk_read_dma_histo_read(struct gendisk *, char *page);
+static ssize_t disk_write_request_histo_read(struct gendisk *, char *page);
+static ssize_t disk_write_dma_histo_read(struct gendisk *, char *page);
+#endif
+
struct subsystem block_subsys;
static DEFINE_MUTEX(block_subsys_lock);

@@ -412,6 +419,25 @@ static struct disk_attribute disk_attr_s
.show = disk_stats_read
};

+#ifdef CONFIG_BLOCK_HISTOGRAM
+/*
+ * Read-only sysfs attributes, one per histogram channel
+ * (read/write x request/dma).  Each one is backed by the show()
+ * routine of the same name declared near the top of this file.
+ */
+static struct disk_attribute disk_attr_read_request_histo = {
+	.attr = {
+		.name = "read_request_histo",
+		.mode = S_IRUGO,
+	},
+	.show = disk_read_request_histo_read,
+};
+
+static struct disk_attribute disk_attr_read_dma_histo = {
+	.attr = {
+		.name = "read_dma_histo",
+		.mode = S_IRUGO,
+	},
+	.show = disk_read_dma_histo_read,
+};
+
+static struct disk_attribute disk_attr_write_request_histo = {
+	.attr = {
+		.name = "write_request_histo",
+		.mode = S_IRUGO,
+	},
+	.show = disk_write_request_histo_read,
+};
+
+static struct disk_attribute disk_attr_write_dma_histo = {
+	.attr = {
+		.name = "write_dma_histo",
+		.mode = S_IRUGO,
+	},
+	.show = disk_write_dma_histo_read,
+};
+#endif
+
static struct attribute * default_attrs[] = {
&disk_attr_uevent.attr,
&disk_attr_dev.attr,
@@ -419,6 +445,12 @@ static struct attribute * default_attrs[
&disk_attr_removable.attr,
&disk_attr_size.attr,
&disk_attr_stat.attr,
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ &disk_attr_read_request_histo.attr,
+ &disk_attr_read_dma_histo.attr,
+ &disk_attr_write_request_histo.attr,
+ &disk_attr_write_dma_histo.attr,
+#endif
NULL,
};

@@ -708,3 +740,288 @@ int invalidate_partition(struct gendisk
}

EXPORT_SYMBOL(invalidate_partition);
+
+
+
+#ifdef CONFIG_BLOCK_HISTOGRAM
+
+/*
+ * Explanation of histogram code:
+ *
+ * The block_histogram code implements a 2-variable histogram, with transfers
+ * tracked by transfer size and completion time. The sysfs files are
+ * /sys/block/DEV/read_request_histo, /sys/block/DEV/write_request_histo,
+ * /sys/block/DEV/read_dma_histo, and /sys/block/DEV/write_dma_histo.
+ *
+ * The *request_histo files measure time from when the request is first
+ * submitted into the drive's queue. The *dma_histo files measure time
+ * from when the request is transferred from the queue to the device.
+ *
+ * block_histogram_start_time() is used to note the time that the
+ * request was transferred to the device. block_histogram_completion()
+ * is used to note the time the transfer was completed.
+ */
+
+
+#define HISTO_REQUEST 0
+#define HISTO_DMA 1
+
+/**
+ * Zero one histogram table inside a single disk_stats instance.
+ *
+ * @stats: statistics block whose table is cleared
+ * @cmd:   READ selects the read table, any other value the write table
+ * @type:  HISTO_REQUEST selects the request table, any other value the
+ *         dma table
+ */
+static inline void block_histogram_init2(struct disk_stats *stats,
+					 int cmd, int type)
+{
+	void *table;
+	size_t bytes;
+
+	/* Pick the table first, then clear it in one place. */
+	if (type == HISTO_REQUEST && cmd == READ) {
+		table = &stats->read_req_histo;
+		bytes = sizeof(stats->read_req_histo);
+	} else if (type == HISTO_REQUEST) {
+		table = &stats->write_req_histo;
+		bytes = sizeof(stats->write_req_histo);
+	} else if (cmd == READ) {
+		table = &stats->read_dma_histo;
+		bytes = sizeof(stats->read_dma_histo);
+	} else {
+		table = &stats->write_dma_histo;
+		bytes = sizeof(stats->write_dma_histo);
+	}
+
+	memset(table, 0, bytes);
+}
+
+
+/**
+ * Clear one histogram channel of a disk.
+ *
+ * On SMP builds the channel is cleared in the disk_stats instance of
+ * every possible cpu; on UP builds there is only the single instance
+ * embedded in the gendisk.
+ *
+ * @disk: disk whose statistics are cleared
+ * @cmd:  READ or WRITE
+ * @type: HISTO_REQUEST or HISTO_DMA
+ */
+static void block_histogram_init1(struct gendisk *disk, int cmd, int type)
+{
+#ifndef CONFIG_SMP
+	block_histogram_init2(&disk->dkstats, cmd, type);
+#else
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		block_histogram_init2(per_cpu_ptr(disk->dkstats, cpu),
+				      cmd, type);
+	}
+#endif
+}
+
+
+/**
+ * Clear all four histogram channels (read/write x request/dma) of a disk.
+ *
+ * @disk: disk whose histograms are reset
+ *
+ * block_histogram_init1() takes (disk, cmd, type).  The arguments used
+ * to be passed as (type, cmd), which only worked because READ and
+ * HISTO_REQUEST are both 0 and WRITE and HISTO_DMA are both 1.
+ */
+void block_histogram_init(struct gendisk *disk)
+{
+	block_histogram_init1(disk, READ, HISTO_REQUEST);
+	block_histogram_init1(disk, WRITE, HISTO_REQUEST);
+	block_histogram_init1(disk, READ, HISTO_DMA);
+	block_histogram_init1(disk, WRITE, HISTO_DMA);
+}
+EXPORT_SYMBOL(block_histogram_init);
+
+
+
+/*
+ * Map a transfer size to its histogram row.
+ *
+ * Row boundaries double each step.  Row 0 holds transfers shorter than
+ * BASE_HISTO_SIZE sectors; row k (k >= 1) holds transfers of
+ * BASE_HISTO_SIZE * 2^(k-1) up to, but not including,
+ * BASE_HISTO_SIZE * 2^k sectors.  The last row also absorbs everything
+ * larger.
+ */
+static inline int stats_size_bucket(int sectors)
+{
+	const int last = CONFIG_HISTO_SIZE_BUCKETS - 1;
+	int units = sectors / BASE_HISTO_SIZE;
+	int bucket = 0;
+
+	/* Anything at or beyond the start of the last row lands in it. */
+	if (units >= (1 << (last - 1)))
+		return last;
+
+	/* Count how many halvings it takes to reach zero. */
+	while (units > 0) {
+		units /= 2;
+		bucket++;
+	}
+	return bucket;
+}
+
+
+/*
+ * Map an elapsed time, given in jiffies, to its histogram column.
+ *
+ * Column upper bounds follow a 1,2,5 progression starting at
+ * BASE_HISTO_TIME ms: 10, 20, 50, 100, 200, 500, ...  A transfer lands
+ * in the first column whose bound it does not exceed; the last column
+ * absorbs everything slower.  Negative input is treated as 0.
+ *
+ * The conversion uses jiffies_to_msecs() rather than
+ * "jiffies * (1000 / HZ)", which evaluates to 0 when HZ > 1000 and is
+ * inexact when HZ does not divide 1000.  The parameter is not called
+ * "jiffies" so that it no longer shadows the global tick counter.
+ */
+static inline int stats_time_bucket(int elapsed)
+{
+	static const unsigned int mults[3] = { 1, 2, 5 };
+	unsigned int limit = BASE_HISTO_TIME;
+	unsigned int ms;
+	int bucket;
+
+	if (elapsed < 0)
+		elapsed = 0;
+	ms = jiffies_to_msecs(elapsed);
+
+	for (bucket = 0; bucket < CONFIG_HISTO_TIME_BUCKETS - 1; bucket++) {
+		if (ms <= limit * mults[bucket % 3])
+			return bucket;
+		/* After the x5 step move on to the next decade. */
+		if (bucket % 3 == 2)
+			limit *= 10;
+	}
+	return CONFIG_HISTO_TIME_BUCKETS - 1;
+}
+
+
+/**
+ * Account one completed transfer in the per-disk histograms.
+ *
+ * Requests without REQ_CMD set are ignored.  Otherwise the request
+ * histogram for the transfer direction is always incremented, and the
+ * dma histogram is incremented as well unless @jiffies_dma is negative.
+ *
+ * @disk: disk device
+ * @flags: request flags; REQ_RW set means write, clear means read
+ * @sectors: # sectors transferred
+ * @jiffies: time the whole request took, in jiffies
+ * @jiffies_dma: time the dma took, in jiffies, or negative if n/a
+ */
+void block_histogram_completion(struct gendisk *disk,
+				unsigned long flags,
+				int sectors,
+				int jiffies,
+				int jiffies_dma)
+{
+	int is_write;
+	int row;
+	int req_col;
+	int dma_col;
+
+	if (!(flags & REQ_CMD))
+		return;
+
+	is_write = (flags & REQ_RW) != 0;
+	row = stats_size_bucket(sectors);
+	req_col = stats_time_bucket(jiffies);
+
+	if (is_write) {
+		__disk_stat_add(disk, write_req_histo[row][req_col], 1);
+	} else {
+		__disk_stat_add(disk, read_req_histo[row][req_col], 1);
+	}
+
+	/* No dma timing available for this transfer. */
+	if (jiffies_dma < 0)
+		return;
+
+	dma_col = stats_time_bucket(jiffies_dma);
+	if (is_write) {
+		__disk_stat_add(disk, write_dma_histo[row][dma_col], 1);
+	} else {
+		__disk_stat_add(disk, read_dma_histo[row][dma_col], 1);
+	}
+}
+EXPORT_SYMBOL(block_histogram_completion);
+
+
+/**
+ * Helper: derive the elapsed times of a finished request and feed
+ * them to block_histogram_completion().
+ *
+ * A request whose io_start_time is 0 is treated as never having been
+ * time-stamped: its dma time is reported as -1, and a notice plus a
+ * stack dump is emitted for the first 20 such requests.
+ * NOTE(review): a stamp taken when jiffies happens to be 0 is
+ * indistinguishable from "no stamp" here.
+ *
+ * @disk: disk the request completed on
+ * @req:  the completed request; NULL is ignored
+ */
+void block_histogram_req_cmplt(struct gendisk *disk, struct request *req)
+{
+	static int warnings_left = 20;
+	int total_elapsed;
+	int dma_elapsed = -1;
+
+	if (!req)
+		return;
+
+	total_elapsed = jiffies - req->start_time;
+
+	if (req->io_start_time != 0) {
+		dma_elapsed = jiffies - req->io_start_time;
+	} else if (warnings_left > 0) {
+		printk( KERN_NOTICE
+			"%s: unexpected transfer w/out start time\n",
+			disk->disk_name);
+		dump_stack();
+		--warnings_left;
+	}
+
+	block_histogram_completion(disk, req->flags, req->nr_sectors,
+				   total_elapsed, dma_elapsed);
+}
+
+
+/**
+ * Tiny helper utility: append formatted output to a buffer and advance
+ * the caller's cursor, never writing or stepping past the end.
+ *
+ * @ptr: in/out, current write position
+ * @rem: in/out, bytes left at *ptr (including room for the NUL)
+ * @fmt: printf-style format, followed by its arguments
+ *
+ * vscnprintf() is used instead of vsnprintf() because the latter
+ * returns the length the output *would* have had; on truncation that
+ * would push *ptr beyond the buffer and make *rem negative.
+ */
+static inline void sysfs_out(char **ptr, ssize_t *rem, const char *fmt, ...)
+{
+	va_list ap;
+	int n;
+
+	/* Nothing left to write into. */
+	if (*rem <= 0)
+		return;
+
+	va_start(ap, fmt);
+	n = vscnprintf(*ptr, *rem, fmt, ap);
+	va_end(ap);
+
+	*ptr += n;
+	*rem -= n;
+}
+
+
+/**
+ * Format one histogram channel of a disk as a text table.
+ *
+ * The output is a key line, a header line with the column bounds in
+ * ms, then one line per size row starting with the row's byte label.
+ *
+ * @cmd:  READ selects the read tables, any other value the write tables
+ * @type: HISTO_REQUEST selects the request tables, any other value the
+ *        dma tables
+ * @disk: disk whose counters are reported
+ * @page: output buffer of PAGE_SIZE bytes
+ *
+ * Returns the number of bytes written, not counting the trailing NUL.
+ *
+ * Every write goes through scnprintf(), which returns what was really
+ * stored.  With snprintf() a truncated write would return the would-be
+ * length, sending the cursor past the page and turning the remaining
+ * size negative.  Output that does not fit is cut off instead.
+ */
+static int dump_histo (int cmd, int type, struct gendisk *disk, char* page)
+{
+	static const int mults[3] = {1,2,5};
+	char *out = page;
+	ssize_t rem = PAGE_SIZE;
+	int size = BASE_HISTO_SIZE * 512;
+	int row, col, step;
+	int ms;
+	int len;
+	int v;
+
+	/* Key */
+	len = scnprintf(out, rem, "rows = bytes columns = ms\n");
+	out += len; rem -= len;
+
+	/* Row header */
+	len = scnprintf(out, rem, " ");
+	out += len; rem -= len;
+
+	for (col = 0, ms = BASE_HISTO_TIME;
+	     col < CONFIG_HISTO_TIME_BUCKETS; ms *= 10) {
+		for (step = 0;
+		     step < 3 && col < CONFIG_HISTO_TIME_BUCKETS;
+		     ++step, ++col) {
+			len = scnprintf(out, rem, "\t%d", ms * mults[step]);
+			out += len; rem -= len;
+		}
+	}
+	len = scnprintf(out, rem, "\n");
+	out += len; rem -= len;
+
+	/* Payload */
+	for (row = 0; row < CONFIG_HISTO_SIZE_BUCKETS; row++, size *= 2) {
+		len = scnprintf(out, rem, "%7d", size);
+		out += len; rem -= len;
+
+		for (col = 0; col < CONFIG_HISTO_TIME_BUCKETS; col++) {
+			if (type == HISTO_REQUEST) {
+				v = (cmd == READ) ?
+				    disk_stat_read(disk, read_req_histo[row][col]) :
+				    disk_stat_read(disk, write_req_histo[row][col]);
+			} else {
+				v = (cmd == READ) ?
+				    disk_stat_read(disk, read_dma_histo[row][col]) :
+				    disk_stat_read(disk, write_dma_histo[row][col]);
+			}
+			len = scnprintf(out, rem, "\t%d", v);
+			out += len; rem -= len;
+		}
+
+		len = scnprintf(out, rem, "\n");
+		out += len; rem -= len;
+	}
+	return out - page;
+}
+
+
+/**
+ * sysfs show() methods for the four histogram channels.
+ *
+ * Each one fixes the direction (READ or WRITE) and the histogram kind
+ * (HISTO_REQUEST or HISTO_DMA) and hands the disk and output page to
+ * dump_histo(), returning whatever dump_histo() returns.
+ */
+static ssize_t disk_read_request_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(READ, HISTO_REQUEST, disk, page);
+}
+
+static ssize_t disk_read_dma_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(READ, HISTO_DMA, disk, page);
+}
+
+static ssize_t disk_write_request_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(WRITE, HISTO_REQUEST, disk, page);
+}
+
+static ssize_t disk_write_dma_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(WRITE, HISTO_DMA, disk, page);
+}
+
+#endif
diff -uprN linux-2.6.18.1/block/ll_rw_blk.c block_histogram/block/ll_rw_blk.c
--- linux-2.6.18.1/block/ll_rw_blk.c 2006-11-15 15:12:00.000000000 -0800
+++ block_histogram/block/ll_rw_blk.c 2006-11-15 18:08:07.000000000 -0800
@@ -3469,6 +3469,7 @@ void end_that_request_last(struct reques
__disk_stat_add(disk, ticks[rw], duration);
disk_round_stats(disk);
disk->in_flight--;
+ block_histogram_req_cmplt(disk, req);
}
if (req->end_io)
req->end_io(req, error);
diff -uprN linux-2.6.18.1/drivers/scsi/scsi_lib.c block_histogram/drivers/scsi/scsi_lib.c
--- linux-2.6.18.1/drivers/scsi/scsi_lib.c 2006-11-15 16:37:34.000000000 -0800
+++ block_histogram/drivers/scsi/scsi_lib.c 2006-11-15 18:08:07.000000000 -0800
@@ -1477,6 +1477,7 @@ static void scsi_request_fn(struct reque
/*
* Dispatch the command to the low-level driver.
*/
+ block_histogram_start_time(req);
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
if(rtn) {
diff -uprN linux-2.6.18.1/include/linux/blkdev.h block_histogram/include/linux/blkdev.h
--- linux-2.6.18.1/include/linux/blkdev.h 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/include/linux/blkdev.h 2006-11-15 18:08:07.000000000 -0800
@@ -154,7 +154,10 @@ struct request {
int rq_status; /* should split this into a few status bits */
int errors;
struct gendisk *rq_disk;
- unsigned long start_time;
+ unsigned long start_time; /* when request submitted */
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ unsigned long io_start_time; /* when passed to hardware */
+#endif

/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
@@ -840,5 +843,13 @@ void kblockd_flush(void);
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
MODULE_ALIAS("block-major-" __stringify(major) "-*")

+#ifdef CONFIG_BLOCK_HISTOGRAM
+/* Stamp the request with the current jiffies value as its io start time. */
+static inline void block_histogram_start_time(struct request *req)
+{
+	req->io_start_time = jiffies;
+}
+#else
+/*
+ * Histograms compiled out: an empty inline instead of an empty macro,
+ * so the argument is still type-checked in both configurations.
+ */
+static inline void block_histogram_start_time(struct request *req)
+{
+}
+#endif

#endif
diff -uprN linux-2.6.18.1/include/linux/genhd.h block_histogram/include/linux/genhd.h
--- linux-2.6.18.1/include/linux/genhd.h 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/include/linux/genhd.h 2006-11-15 18:08:07.000000000 -0800
@@ -11,6 +11,8 @@

#include <linux/types.h>

+struct request;
+
enum {
/* These three have identical behaviour; use the second one if DOS FDISK gets
confused about extended/logical partitions starting past cylinder 1023. */
@@ -89,6 +91,10 @@ struct hd_struct {
#define GENHD_FL_UP 16
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32

+#define BASE_HISTO_SIZE 4 /* smallest transfer size, sectors */
+#define BASE_HISTO_TIME 10 /* shortest transfer time, ms */
+
+
struct disk_stats {
unsigned long sectors[2]; /* READs and WRITEs */
unsigned long ios[2];
@@ -96,6 +102,12 @@ struct disk_stats {
unsigned long ticks[2];
unsigned long io_ticks;
unsigned long time_in_queue;
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ int read_req_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int read_dma_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int write_req_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int write_dma_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+#endif
};

struct gendisk {
@@ -230,6 +242,21 @@ extern struct gendisk *get_gendisk(dev_t
extern void set_device_ro(struct block_device *bdev, int flag);
extern void set_disk_ro(struct gendisk *disk, int flag);

+/*
+ * Block I/O histogram entry points.  When CONFIG_BLOCK_HISTOGRAM is
+ * off they collapse to empty inline functions, so call sites need no
+ * #ifdef and their arguments are still type-checked.
+ */
+#ifdef CONFIG_BLOCK_HISTOGRAM
+extern void block_histogram_init(struct gendisk *disk);
+extern void block_histogram_completion(struct gendisk *disk,
+				       unsigned long flags,
+				       int sectors,
+				       int jiffies,
+				       int jiffies_dma);
+extern void block_histogram_req_cmplt(struct gendisk *disk, struct request *);
+#else
+static inline void block_histogram_init(struct gendisk *disk)
+{
+}
+
+static inline void block_histogram_completion(struct gendisk *disk,
+					      unsigned long flags,
+					      int sectors,
+					      int elapsed,
+					      int elapsed_dma)
+{
+}
+
+static inline void block_histogram_req_cmplt(struct gendisk *disk,
+					     struct request *req)
+{
+}
+#endif
+
+
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);


Attachments:
block_histogram.patch (14.28 kB)

2006-11-16 06:59:21

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] Introduce block I/O performance histograms

On Wed, Nov 15 2006, Edward Falk wrote:
> This patch introduces performance histogram record keeping for block
> I/O, used for performance tuning. It is turned off by default.
>
> When turned on, you simply do something like:
>
> # cat /sys/block/sda/read_request_histo
> rows = bytes columns = ms
> 10 20 50 100 200 500 1000 2000
> 2048 5 0 0 0 0 0 0 0
> 4096 0 0 0 0 0 0 0 0
> 8192 17231 135 41 10 0 0 0 0
> 16384 4400 24 6 2 0 0 0 0
> 32768 2897 34 4 4 0 0 0 0
> 65536 7089 87 5 1 2 0 0 0

I don't see the point at all for including this piece of code in the
kernel. You can do the same from user space. Your help entry said it
even grows the kernel size about 21k, that's pretty nasty.

So NAK.

--
Jens Axboe

2006-11-16 20:04:34

by Edward Falk

[permalink] [raw]
Subject: Re: [PATCH] Introduce block I/O performance histograms

Jens Axboe wrote:

> I don't see the point at all for including this piece of code in the
> kernel. You can do the same from user space. Your help entry said it
> even grows the kernel size about 21k, that's pretty nasty.

How would you do this from user space? Also, the 21k increase is only
in effect if the feature is turned on in the config, and it's off by
default.

-ed falk

2006-11-16 20:22:29

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH] Introduce block I/O performance histograms

On Thu, Nov 16 2006, Edward Falk wrote:
> Jens Axboe wrote:
>
> >I don't see the point at all for including this piece of code in the
> >kernel. You can do the same from user space. Your help entry said it
> >even grows the kernel size about 21k, that's pretty nasty.
>
> How would you do this from user space? Also, the 21k increase is only
> in effect if the feature is turned on in the config, and it's off by
> default.

You can use blktrace to do it. I realize that the 21k is only if it's
turned on, it's still quite a huge bit (why does it turn out so big?).
The source is always there to maintain, though. The main point is that
you don't have to do this in the kernel, though.

--
Jens Axboe

2006-11-17 00:05:15

by Greg KH

[permalink] [raw]
Subject: Re: [PATCH] Introduce block I/O performance histograms

On Wed, Nov 15, 2006 at 07:15:52PM -0800, Edward Falk wrote:
> This patch introduces performance histogram record keeping for block
> I/O, used for performance tuning. It is turned off by default.
>
> When turned on, you simply do something like:
>
> # cat /sys/block/sda/read_request_histo
> rows = bytes columns = ms
> 10 20 50 100 200 500 1000 2000
> 2048 5 0 0 0 0 0 0 0
> 4096 0 0 0 0 0 0 0 0
> 8192 17231 135 41 10 0 0 0 0
> 16384 4400 24 6 2 0 0 0 0
> 32768 2897 34 4 4 0 0 0 0
> 65536 7089 87 5 1 2 0 0 0

Eeek, no, sysfs is for ONE VALUE PER FILE please.

Yes, I know there are some other sysfs files that do not follow this
rule, and we are working on fixing some of them, but please, don't add
new ones that do violate this rule.

thanks,

greg k-h