2002-09-12 02:06:13

by Rick Lindsley

[permalink] [raw]
Subject: [RFC][PATCH] sard changes for 2.5.34

Here's a patch to put sard changes similar to those in 2.4 into 2.5. I
say "similar" because the I/O subsystem has changed sufficiently in 2.5
that making them exactly the same might be more effort that it's
worth. Still, we do record per-partition reads/writes/sectors, and
per-disk stats on the same plus queue time and number of I/O merges.
Once applied, "cat /proc/diskstats" outputs this information.

Because in 2.5.34, gendisk->part[0] no longer is an hd_struct that
refers to the whole disk, there wasn't a convenient place to record
this information. I gratuitously added an hd_struct to gendisk to have
a place to store the information, below, but that's distasteful and
ugly. I'd like to move it to a different place.

Also, with this patch, we are collecting stats twice, once for these
stats and once for /proc/stat (kstat). That's stupid and I'd like to
get the stats only once and use them, perhaps, in two places.

What follows works, but needs refinements. Comments welcome.

Rick

diff -ru linux-2.5.34/drivers/block/genhd.c clnstat-2.5.34/drivers/block/genhd.c
--- linux-2.5.34/drivers/block/genhd.c Mon Sep 9 10:35:11 2002
+++ clnstat-2.5.34/drivers/block/genhd.c Wed Sep 11 17:59:51 2002
@@ -57,6 +57,7 @@
memset(p, 0, size);
}
gp->part = p;
+ memset(&gp->whole_disk, 0, sizeof(struct hd_struct));

write_lock(&gendisk_lock);

@@ -184,7 +185,116 @@
stop: part_stop,
show: show_partition
};
-#endif
+
+/* iterator */
+static void *diskstats_start(struct seq_file *part, loff_t *pos)
+{
+ loff_t k = *pos;
+ struct gendisk *sgp;
+
+ read_lock(&gendisk_lock);
+ for (sgp = gendisk_head; sgp; sgp = sgp->next) {
+ if (!k--)
+ return sgp;
+ }
+ return NULL;
+}
+
+static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
+{
+ ++*pos;
+ return ((struct gendisk *)v)->next;
+}
+
+static void diskstats_stop(struct seq_file *part, void *v)
+{
+ read_unlock(&gendisk_lock);
+}
+
+#define MSEC(x) ((x) * 1000 / HZ)
+
+void dump_gendisk(struct seq_file *s)
+{
+ struct gendisk *gp = NULL;
+
+ read_lock(&gendisk_lock);
+ for (gp = gendisk_head; gp; gp = gp->next) {
+ seq_printf(s,
+ "%8lx: major=%d, part=%8lx, first_minor=%d, minor_shift=%d\n",
+ (unsigned long) gp,
+ (int) gp->major,
+ (unsigned long) gp->part,
+ (int) gp->first_minor,
+ (int) gp->minor_shift);
+ }
+ read_unlock(&gendisk_lock);
+}
+
+static int diskstats_show(struct seq_file *s, void *v)
+{
+ struct gendisk *gp = v;
+ char buf[64];
+ int n;
+ struct hd_struct *hd;
+
+ if (gp == gendisk_head) {
+ seq_puts(s, " major minor #blocks name"
+ " rio rmerge rsect ruse wio"
+ " wmerge wsect wuse running use"
+ " aveq"
+ "\n\n");
+ }
+
+ /* show the full disk ... */
+ hd = &gp->whole_disk;
+ if (hd) {
+ disk_round_stats(hd);
+ seq_printf(s, "%8x %4d %4d %10ld %-5s "
+ "%8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ (unsigned int) hd,
+ gp->major, gp->first_minor,
+ hd->nr_sects >> 1,
+ disk_name(gp, 0, buf),
+ hd->rd_ios, hd->rd_merges,
+ hd->rd_sectors, MSEC(hd->rd_ticks),
+ hd->wr_ios, hd->wr_merges,
+ hd->wr_sectors, MSEC(hd->wr_ticks),
+ hd->ios_in_flight, MSEC(hd->io_ticks),
+ MSEC(hd->aveq));
+ }
+
+ /* ... and all non-0 size partitions of it */
+ for (n = 0; n < (1 << gp->minor_shift)-1; n++) {
+ if (gp->part[n].nr_sects) {
+ hd = &gp->part[n];
+ disk_round_stats(hd);
+ seq_printf(s, "%8x %4d %4d %10ld %-5s "
+ "%8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n",
+ (unsigned int) hd,
+ gp->major, n + gp->first_minor + 1,
+ hd->nr_sects >> 1,
+ disk_name(gp, n + 1, buf),
+ hd->rd_ios, hd->rd_merges,
+ hd->rd_sectors, MSEC(hd->rd_ticks),
+ hd->wr_ios, hd->wr_merges,
+ hd->wr_sectors, MSEC(hd->wr_ticks),
+ hd->ios_in_flight, MSEC(hd->io_ticks),
+ MSEC(hd->aveq));
+ }
+ }
+ return 0;
+}
+
+#undef MSEC
+
+struct seq_operations diskstats_op = {
+ start: diskstats_start,
+ next: diskstats_next,
+ stop: diskstats_stop,
+ show: diskstats_show
+};
+
+#endif /* CONFIG_PROC_FS */


extern int blk_dev_init(void);
diff -ru linux-2.5.34/drivers/block/ll_rw_blk.c clnstat-2.5.34/drivers/block/ll_rw_blk.c
--- linux-2.5.34/drivers/block/ll_rw_blk.c Mon Sep 9 10:35:02 2002
+++ clnstat-2.5.34/drivers/block/ll_rw_blk.c Wed Sep 11 18:24:25 2002
@@ -1329,6 +1329,164 @@
}

/*
+ * Return an hd_struct on which to do IO accounting for a given request.
+ */
+static struct hd_struct *locate_hd_struct(struct request *req)
+{
+ struct gendisk *gd;
+ int partition;
+ struct hd_struct *hd = NULL;
+
+ gd = get_gendisk(req->rq_dev);
+ if (gd) {
+ /*
+ * need to get *just* the partition number here. minor() gives
+ * us both the minor number and the partition number
+ * combined. Note that starting in 2.5.34, gd->part[0] is the
+ * first partition (partition 1), not the whole disk.
+ */
+ partition =
+ (minor(req->rq_dev) & ((1 << gd->minor_shift) - 1)) - 1;
+ if (partition < 0) {
+ hd = &gd->whole_disk;
+ } else {
+ hd = &gd->part[partition];
+ }
+ }
+ return hd;
+}
+
+/*
+ * Round off the performance stats on an hd_struct.
+ *
+ * The average IO queue length and utilisation statistics are maintained
+ * by observing the current state of the queue length and the amount of
+ * time it has been in this state for.
+ * Normally, that accounting is done on IO completion, but that can result
+ * in more than a second's worth of IO being accounted for within any one
+ * second, leading to >100% utilisation. To deal with that, we do a
+ * round-off before returning the results when reading /proc/diskstats,
+ * accounting immediately for all queue usage up to the current jiffies and
+ * restarting the counters again.
+ */
+void disk_round_stats(struct hd_struct *hd)
+{
+ unsigned long now = jiffies;
+
+ hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change));
+ hd->last_queue_change = now;
+
+ if (hd->ios_in_flight)
+ hd->io_ticks += (now - hd->last_idle_time);
+ hd->last_idle_time = now;
+}
+
+static inline void down_ios(struct hd_struct *hd)
+{
+ disk_round_stats(hd);
+ --hd->ios_in_flight;
+}
+
+static inline void up_ios(struct hd_struct *hd)
+{
+ disk_round_stats(hd);
+ ++hd->ios_in_flight;
+}
+
+static void collect_part_stats(struct bio *bio)
+{
+ struct hd_struct *hd = NULL;
+ struct gendisk *gd;
+ kdev_t kdev;
+
+ kdev = to_kdev_t(bio->bi_bdev->bd_dev);
+ gd = get_gendisk(kdev);
+ if (gd) {
+ hd= &gd->part[(minor(kdev) & ((1 << gd->minor_shift) - 1)) - 1];
+ }
+
+ switch (bio->bi_rw) {
+ case READ:
+ if (hd) {
+ hd->rd_sectors += bio_sectors(bio);
+ hd->rd_ios++;
+ }
+ break;
+ case WRITE:
+ if (hd) {
+ hd->wr_sectors += bio_sectors(bio);
+ hd->wr_ios++;
+ }
+ break;
+ }
+}
+
+static void account_io_start(struct hd_struct *hd, struct request *req,
+ int merge, int sectors)
+{
+ switch (rq_data_dir(req)) {
+ case WRITE:
+ if (merge)
+ hd->wr_merges++;
+ hd->wr_sectors += sectors;
+ break;
+ case READ:
+ if (merge)
+ hd->rd_merges++;
+ hd->rd_sectors += sectors;
+ break;
+ }
+ if (!merge)
+ up_ios(hd);
+}
+
+static void account_io_end(struct hd_struct *hd, struct request *req, int tag)
+{
+ unsigned long duration = jiffies - req->start_time;
+
+ switch (rq_data_dir(req)) {
+ case WRITE:
+ hd->wr_ticks += duration;
+ hd->wr_ios++;
+ break;
+ case READ:
+ hd->rd_ticks += duration;
+ hd->rd_ios++;
+ break;
+ }
+
+ down_ios(hd);
+}
+
+void req_new_io(struct request *req, int merge, int sectors)
+{
+ struct hd_struct *hd;
+
+ hd = locate_hd_struct(req);
+ if (hd)
+ account_io_start(hd, req, merge, sectors);
+}
+
+void req_merged_io(struct request *req)
+{
+ struct hd_struct *hd;
+
+ hd = locate_hd_struct(req);
+ if (hd)
+ down_ios(hd);
+}
+
+void req_finished_io(struct request *req, int tag)
+{
+ struct hd_struct *hd;
+
+ hd = locate_hd_struct(req);
+ if (hd)
+ account_io_end(hd, req, tag);
+}
+EXPORT_SYMBOL(req_finished_io);
+
+/*
* add-request adds a request to the linked list.
* queue lock is held and interrupts disabled, as we muck with the
* request queue list.
@@ -1408,6 +1566,13 @@

elv_merge_requests(q, req, next);

+ /*
+ * One last thing: we have removed a request, so we now
+ * have one less expected IO to complete for accounting
+ * purposes.
+ */
+ req_merged_io(req);
+
blkdev_dequeue_request(next);
blk_put_request(next);
}
@@ -1510,6 +1675,7 @@
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
elv_merge_cleanup(q, req, nr_sectors);
drive_stat_acct(req, nr_sectors, 0);
+ req_new_io(req, 1, nr_sectors);
attempt_back_merge(q, req);
goto out;

@@ -1534,6 +1700,7 @@
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
elv_merge_cleanup(q, req, nr_sectors);
drive_stat_acct(req, nr_sectors, 0);
+ req_new_io(req, 1, nr_sectors);
attempt_front_merge(q, req);
goto out;

@@ -1602,6 +1769,8 @@
req->waiting = NULL;
req->bio = req->biotail = bio;
req->rq_dev = to_kdev_t(bio->bi_bdev->bd_dev);
+ req->start_time = jiffies;
+ req_new_io(req, 0, nr_sectors);
add_request(q, req, insert_here);
out:
if (freereq)
@@ -1706,6 +1875,12 @@
BUG_ON(bio_sectors(bio) > q->max_sectors);

/*
+ * collect per-partition i/o stats before we lose track
+ * of which partition this was.
+ */
+ collect_part_stats(bio);
+
+ /*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
*/
@@ -2040,6 +2215,7 @@
if (req->waiting)
complete(req->waiting);

+ req_finished_io(req,0);
blk_put_request(req);
}

diff -ru linux-2.5.34/fs/proc/proc_misc.c clnstat-2.5.34/fs/proc/proc_misc.c
--- linux-2.5.34/fs/proc/proc_misc.c Mon Sep 9 10:35:03 2002
+++ clnstat-2.5.34/fs/proc/proc_misc.c Mon Sep 9 19:03:41 2002
@@ -261,6 +261,18 @@
release: seq_release,
};

+extern struct seq_operations diskstats_op;
+static int diskstats_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &diskstats_op);
+}
+static struct file_operations proc_diskstats_operations = {
+ open: diskstats_open,
+ read: seq_read,
+ llseek: seq_lseek,
+ release: seq_release,
+};
+
#ifdef CONFIG_MODULES
extern struct seq_operations modules_op;
static int modules_open(struct inode *inode, struct file *file)
@@ -624,6 +636,7 @@
create_seq_entry("partitions", 0, &proc_partitions_operations);
create_seq_entry("interrupts", 0, &proc_interrupts_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+ create_seq_entry("diskstats", 0, &proc_diskstats_operations);
#ifdef CONFIG_MODULES
create_seq_entry("modules", 0, &proc_modules_operations);
create_seq_entry("ksyms", 0, &proc_ksyms_operations);
diff -ru linux-2.5.34/include/linux/blkdev.h clnstat-2.5.34/include/linux/blkdev.h
--- linux-2.5.34/include/linux/blkdev.h Mon Sep 9 10:35:05 2002
+++ clnstat-2.5.34/include/linux/blkdev.h Mon Sep 9 19:03:41 2002
@@ -36,6 +36,7 @@
kdev_t rq_dev;
int errors;
sector_t sector;
+ unsigned long start_time;
unsigned long nr_sectors;
unsigned long hard_sector; /* the hard_* are block layer
* internals, no driver should
diff -ru linux-2.5.34/include/linux/genhd.h clnstat-2.5.34/include/linux/genhd.h
--- linux-2.5.34/include/linux/genhd.h Mon Sep 9 10:35:03 2002
+++ clnstat-2.5.34/include/linux/genhd.h Wed Sep 11 18:01:10 2002
@@ -63,6 +63,21 @@
unsigned long nr_sects;
devfs_handle_t de; /* primary (master) devfs entry */
struct device hd_driverfs_dev; /* support driverfs hiearchy */
+ /* Performance stats: */
+ unsigned int ios_in_flight;
+ unsigned int io_ticks;
+ unsigned int last_idle_time;
+ unsigned int last_queue_change;
+ unsigned int aveq;
+
+ unsigned int rd_ios;
+ unsigned int rd_merges;
+ unsigned int rd_ticks;
+ unsigned int rd_sectors;
+ unsigned int wr_ios;
+ unsigned int wr_merges;
+ unsigned int wr_ticks;
+ unsigned int wr_sectors;
};

#define GENHD_FL_REMOVABLE 1
@@ -76,7 +91,8 @@
int minor_shift; /* number of times minor is shifted to
get real minor */

- struct hd_struct *part; /* [indexed by minor] */
+ struct hd_struct *part; /* [indexed by minor-1] */
+ struct hd_struct whole_disk;
struct gendisk *next;
struct block_device_operations *fops;
sector_t capacity;
@@ -265,6 +281,12 @@
return g ? (minor(dev) >> g->minor_shift) : 0;
}

+struct request;
+extern void disk_round_stats(struct hd_struct *hd);
+extern void req_new_io(struct request *req, int merge, int sectors);
+extern void req_merged_io(struct request *req);
+extern void req_finished_io(struct request *req, int tag);
+
#endif

#endif


2002-09-12 02:22:25

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Rick Lindsley wrote:
>
> Here's a patch to put sard changes similar to those in 2.4 into 2.5. I
> say "similar" because the I/O subsystem has changed sufficiently in 2.5
> that making them exactly the same might be more effort that it's
> worth. Still, we do record per-partition reads/writes/sectors, and
> per-disk stats on the same plus queue time and number of I/O merges.
> Once applied, "cat /proc/diskstats" outputs this information.

It would be very nice to get better disk accounting into the kernel.

> Because in 2.5.34, gendisk->part[0] no longer is an hd_struct that
> refers to the whole disk, there wasn't a convenient place to record
> this information. I gratuitously added an hd_struct to gendisk to have
> a place to store the information, below, but that's distasteful and
> ugly. I'd like to move it to a different place.
>
> Also, with this patch, we are collecting stats twice, once for these
> stats and once for /proc/stat (kstat). That's stupid and I'd like to
> get the stats only once and use them, perhaps, in two places.

kstat should be a lighter-weight per-cpu thing. But the current
disk accounting in there would make it 12 kilobytes per CPU.

My vote: remove the disk accounting from kernel_stat and use this.

> What follows works, but needs refinements. Comments welcome.

What are those refinements?

What userspace tools are available for interpreting this information?

2002-09-12 02:45:45

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On Wed, 11 Sep 2002, Andrew Morton wrote:

> My vote: remove the disk accounting from kernel_stat and use this.

Whatever you decide on, I'll fix up procps to support it.

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

Spamtraps of the month: [email protected] [email protected]org

2002-09-12 06:35:50

by Rick Lindsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

kstat should be a lighter-weight per-cpu thing. But the current
disk accounting in there would make it 12 kilobytes per CPU.

My vote: remove the disk accounting from kernel_stat and use this.

I have a patch from another contributor that takes disk stats out of
kstat and puts them into their own global structure. I'll give that
some attention.

> What follows works, but needs refinements. Comments welcome.

What are those refinements?

A couple I mentioned in my message: double collection of stats, and an
ugly hd_struct added to gendisk. In addition, we should remove the
restriction on which and how many disks are reported on.

Lastly, a bit of a philosophical question. /proc/stat and (with this
patch) /proc/diskstats provide some of the same information. Should

a) all of it appear in /proc/stat?

b) all of it appear in /proc/diskstats?

c) keep the current (limited) info in /proc/stat (for backward
compatibility) and introduce the expanded info in
/proc/diskstats?

My preference is b, but I'm open to other opinions.

What userspace tools are available for interpreting this
information?

None that I'm aware of, although /proc/diskstats is formatted in a
program-friendly way. Sample output (warning: wide):

major minor #blocks name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq

f7d1e414 8 0 0 sda 1657 1403 46534 17391 3633 3974 61480 179110 0 34876 196501
f7c18000 8 1 2562367 sda1 1609 0 41522 0 5285 0 42280 0 0 0 0
f7c1810c 8 2 1 sda2 0 0 0 0 0 0 0 0 0 0 0
f7c18324 8 4 56196 sda4 96 0 768 0 0 0 0 0 0 0 0
f7c18430 8 5 1052226 sda5 16 0 128 0 0 0 0 0 0 0 0
f7c1853c 8 6 1052226 sda6 379 0 2802 0 2380 0 19032 0 0 0 0
f7c18648 8 7 13044748 sda7 946 0 1202 0 18 0 168 0 0 0 0

Just realized as I printed this that the first field is leftover
debugging, and should be removed!

Rick

2002-09-12 07:00:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Rick Lindsley wrote:
>
> kstat should be a lighter-weight per-cpu thing. But the current
> disk accounting in there would make it 12 kilobytes per CPU.
>
> My vote: remove the disk accounting from kernel_stat and use this.
>
> I have a patch from another contributor that takes disk stats out of
> kstat and puts them into their own global structure. I'll give that
> some attention.

OK, that's a start. I think there was some work done on making
kernel_stat percpu as well.

Cleaning up and speeding up kernel_stat is a spearate exercise
of course, but as we need to change userspace we may as well roll
it all up together.

> > What follows works, but needs refinements. Comments welcome.
>
> What are those refinements?
>
> A couple I mentioned in my message: double collection of stats, and an
> ugly hd_struct added to gendisk. In addition, we should remove the
> restriction on which and how many disks are reported on.
>
> Lastly, a bit of a philosophical question. /proc/stat and (with this
> patch) /proc/diskstats provide some of the same information. Should
>
> a) all of it appear in /proc/stat?
>
> b) all of it appear in /proc/diskstats?
>
> c) keep the current (limited) info in /proc/stat (for backward
> compatibility) and introduce the expanded info in
> /proc/diskstats?
>
> My preference is b, but I'm open to other opinions.

b). Let's get the kernel right and change userspace to follow. We have
another accounting patch which breaks top(1), so Rik has fixed it (and
is feeding the fixes upstream).

> What userspace tools are available for interpreting this
> information?
>
> None that I'm aware of, although /proc/diskstats is formatted in a
> program-friendly way. Sample output (warning: wide):

Does it work with the utilities at http://linux.inet.hr/? What is the
relationship with the 2.4 sard work? (I've never used sard, so words
of one syllable please ;))

If we can get this work playing nicely with those existing sard tools,
get the kernel_stat stuff cleaned up and get the relevant userspace
tools working and merged upstream then we have a neat bundle to submit.

2002-09-12 09:13:43

by Rick Lindsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

OK, that's a start. I think there was some work done on making
kernel_stat percpu as well.

Yes there's work on a couple of different fronts there. There is work
to specifically make disk stats per cpu (actually, I have some 2.4
patches already I could port), and there is a more general interface
(statctr_t) which Dipankar Sarma ([email protected]) is working on
for 2.5 for stat counters in general which generalizes the per-cpu
concept.

Regardless of which route we go, can you suggest a good exercise to
demonstrate the advantage of per-cpu counters? It seems intuitive to
me, but I'm much more comfortable when I have numbers to back me up.

Cleaning up and speeding up kernel_stat is a separate exercise of
course, but as we need to change userspace we may as well roll it
all up together.

That would be great ... but I want to be sure we don't take so long
working on the polish that we miss 10/31 with the main event. I can
spend a few days incorporating all of these things and repost, if you
don't think it makes it "too many changes at one time."

> a) all of it appear in /proc/stat?
>
> b) all of it appear in /proc/diskstats?
>
> c) keep the current (limited) info in /proc/stat (for backward
> compatibility) and introduce the expanded info in
> /proc/diskstats?

b). Let's get the kernel right and change userspace to follow. We
have another accounting patch which breaks top(1), so Rik has fixed
it (and is feeding the fixes upstream).

Easily done.

Does it work with the utilities at http://linux.inet.hr/? What is
the relationship with the 2.4 sard work? (I've never used sard, so
words of one syllable please ;))

The utilities won't without some minor changes. Those tools assume this
information appears in /proc/partitions, and that's a bad place for it
to appear. The tools would have to be updated to utilize
/proc/diskstats. Beyond that, the format is similar and the content is
similar. My guesstimate with a 3 minute look at iostat is it's less
than a day to modify the tools; possibly less than an hour.

This is a direct derivative of the 2.4 work. The work of collecting it
appears in the 2.4 tree already; the work of extracting it does not,
because Marcello has asked (smartly) that we do it first in 2.5 and then
backport to 2.4, so that the final interface is the same in each.

My understanding is that these are referred to as the "sard changes"
only because they provide the sort of information "sar -d" provided in
some unixes. There isn't a sar daemon running somewhere producing or
consuming this information.

The numbers in diskstats only increase. A tool like iostat can collect
the information from diskstats and do interesting things with it, like
provide just the delta from snapshot 1 and snapshot 2. (and 3, and 4,
and ..) and thus provide info like "# of write io's per 5 seconds".

If we can get this work playing nicely with those existing sard
tools, get the kernel_stat stuff cleaned up and get the relevant
userspace tools working and merged upstream then we have a neat
bundle to submit.

I think this is very doable.

Rick

2002-09-12 09:56:49

by Alexander Viro

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34



On Thu, 12 Sep 2002, Rick Lindsley wrote:

> OK, that's a start. I think there was some work done on making
> kernel_stat percpu as well.
>
> Yes there's work on a couple of different fronts there. There is work
> to specifically make disk stats per cpu (actually, I have some 2.4
> patches already I could port), and there is a more general interface
> (statctr_t) which Dipankar Sarma ([email protected]) is working on
> for 2.5 for stat counters in general which generalizes the per-cpu
> concept.

OK. First of all, I have serious problems with collecting all these
stats to hd_struct. Reasons:
a) _all_ stats you collect by request will be for entire disk.
By the time when request had been created, we simply don't remember
which partition it used to be. Pretending that they are per-partition
only obfuscates the things. Stats by bio are per-partition, but... see below
b) hd_struct had been completely removed from drivers (they neither
know nor care about it) and it will be completely gone RSN. There will be
a replacement, obviously, but it's not even certain that it will be a
single array.

> That would be great ... but I want to be sure we don't take so long
> working on the polish that we miss 10/31 with the main event. I can
> spend a few days incorporating all of these things and repost, if you
> don't think it makes it "too many changes at one time."

It might. There is a big rewrite of that area going on right now.
In particular, we are getting to the point when _all_ block devices are
going to acquire gendisks - partitioned or not. Next step after that
will be sane refcounting for gendisks and once that is done we are getting
rid of get_gendisk() for good - pointer will be cached in struct block_device
while it's open _and_ it will replace ->rq_dev in struct request.

These changes alone can cause a lot of fun, and then there is
reorganization of struct gendisk itself...

2002-09-12 10:29:21

by Rick Lindsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

OK. First of all, I have serious problems with collecting all these
stats to hd_struct.

I agree. They should be part of a disk statistics structure somewhere,
and that should probably be made part of this patch.

a) _all_ stats you collect by request will be for entire disk.
By the time when request had been created, we simply don't remember
which partition it used to be. Pretending that they are
per-partition only obfuscates the things. Stats by bio are
per-partition, but... see below

Yeah, I noticed :) The best we can do now, I think, is collect
per-partition information in generic_make_request() before the
partition remap. After that, the information is gone, and trying to
carry it along gets real ugly, especially when you remember that I/O
requests can get merged.

But it *is* still useful to know which partitions are getting hit up
with requests. If you've got some data that is continually getting
read/written to disk in a particular area, then (for instance) you
might want to make that data reside on a memdisk, if possible. Or
mirror it, if you're using a volume manager, to spread the I/O out over
multiple copies.

It might. There is a big rewrite of that area going on right
now. In particular, we are getting to the point when _all_
block devices are going to acquire gendisks - partitioned or not.
Next step after that will be sane refcounting for gendisks and once
that is done we are getting rid of get_gendisk() for good - pointer
will be cached in struct block_device while it's open _and_ it will
replace ->rq_dev in struct request.

In a development tree, everybody's got to adjust. I've no problem
keeping up the patches, or in consulting with the team making the
changes you described, to make these statistic changes stick and not
look like they were glued on at the last minute. The statistics are
valuable

a) at a gross level to sysadmins trying to balance their disk I/O
b) at a finer level by performance tuners, looking to maximize their
I/O rates

and they're worth the work.

Since 10/31 looms ever closer, I presume the changes you mentioned
will be rolling out in force in the next few weeks?

Rick

2002-09-12 16:11:25

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On Thu, 12 Sep 2002, Andrew Morton wrote:

> b). Let's get the kernel right and change userspace to follow. We have
> another accounting patch which breaks top(1), so Rik has fixed it (and
> is feeding the fixes upstream).

Speaking of which, what happened to the iowait patch ?
Did you merge it with Linus or did it fall between the cracks ?

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

Spamtraps of the month: [email protected] [email protected]

2002-09-12 16:27:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Rik van Riel wrote:
>
> On Thu, 12 Sep 2002, Andrew Morton wrote:
>
> > b). Let's get the kernel right and change userspace to follow. We have
> > another accounting patch which breaks top(1), so Rik has fixed it (and
> > is feeding the fixes upstream).
>
> Speaking of which, what happened to the iowait patch ?

I need to add a few words to Documentation/Changes and reintegrate it.

> Did you merge it with Linus or did it fall between the cracks ?

I don't have cracks ;) Not enough patches far that.

2002-09-12 18:37:07

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On 12 September 2002 05:20, Andrew Morton wrote:
> > Lastly, a bit of a philosophical question. /proc/stat and (with this
> > patch) /proc/diskstats provide some of the same information. Should
> >
> > a) all of it appear in /proc/stat?
> >
> > b) all of it appear in /proc/diskstats?
> >
> > c) keep the current (limited) info in /proc/stat (for backward
> > compatibility) and introduce the expanded info in
> > /proc/diskstats?
> >
> > My preference is b, but I'm open to other opinions.
>
> b). Let's get the kernel right and change userspace to follow. We have
> another accounting patch which breaks top(1), so Rik has fixed it (and
> is feeding the fixes upstream).

Erm... Some of /proc/stat data has no relations to disks, namely CPU counts.
It would be strange to have CPU stats in /proc/diskstats...
--
vda

2002-09-12 18:44:47

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Denis Vlasenko wrote:
>
> On 12 September 2002 05:20, Andrew Morton wrote:
> > > Lastly, a bit of a philosophical question. /proc/stat and (with this
> > > patch) /proc/diskstats provide some of the same information. Should
> > >
> > > a) all of it appear in /proc/stat?
> > >
> > > b) all of it appear in /proc/diskstats?
> > >
> > > c) keep the current (limited) info in /proc/stat (for backward
> > > compatibility) and introduce the expanded info in
> > > /proc/diskstats?
> > >
> > > My preference is b, but I'm open to other opinions.
> >
> > b). Let's get the kernel right and change userspace to follow. We have
> > another accounting patch which breaks top(1), so Rik has fixed it (and
> > is feeding the fixes upstream).
>
> Erm... Some of /proc/stat data has no relations to disks, namely CPU counts.
> It would be strange to have CPU stats in /proc/diskstats...

Rick is proposing that all the disk stuff be in /proc/diskstat and
that all the process stuff be in /proc/stat. ie: move all the disk
accounting out of /proc/stat.

2002-09-12 19:37:42

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Rick Lindsley wrote:
>
> OK, that's a start. I think there was some work done on making
> kernel_stat percpu as well.
>
> Yes there's work on a couple of different fronts there. There is work
> to specifically make disk stats per cpu (actually, I have some 2.4
> patches already I could port), and there is a more general interface
> (statctr_t) which Dipankar Sarma ([email protected]) is working on
> for 2.5 for stat counters in general which generalizes the per-cpu
> concept.
>
> Regardless of which route we go, can you suggest a good exercise to
> demonstrate the advantage of per-cpu counters? It seems intuitive to
> me, but I'm much more comfortable when I have numbers to back me up.

I don't think this is enough to justify a new subsystem like
statctr_t (struct statctr, please).

Looks like we can take the disk stats out of kernel_stat, move all
the vm-related things out of kernel_stat into struct page_state and
what's left of kernel_stat?

unsigned int per_cpu_user[NR_CPUS],
per_cpu_nice[NR_CPUS],
per_cpu_system[NR_CPUS];
unsigned int irqs[NR_CPUS][NR_IRQS];

And that's good, because "kernel statistics" was clearly too
broad a concept. The above is just one concept: interrupts and
scheduler things.

I'll pull the VM accounting out of there; when you have a
close-to-final patch for the disk stats we can give it a whizz
in the -mm patches and then get all the userspace tools working
nicely against that, OK?

I'm not sure that I want to add 14 more fields to /proc/meminfo.
So a new /proc/vmstat may appear. We would then have:

/proc/stat scheduler things
/proc/diskstat disk things
/proc/vmstat vm things

2002-09-12 19:45:48

by Rick Lindsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

Denis Vlasenko wrote:
Erm... Some of /proc/stat data has no relations to disks, namely CPU counts.
It would be strange to have CPU stats in /proc/diskstats...

and Andrew Morton responded:
Rick is proposing that all the disk stuff be in /proc/diskstat and
that all the process stuff be in /proc/stat. ie: move all the disk
accounting out of /proc/stat.

Right. diskstat or some other arbitrary name (I haven't been argued to
a different one yet :)

Rick

2002-09-12 20:34:00

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On Thu, 12 Sep 2002, Andrew Morton wrote:

> Looks like we can take the disk stats out of kernel_stat, move all
> the vm-related things out of kernel_stat into struct page_state and
> what's left of kernel_stat?
>
> unsigned int per_cpu_user[NR_CPUS],
> per_cpu_nice[NR_CPUS],
> per_cpu_system[NR_CPUS];

[ insert idle and iowait stats here ;) ]

> unsigned int irqs[NR_CPUS][NR_IRQS];
>
> And that's good, because "kernel statistics" was clearly too
> broad a concept. The above is just one concept: interrupts and
> scheduler things.

Absolutely agreed, this makes things much more manageable.

Btw, how about accounting for the number of syscalls made,
like some other Unix systems do ? ;)

> I'm not sure that I want to add 14 more fields to /proc/meminfo.
> So a new /proc/vmstat may appear. We would then have:
>
> /proc/stat scheduler things
> /proc/diskstat disk things
> /proc/vmstat vm things

Sounds fair, current procps doesn't support the new /proc/stat
fields anyway. Let me know what stuff will look like and I'll
get procps into gear.

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

Spamtraps of the month: [email protected] [email protected]

2002-09-13 11:29:44

by Dipankar Sarma

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

In article <[email protected]> Andrew Morton wrote:
> Rick Lindsley wrote:
>> Regardless of which route we go, can you suggest a good exercise to
>> demonstrate the advantage of per-cpu counters? It seems intuitive to
>> me, but I'm much more comfortable when I have numbers to back me up.

> I don't think this is enough to justify a new subsystem like
> statctr_t (struct statctr, please).

statctr_t was originally used to hide the fact that in UP kernels
it reduces to unsigned long. But I think that was not necessary -

#ifdef CONFIG_SMP
struct statctr {
unsigned long ctr;
};
#else
struct statctr {
unsigned long *ctr;
};
#endif

struct disk_stats {
....
....
struct statctr nr_reads;
};

I would presume that for UP kernels,

static inline void statctr_inc(struct statctr *stctr)
{
(stctr->ctr)++;
}

statctr_inc(&disk_stats.nr_reads) would generate the same code
as diskstats.nr_reads++; We will verify this and remove statctr_t.

I think justifying statctrs based on numbers from one single usage will
likely be difficult. Our measurements (e.g. net_device_stats and
cache event profiling with kernprof) showed that while cache misses
in those routines are reduced significantly, it isn't enough to make
an impact in the benchmark throughput. We need to adopt statctrs
in many subsystems. Perhaps that will show some benefits specially
on higher-end hardware (16CPU+).

Since we are beginning to run linux on such hardware, why not adopt a simple
framework now ?

Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-09-14 18:51:44

by Lev Makhlis

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

> +#define MSEC(x) ((x) * 1000 / HZ)

Perhaps it would be better to report the times in ticks using
jiffies_to_clock_t(), and let the userland do further conversions?
The macro above has an overflow problem, it creates a counter
that wraps at 2^32 / HZ (instead of 2^32), and theoretically, the
userland doesn't even know what the internal HZ is. The overflow
can be avoided with something like
#define MSEC(x) (((x) / HZ) * 1000 + ((x) % HZ) * 1000 / HZ)
but I think it would be cleaner just to change the units to ticks,
especially if we're moving it to a different file and procps will
need to be changed anyway.

2002-09-18 17:53:33

by Randy.Dunlap

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On Sat, 14 Sep 2002, Lev Makhlis wrote:

| > +#define MSEC(x) ((x) * 1000 / HZ)
|
| Perhaps it would be better to report the times in ticks using
| jiffies_to_clock_t(), and let the userland do further conversions?
| The macro above has an overflow problem, it creates a counter
| that wraps at 2^32 / HZ (instead of 2^32), and theoretically, the
| userland doesn't even know what the internal HZ is. The overflow
| can be avoided with something like
| #define MSEC(x) (((x) / HZ) * 1000 + ((x) % HZ) * 1000 / HZ)
| but I think it would be cleaner just to change the units to ticks,
| especially if we're moving it to a different file and procps will
| need to be changed anyway.

Thanks for pointing this out.

I'd rather not expose more ticks in /proc, so for now
I'll ask Rick to use this #define for MSEC, which does
indeed work nicely.

--
~Randy
"Linux is not a research project. Never was, never will be."
-- Linus, 2002-09-02

2002-09-18 19:33:10

by Lev Makhlis

[permalink] [raw]
Subject: Re: [RFC][PATCH] sard changes for 2.5.34

On Wednesday 18 September 2002 01:54 pm, Randy.Dunlap wrote:
> On Sat, 14 Sep 2002, Lev Makhlis wrote:
> | > +#define MSEC(x) ((x) * 1000 / HZ)
> |
> | Perhaps it would be better to report the times in ticks using
> | jiffies_to_clock_t(), and let the userland do further conversions?
> | The macro above has an overflow problem, it creates a counter
> | that wraps at 2^32 / HZ (instead of 2^32), and theoretically, the
> | userland doesn't even know what the internal HZ is. The overflow
> | can be avoided with something like
> | #define MSEC(x) (((x) / HZ) * 1000 + ((x) % HZ) * 1000 / HZ)
> | but I think it would be cleaner just to change the units to ticks,
> | especially if we're moving it to a different file and procps will
> | need to be changed anyway.
>
> Thanks for pointing this out.
>
> I'd rather not expose more ticks in /proc, so for now
> I'll ask Rick to use this #define for MSEC, which does
> indeed work nicely.

In that case, I also suggest manual optimization for "convenient"
values of HZ, because from what I've seen, GCC can't figure this
out on its own:

#if 1000 % HZ == 0
#define MSEC(x) ((x) * (1000 / HZ))
#elif HZ % 1000 == 0
#define MSEC(x) ((x) / (HZ / 1000))
#else
#define MSEC(x) (((x) / HZ) * 1000 + ((x) % HZ) * 1000 / HZ)
#endif