2002-08-06 14:05:23

by [email protected]

[permalink] [raw]
Subject: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

This patch against 2.4.20-pre1 converts /proc/partitions to the seq_file
interface as in 2.5, makes it report the sard-style extended disk
statistics condititional on CONFIG_BLK_STATS and disables the gathering
of those totally otherwise to not waste memory and processing power.

This patch is based on the concept of Kurt's statistics patch, although
it is implemented very differently to avoid #ifdef hell.


diff -uNr -Xdontdiff -p linux-2.4.20-pre1/Documentation/Configure.help linux/Documentation/Configure.help
--- linux-2.4.20-pre1/Documentation/Configure.help Tue Aug 6 11:31:46 2002
+++ linux/Documentation/Configure.help Tue Aug 6 14:04:50 2002
@@ -531,6 +531,17 @@ CONFIG_BLK_DEV_NBD

If unsure, say N.

+Per partition statistics in /proc/partitions
+CONFIG_BLK_STATS
+ If you say yes here, your kernel will keep statistical information
+ for every partition. The information includes things as numbers of
+ read and write accesses, thenumber of merged requests etc.
+
+ This is required for the full functionality of sar(8) and interesting
+ if you want to do performance tuning, by tweaking the elevator, e.g.
+
+ If unsure, say N.
+
ATA/IDE/MFM/RLL support
CONFIG_IDE
If you say Y here, your kernel will be able to manage low cost mass
diff -uNr -Xdontdiff -p linux-2.4.20-pre1/drivers/block/Config.in linux/drivers/block/Config.in
--- linux-2.4.20-pre1/drivers/block/Config.in Tue Aug 6 11:29:02 2002
+++ linux/drivers/block/Config.in Tue Aug 6 14:05:46 2002
@@ -48,4 +48,6 @@ if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CO
fi
dep_bool ' Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM

+bool 'Per partition statistics in /proc/partitions' CONFIG_BLK_STATS
+
endmenu
diff -uNr -Xdontdiff -p linux-2.4.20-pre1/drivers/block/genhd.c linux/drivers/block/genhd.c
--- linux-2.4.20-pre1/drivers/block/genhd.c Tue Aug 6 11:30:40 2002
+++ linux/drivers/block/genhd.c Tue Aug 6 15:13:41 2002
@@ -22,10 +22,9 @@
#include <linux/blk.h>
#include <linux/init.h>
#include <linux/spinlock.h>
+#include <linux/seq_file.h>


-static rwlock_t gendisk_lock;
-
/*
* Global kernel list of partitioning information.
*
@@ -34,6 +33,7 @@ static rwlock_t gendisk_lock;
*/
/*static*/ struct gendisk *gendisk_head;
static struct gendisk *gendisk_array[MAX_BLKDEV];
+static rwlock_t gendisk_lock = RW_LOCK_UNLOCKED;

EXPORT_SYMBOL(gendisk_head);

@@ -153,46 +153,83 @@ walk_gendisk(int (*walk)(struct gendisk
return error;
}

-
#ifdef CONFIG_PROC_FS
-int
-get_partition_list(char *page, char **start, off_t offset, int count)
+/* iterator */
+static void *part_start(struct seq_file *s, loff_t *pos)
{
struct gendisk *gp;
- struct hd_struct *hd;
- char buf[64];
- int len, n;

- len = sprintf(page, "major minor #blocks name\n\n");
-
read_lock(&gendisk_lock);
- for (gp = gendisk_head; gp; gp = gp->next) {
- for (n = 0; n < (gp->nr_real << gp->minor_shift); n++) {
- if (gp->part[n].nr_sects == 0)
- continue;
-
- hd = &gp->part[n]; disk_round_stats(hd);
- len += sprintf(page + len,
- "%4d %4d %10d %s\n", gp->major,
- n, gp->sizes[n], disk_name(gp, n, buf));
-
- if (len < offset)
- offset -= len, len = 0;
- else if (len >= offset + count)
- goto out;
- }
- }
+ for (gp = gendisk_head; gp; gp = gp->next)
+ if (!*pos--)
+ return gp;
+ return NULL;
+}

-out:
+static void *part_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ ++*pos;
+ return ((struct gendisk *)v)->next;
+}
+
+static void part_stop(struct seq_file *s, void *v)
+{
read_unlock(&gendisk_lock);
- *start = page + offset;
- len -= offset;
- if (len < 0)
- len = 0;
- return len > count ? count : len;
}
+
+static int part_show(struct seq_file *s, void *v)
+{
+ struct gendisk *gp = v;
+ char buf[64];
+ int n;
+
+ if (gp == gendisk_head) {
+ seq_puts(s, "major minor #blocks name"
+#ifdef CONFIG_BLK_STATS
+ " rio rmerge rsect ruse wio wmerge "
+ "wsect wuse running use aveq"
#endif
+ "\n\n");
+ }

+ /* show the full disk and all non-0 size partitions of it */
+ for (n = 0; n < (gp->nr_real << gp->minor_shift); n++) {
+ int mask = (1<<gp->minor_shift) - 1;
+
+ if (!(n & mask) || gp->part[n].nr_sects) {
+ struct hd_struct *hd = &gp->part[n];
+
+#ifdef CONFIG_BLK_STATS
+ disk_round_stats(hd);
+ seq_printf(s, "%4d %4d %10d %s "
+ "%d %d %d %d %d %d %d %d %d %d %d\n",
+ gp->major, n, gp->sizes[n],
+ disk_name(gp, n, buf),
+ hd->rd_ios, hd->rd_merges,
+#define MSEC(x) ((x) * 1000 / HZ)
+ hd->rd_sectors, MSEC(hd->rd_ticks),
+ hd->wr_ios, hd->wr_merges,
+ hd->wr_sectors, MSEC(hd->wr_ticks),
+ hd->ios_in_flight, MSEC(hd->io_ticks),
+ MSEC(hd->aveq));
+#else
+ seq_printf(s, "%4d %4d %10d %s\n",
+ gp->major, n, gp->sizes[n],
+ disk_name(gp, n, buf));
+#endif /* CONFIG_BLK_STATS */
+ }
+ }
+
+ return 0;
+}
+
+struct seq_operations partitions_op = {
+ .start = part_start,
+ .next = part_next,
+ .stop = part_stop,
+ .show = part_show,
+};
+#endif

extern int blk_dev_init(void);
extern int net_dev_init(void);
@@ -201,7 +238,6 @@ extern int atmdev_init(void);

int __init device_init(void)
{
- rwlock_init(&gendisk_lock);
blk_dev_init();
sti();
#ifdef CONFIG_NET
diff -uNr -Xdontdiff -p linux-2.4.20-pre1/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
--- linux-2.4.20-pre1/drivers/block/ll_rw_blk.c Tue Aug 6 11:27:59 2002
+++ linux/drivers/block/ll_rw_blk.c Tue Aug 6 15:13:53 2002
@@ -594,9 +594,14 @@ inline void drive_stat_acct (kdev_t dev,
printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}

-/* Return up to two hd_structs on which to do IO accounting for a given
- * request. On a partitioned device, we want to account both against
- * the partition and against the whole disk. */
+#ifdef CONFIG_BLK_STATS
+/*
+ * Return up to two hd_structs on which to do IO accounting for a given
+ * request.
+ *
+ * On a partitioned device, we want to account both against the partition
+ * and against the whole disk.
+ */
static void locate_hd_struct(struct request *req,
struct hd_struct **hd1,
struct hd_struct **hd2)
@@ -611,22 +616,26 @@ static void locate_hd_struct(struct requ
/* Mask out the partition bits: account for the entire disk */
int devnr = MINOR(req->rq_dev) >> gd->minor_shift;
int whole_minor = devnr << gd->minor_shift;
+
*hd1 = &gd->part[whole_minor];
if (whole_minor != MINOR(req->rq_dev))
*hd2= &gd->part[MINOR(req->rq_dev)];
}
}

-/* Round off the performance stats on an hd_struct. The average IO
- * queue length and utilisation statistics are maintained by observing
- * the current state of the queue length and the amount of time it has
- * been in this state for. Normally, that accounting is done on IO
- * completion, but that can result in more than a second's worth of IO
- * being accounted for within any one second, leading to >100%
- * utilisation. To deal with that, we do a round-off before returning
- * the results when reading /proc/partitions, accounting immediately for
- * all queue usage up to the current jiffies and restarting the counters
- * again. */
+/*
+ * Round off the performance stats on an hd_struct.
+ *
+ * The average IO queue length and utilisation statistics are maintained
+ * by observing the current state of the queue length and the amount of
+ * time it has been in this state for.
+ * Normally, that accounting is done on IO completion, but that can result
+ * in more than a second's worth of IO being accounted for within any one
+ * second, leading to >100% utilisation. To deal with that, we do a
+ * round-off before returning the results when reading /proc/partitions,
+ * accounting immediately for all queue usage up to the current jiffies and
+ * restarting the counters again.
+ */
void disk_round_stats(struct hd_struct *hd)
{
unsigned long now = jiffies;
@@ -639,7 +648,6 @@ void disk_round_stats(struct hd_struct *
hd->last_idle_time = now;
}

-
static inline void down_ios(struct hd_struct *hd)
{
disk_round_stats(hd);
@@ -690,6 +698,7 @@ static void account_io_end(struct hd_str
void req_new_io(struct request *req, int merge, int sectors)
{
struct hd_struct *hd1, *hd2;
+
locate_hd_struct(req, &hd1, &hd2);
if (hd1)
account_io_start(hd1, req, merge, sectors);
@@ -697,15 +706,29 @@ void req_new_io(struct request *req, int
account_io_start(hd2, req, merge, sectors);
}

+void req_merged_io(struct request *req)
+{
+ struct hd_struct *hd1, *hd2;
+
+ locate_hd_struct(req, &hd1, &hd2);
+ if (hd1)
+ down_ios(hd1);
+ if (hd2)
+ down_ios(hd2);
+}
+
void req_finished_io(struct request *req)
{
struct hd_struct *hd1, *hd2;
+
locate_hd_struct(req, &hd1, &hd2);
if (hd1)
account_io_end(hd1, req);
if (hd2)
account_io_end(hd2, req);
}
+EXPORT_SYMBOL(req_finished_io);
+#endif /* CONFIG_BLK_STATS */

/*
* add-request adds a request to the linked list.
@@ -764,7 +787,6 @@ static void attempt_merge(request_queue_
int max_segments)
{
struct request *next;
- struct hd_struct *hd1, *hd2;

next = blkdev_next_request(req);
if (req->sector + req->nr_sectors != next->sector)
@@ -791,12 +813,8 @@ static void attempt_merge(request_queue_

/* One last thing: we have removed a request, so we now have one
less expected IO to complete for accounting purposes. */
+ req_merged_io(req);

- locate_hd_struct(req, &hd1, &hd2);
- if (hd1)
- down_ios(hd1);
- if (hd2)
- down_ios(hd2);
blkdev_release_request(next);
}

@@ -1439,5 +1457,4 @@ EXPORT_SYMBOL(blk_queue_headactive);
EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);
-EXPORT_SYMBOL(req_finished_io);
EXPORT_SYMBOL(generic_unplug_device);
diff -uNr -Xdontdiff -p linux-2.4.20-pre1/fs/proc/proc_misc.c linux/fs/proc/proc_misc.c
--- linux-2.4.20-pre1/fs/proc/proc_misc.c Tue Aug 6 11:28:14 2002
+++ linux/fs/proc/proc_misc.c Tue Aug 6 13:42:21 2002
@@ -55,7 +55,6 @@ extern int get_stram_list(char *);
extern int get_module_list(char *);
#endif
extern int get_device_list(char *);
-extern int get_partition_list(char *, char **, off_t, int);
extern int get_filesystem_list(char *);
extern int get_exec_domain_list(char *);
extern int get_irq_list(char *);
@@ -254,6 +253,18 @@ static int stram_read_proc(char *page, c
}
#endif

+extern struct seq_operations partitions_op;
+static int partitions_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &partitions_op);
+}
+static struct file_operations proc_partitions_operations = {
+ open: partitions_open,
+ read: seq_read,
+ llseek: seq_lseek,
+ release: seq_release,
+};
+
#ifdef CONFIG_MODULES
static int modules_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
@@ -377,14 +388,6 @@ static int devices_read_proc(char *page,
return proc_calc_metrics(page, start, off, count, eof, len);
}

-static int partitions_read_proc(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len = get_partition_list(page, start, off, count);
- if (len < count) *eof = 1;
- return len;
-}
-
#if !defined(CONFIG_ARCH_S390)
static int interrupts_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
@@ -561,7 +564,6 @@ void __init proc_misc_init(void)
#endif
{"stat", kstat_read_proc},
{"devices", devices_read_proc},
- {"partitions", partitions_read_proc},
#if !defined(CONFIG_ARCH_S390)
{"interrupts", interrupts_read_proc},
#endif
@@ -588,6 +590,7 @@ void __init proc_misc_init(void)
if (entry)
entry->proc_fops = &proc_kmsg_operations;
create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
+ create_seq_entry("partitions", 0, &proc_partitions_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
#ifdef CONFIG_MODULES
create_seq_entry("ksyms", 0, &proc_ksyms_operations);
diff -uNr -Xdontdiff -p linux-2.4.20-pre1/include/linux/genhd.h linux/include/linux/genhd.h
--- linux-2.4.20-pre1/include/linux/genhd.h Tue Aug 6 11:28:19 2002
+++ linux/include/linux/genhd.h Tue Aug 6 15:14:13 2002
@@ -62,8 +62,8 @@ struct hd_struct {
unsigned long start_sect;
unsigned long nr_sects;
devfs_handle_t de; /* primary (master) devfs entry */
- int number; /* stupid old code wastes space */

+#ifdef CONFIG_BLK_STATS
/* Performance stats: */
unsigned int ios_in_flight;
unsigned int io_ticks;
@@ -79,6 +79,7 @@ struct hd_struct {
unsigned int wr_merges;
unsigned int wr_ticks;
unsigned int wr_sectors;
+#endif /* CONFIG_BLK_STATS */
};

#define GENHD_FL_REMOVABLE 1
@@ -259,18 +260,22 @@ struct unixware_disklabel {

char *disk_name (struct gendisk *hd, int minor, char *buf);

-/*
- * disk_round_stats is used to round off the IO statistics for a disk
- * for a complete clock tick.
- */
-void disk_round_stats(struct hd_struct *hd);
-
/*
* Account for the completion of an IO request (used by drivers which
* bypass the normal end_request processing)
*/
struct request;
-void req_finished_io(struct request *);
+
+#ifdef CONFIG_BLK_STATS
+extern void disk_round_stats(struct hd_struct *hd);
+extern void req_new_io(struct request *req, int merge, int sectors);
+extern void req_merged_io(struct request *req);
+extern void req_finished_io(struct request *req);
+#else
+static inline void req_new_io(struct request *req, int merge, int sectors) { }
+static inline void req_merged_io(struct request *req) { }
+static inline void req_finished_io(struct request *req) { }
+#endif /* CONFIG_BLK_STATS */

extern void devfs_register_partitions (struct gendisk *dev, int minor,
int unregister);


2002-08-07 20:59:02

by Kurt Garloff

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

Hi Christoph,

On Tue, Aug 06, 2002 at 04:08:48PM +0200, Christoph Hellwig wrote:
> This patch against 2.4.20-pre1 converts /proc/partitions to the seq_file
> interface as in 2.5, makes it report the sard-style extended disk
> statistics condititional on CONFIG_BLK_STATS and disables the gathering
> of those totally otherwise to not waste memory and processing power.
>
> This patch is based on the concept of Kurt's statistics patch, although
> it is implemented very differently to avoid #ifdef hell.

Actually, I was expecting criticism due to the ifdef stuff I did and was
prepared to come up with something better. But I also learned never to put
effort into something before being asked to ...

Your patch looks fine to me. Thanks for preparing it!

Regards,
--
Kurt Garloff <[email protected]> [Eindhoven, NL]
Physics: Plasma simulations <[email protected]> [TU Eindhoven, NL]
Linux: SCSI, Security <[email protected]> [SuSE Nuernberg, DE]
(See mail header or public key servers for PGP2 and GPG public keys.)


Attachments:
(No filename) (1.04 kB)
(No filename) (189.00 B)
Download all attachments

2002-08-07 21:15:17

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file


> On Tue, Aug 06, 2002 at 04:08:48PM +0200, Christoph Hellwig wrote:
> > This patch against 2.4.20-pre1 converts /proc/partitions to the seq_file
> > interface as in 2.5, makes it report the sard-style extended disk
> > statistics condititional on CONFIG_BLK_STATS and disables the gathering
> > of those totally otherwise to not waste memory and processing power.

But why in /proc/partitions ?
Maybe /proc/partitions can go away eventually with all info available
under driverfs or so. But for the time being, /proc/partitions is used,
and some changes are planned to make identification of the devices
involved easier.
It is really ugly to stuff a lot of garbage into a file just because
it happens to exist already. If you want disk statistics, why not
put it in /proc/diskstatistics?

Andries

2002-08-07 22:29:49

by [email protected]

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

On Wed, Aug 07, 2002 at 11:18:56PM +0200, Andries Brouwer wrote:
> But why in /proc/partitions ?
> Maybe /proc/partitions can go away eventually with all info available
> under driverfs or so. But for the time being, /proc/partitions is used,
> and some changes are planned to make identification of the devices
> involved easier.
> It is really ugly to stuff a lot of garbage into a file just because
> it happens to exist already. If you want disk statistics, why not
> put it in /proc/diskstatistics?

Because it's where existing tools expect it. I agree with you that we
want a nicer interface for 2.6, but give the userbase some time to
prepare for a new interface.

2002-08-07 22:56:00

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

On Thu, Aug 08, 2002 at 12:33:25AM +0200, Christoph Hellwig wrote:
> On Wed, Aug 07, 2002 at 11:18:56PM +0200, Andries Brouwer wrote:
> > But why in /proc/partitions ?
> > Maybe /proc/partitions can go away eventually with all info available
> > under driverfs or so. But for the time being, /proc/partitions is used,
> > and some changes are planned to make identification of the devices
> > involved easier.
> > It is really ugly to stuff a lot of garbage into a file just because
> > it happens to exist already. If you want disk statistics, why not
> > put it in /proc/diskstatistics?
>
> Because it's where existing tools expect it. I agree with you that we
> want a nicer interface for 2.6, but give the userbase some time to
> prepare for a new interface.

You create a mess in the official kernel because your user space tools
are broken? And it is easier to patch the kernel than to fix them,
even though you'll have to fix them eventually?
And fixing these tools consists of replacing one filename?

2002-08-07 23:09:42

by Randy.Dunlap

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

On Wed, 7 Aug 2002, Andries Brouwer wrote:

|
| > On Tue, Aug 06, 2002 at 04:08:48PM +0200, Christoph Hellwig wrote:
| > > This patch against 2.4.20-pre1 converts /proc/partitions to the seq_file
| > > interface as in 2.5, makes it report the sard-style extended disk
| > > statistics condititional on CONFIG_BLK_STATS and disables the gathering
| > > of those totally otherwise to not waste memory and processing power.
|
| But why in /proc/partitions ?
| Maybe /proc/partitions can go away eventually with all info available
| under driverfs or so. But for the time being, /proc/partitions is used,
| and some changes are planned to make identification of the devices
| involved easier.
| It is really ugly to stuff a lot of garbage into a file just because
| it happens to exist already. If you want disk statistics, why not
| put it in /proc/diskstatistics?

me too.

I'd like to see the disk stats added, but not in /proc/partitions.

--
~Randy

2002-08-07 23:14:41

by [email protected]

[permalink] [raw]
Subject: Re: [PATCH] conditionally re-enable per-disk stats, convert to seq_file

On Wed, Aug 07, 2002 at 04:10:48PM -0700, Randy.Dunlap wrote:
> | It is really ugly to stuff a lot of garbage into a file just because
> | it happens to exist already. If you want disk statistics, why not
> | put it in /proc/diskstatistics?
>
> me too.
>
> I'd like to see the disk stats added, but not in /proc/partitions.

Feel free to implement it. This is the interface use by all major
vendors for ages and the -ac kernel series. It's supported by unpatched
upstream performace tools.

I'll keept this patch around for the few poort soul complaining that
they don't get that's anymore after upgrading to the latest kernel.org
source.