Userspace may wish to know whether a given disk is active or idle, for
example to modify power management policy based on access patterns. This
patch adds a deferrable timer to the block layer which will fire if the
disk is idle for a user-definable period of time, generating a uevent. A
uevent will also be generated if an access is received while the disk is
classified as idle.
This patch seems to work as designed, but introduces a noticeable amount of
userspace overhead in udevd. I'm guessing that this is because change events
on block devices are normally associated with disk removal/insertion, so
a large quantity of complex rules end up getting run in order to deal with
RAID setup or whatever. Is there a better way to deliver these events?
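For illustration, consuming these events from userspace (without going
through udev rules) would look roughly like the untested sketch below -
the IDLE=0/1 key is the one added by this patch, the rest is generic
NETLINK_KOBJECT_UEVENT handling:

/* Untested sketch: listen for kernel uevents and print the IDLE key.
 * Only the IDLE=0/1 variable comes from this patch; everything else is
 * ordinary kobject uevent netlink boilerplate. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_pid = getpid(),
		.nl_groups = 1,		/* kernel uevent multicast group */
	};
	char buf[4096];
	int sock = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);

	if (sock < 0 || bind(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("netlink");
		return 1;
	}

	for (;;) {
		ssize_t i, len = recv(sock, buf, sizeof(buf) - 1, 0);

		if (len <= 0)
			continue;
		buf[len] = '\0';
		/* payload is "ACTION@DEVPATH\0KEY=VAL\0KEY=VAL\0..." */
		for (i = 0; i < len; i += strlen(buf + i) + 1)
			if (!strncmp(buf + i, "IDLE=", 5))
				printf("%s: %s\n", buf, buf + i);
	}
}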
---
Documentation/ABI/testing/sysfs-block | 9 +++++
block/blk-core.c | 9 +++++
block/genhd.c | 55 +++++++++++++++++++++++++++++++++
fs/partitions/check.c | 3 ++
include/linux/genhd.h | 6 +++
5 files changed, 82 insertions(+), 0 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 5f3beda..5519720 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -128,3 +128,12 @@ Description:
preferred request size for workloads where sustained
throughput is desired. If no optimal I/O size is
reported this file contains 0.
+
+What: /sys/block/<disk>/idle_hysteresis
+Date: November 2009
+Contact: Matthew Garrett <[email protected]>
+Description:
+ Contains the number of milliseconds to wait after an access
+ before declaring that a disk is idle. Any accesses during
+ this time will reset the timer. "0" (the default) indicates
+ that no events will be generated.
\ No newline at end of file
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da511..f278817 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio)
if (should_fail_request(bio))
goto end_io;
+ if (bio->bi_bdev->bd_disk->hysteresis_time &&
+ bio_has_data(bio) &&
+ !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer,
+ jiffies+msecs_to_jiffies
+ (bio->bi_bdev->bd_disk->hysteresis_time))) {
+ bio->bi_bdev->bd_disk->idle = 0;
+ schedule_work(&bio->bi_bdev->bd_disk->idle_notify);
+ }
+
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..f59fbe0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -504,6 +504,26 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
+static void disk_idle(unsigned long data)
+{
+ struct gendisk *gd = (struct gendisk *)data;
+
+ gd->idle = 1;
+ schedule_work(&gd->idle_notify);
+}
+
+static void disk_idle_notify_thread(struct work_struct *work)
+{
+ struct gendisk *gd = container_of(work, struct gendisk, idle_notify);
+ char event[] = "IDLE=0";
+ char *envp[] = { event, NULL };
+
+ if (gd->idle)
+ event[5] = '1';
+
+ kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
+}
+
/**
* add_disk - add partitioning information to kernel list
* @disk: per-device partitioning information
@@ -543,6 +563,10 @@ void add_disk(struct gendisk *disk)
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
+
+ init_timer(&disk->hysteresis_timer);
+ setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk);
+
register_disk(disk);
blk_register_queue(disk);
@@ -861,6 +885,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}
+static ssize_t disk_idle_hysteresis_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", disk->hysteresis_time);
+}
+
+static ssize_t disk_idle_hysteresis_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ unsigned long timeout;
+ int res;
+
+ res = strict_strtoul(buf, 10, &timeout);
+ if (res)
+ return -EINVAL;
+
+ disk->hysteresis_time = timeout;
+
+ return count;
+}
+
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -870,6 +920,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show,
+ disk_idle_hysteresis_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -890,6 +942,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_idle_hysteresis.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1183,6 +1236,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
device_initialize(disk_to_dev(disk));
INIT_WORK(&disk->async_notify,
media_change_notify_thread);
+ INIT_WORK(&disk->idle_notify,
+ disk_idle_notify_thread);
}
return disk;
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..d55dd29 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -652,6 +652,9 @@ void del_gendisk(struct gendisk *disk)
struct disk_part_iter piter;
struct hd_struct *part;
+ cancel_work_sync(&disk->idle_notify);
+ del_timer_sync(&disk->hysteresis_timer);
+
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45..7e969a5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
+#include <linux/timer.h>
#ifdef CONFIG_BLOCK
@@ -163,10 +164,15 @@ struct gendisk {
atomic_t sync_io; /* RAID */
struct work_struct async_notify;
+ struct work_struct idle_notify;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
+
+ bool idle;
+ int hysteresis_time;
+ struct timer_list hysteresis_timer;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
--
1.6.5.2
On Tue, Nov 17, 2009 at 15:37, Matthew Garrett <[email protected]> wrote:
> Userspace may wish to know whether a given disk is active or idle, for
> example to modify power management policy based on access patterns. This
> patch adds a deferrable timer to the block layer which will fire if the
> disk is idle for a user-definable period of time, generating a uevent. A
> uevent will also be generated if an access is received while the disk is
> classified as idle.
>
> This patch seems to work as designed, but introduces a noticeable amount of
> userspace overhead in udevd. I'm guessing that this is because change events
> on block devices are normally associated with disk removal/insertion, so
> a large quantity of complex rules end up getting run in order to deal with
> RAID setup or whatever. Is there a better way to deliver these events?
I guess, at the moment the disk tells it's idle, udev will open() the
disk and look for changed signatures, and end its idle state. :)
Uevents might not be the right interface; they are usually used when the
device needs to be (re-)examined, which will send the idle thing into a
loop with the current setups, I guess.
Maybe we can use a sysfs file which can be open()'d and watched with
poll(), with the watcher woken up by the kernel after the drive
changes its state? MD raid, as an example, has files like this in
sysfs to allow monitoring. That way there is also no overhead once the
requesting process goes away, which is usually a nicer interface than
a global switch that does not care whether the requesting process
still exists.
Thanks,
Kay
On Tue, 2009-11-17 at 16:55 +0100, Kay Sievers wrote:
> I guess, at the moment the disk tells it's idle, udev will open() the
> disk and look for changed signatures, and end its idle state. :)
Huh, this will probably cause interesting loops then ;-)
> Uevents might not be the right interface; they are usually used when the
> device needs to be (re-)examined, which will send the idle thing into a
> loop with the current setups, I guess.
Yeah.
> Maybe we can use a sysfs file which can be open()'d and watched with
> poll(), with the watcher woken up by the kernel after the drive
> changes its state? MD raid, as an example, has files like this in
> sysfs to allow monitoring. That way there is also no overhead once the
> requesting process goes away, which is usually a nicer interface than
> a global switch that does not care whether the requesting process
> still exists.
Can't we just use /sys/block/sdX/stat? I believe this file already has
the property that the contents of the file will stay constant exactly
when the device is idle. Being able to use poll() for change
notifications seems like a good interface.
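A consumer of that interface would then be roughly the following untested
sketch (the device path is illustrative, and it assumes the kernel calls
sysfs_notify() on the stat attribute when the idle state changes):

/* Untested sketch: block until the idle/active state of sda changes.
 * Assumes sysfs_notify() is called on the "stat" attribute. */
#include <stdio.h>
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	int fd = open("/sys/block/sda/stat", O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };

	if (fd < 0)
		return 1;

	for (;;) {
		ssize_t n;

		/* sysfs poll support requires a read before each poll() */
		pread(fd, buf, sizeof(buf) - 1, 0);
		if (poll(&pfd, 1, -1) <= 0)
			continue;
		n = pread(fd, buf, sizeof(buf) - 1, 0);
		if (n > 0) {
			buf[n] = '\0';
			printf("stat changed: %s", buf);
		}
	}
}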
Thanks,
David
Ok. How about something like this? It adds an extra field to the stat
file and introduces David's suggestion of making it pollable.
commit ba6d4c7ab7940ae8dc11a884281d0a36b20455b9
Author: Matthew Garrett <[email protected]>
Date: Mon Nov 16 17:44:03 2009 -0500
[RFC] Add support for uevents on block device idle changes
Userspace may wish to know whether a given disk is active or idle, for
example to modify power management policy based on access patterns. This
patch adds a deferrable timer to the block layer which will fire if the
disk is idle for a user-definable period of time, generating a uevent. A
uevent will also be generated if an access is received while the disk is
classified as idle.
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 5f3beda..8747f42 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -3,7 +3,7 @@ Date: February 2008
Contact: Jerome Marchand <[email protected]>
Description:
The /sys/block/<disk>/stat files displays the I/O
- statistics of disk <disk>. They contain 11 fields:
+ statistics of disk <disk>. They contain 12 fields:
1 - reads completed succesfully
2 - reads merged
3 - sectors read
@@ -15,6 +15,7 @@ Description:
9 - I/Os currently in progress
10 - time spent doing I/Os (ms)
11 - weighted time spent doing I/Os (ms)
+ 12 - 1 if the disk is idle (determined by idle_hysteresis)
For more details refer Documentation/iostats.txt
@@ -128,3 +129,12 @@ Description:
preferred request size for workloads where sustained
throughput is desired. If no optimal I/O size is
reported this file contains 0.
+
+What: /sys/block/<disk>/idle_hysteresis
+Date: November 2009
+Contact: Matthew Garrett <[email protected]>
+Description:
+ Contains the number of milliseconds to wait after an access
+ before declaring that a disk is idle. Any accesses during
+ this time will reset the timer. "0" (the default) indicates
+ that no events will be generated.
\ No newline at end of file
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da511..f278817 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio)
if (should_fail_request(bio))
goto end_io;
+ if (bio->bi_bdev->bd_disk->hysteresis_time &&
+ bio_has_data(bio) &&
+ !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer,
+ jiffies+msecs_to_jiffies
+ (bio->bi_bdev->bd_disk->hysteresis_time))) {
+ bio->bi_bdev->bd_disk->idle = 0;
+ schedule_work(&bio->bi_bdev->bd_disk->idle_notify);
+ }
+
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..ea37e48 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -504,6 +504,21 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
+static void disk_idle(unsigned long data)
+{
+ struct gendisk *gd = (struct gendisk *)data;
+
+ gd->idle = 1;
+ schedule_work(&gd->idle_notify);
+}
+
+static void disk_idle_notify_thread(struct work_struct *work)
+{
+ struct gendisk *gd = container_of(work, struct gendisk, idle_notify);
+
+ sysfs_notify(&disk_to_dev(gd)->kobj, NULL, "stat");
+}
+
/**
* add_disk - add partitioning information to kernel list
* @disk: per-device partitioning information
@@ -543,6 +558,10 @@ void add_disk(struct gendisk *disk)
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
+
+ init_timer(&disk->hysteresis_timer);
+ setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk);
+
register_disk(disk);
blk_register_queue(disk);
@@ -861,6 +880,32 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}
+static ssize_t disk_idle_hysteresis_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", disk->hysteresis_time);
+}
+
+static ssize_t disk_idle_hysteresis_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ unsigned long timeout;
+ int res;
+
+ res = strict_strtoul(buf, 10, &timeout);
+ if (res)
+ return -EINVAL;
+
+ disk->hysteresis_time = timeout;
+
+ return count;
+}
+
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -870,6 +915,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show,
+ disk_idle_hysteresis_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -890,6 +937,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_idle_hysteresis.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1183,6 +1231,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
device_initialize(disk_to_dev(disk));
INIT_WORK(&disk->async_notify,
media_change_notify_thread);
+ INIT_WORK(&disk->idle_notify,
+ disk_idle_notify_thread);
}
return disk;
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..cccfb7d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -230,6 +230,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
+ struct gendisk *gd = dev_to_disk(dev);
int cpu;
cpu = part_stat_lock();
@@ -238,7 +239,7 @@ ssize_t part_stat_show(struct device *dev,
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
- "%8u %8u %8u"
+ "%8u %8u %8u %1u"
"\n",
part_stat_read(p, ios[READ]),
part_stat_read(p, merges[READ]),
@@ -250,7 +251,8 @@ ssize_t part_stat_show(struct device *dev,
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
- jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+ jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+ gd->idle);
}
ssize_t part_inflight_show(struct device *dev,
@@ -652,6 +654,9 @@ void del_gendisk(struct gendisk *disk)
struct disk_part_iter piter;
struct hd_struct *part;
+ del_timer_sync(&disk->hysteresis_timer);
+ cancel_work_sync(&disk->idle_notify);
+
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45..7e969a5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
+#include <linux/timer.h>
#ifdef CONFIG_BLOCK
@@ -163,10 +164,15 @@ struct gendisk {
atomic_t sync_io; /* RAID */
struct work_struct async_notify;
+ struct work_struct idle_notify;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
+
+ bool idle;
+ int hysteresis_time;
+ struct timer_list hysteresis_timer;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
--
Matthew Garrett | [email protected]
On Tue, Nov 17, 2009 at 19:57, Matthew Garrett <[email protected]> wrote:
> Ok. How about something like this? It adds an extra field to the stat
> file and introduces David's suggestion of making it pollable.
Hmm, while thinking more about it, I'm still not sure if we should add
an interface which can work for a single user only, and add a timer to
the kernel which is only used by a userspace policy thing.
Wouldn't it be good enough, if we add a file "idle_since" which
contains the time of the actual disk idle time, and userspace can
schedule a re-examination of that value at the actual end of the idle
time it is looking for?
Thanks,
Kay
On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote:
> Wouldn't it be good enough, if we add a file "idle_since" which
> contains the time of the actual disk idle time, and userspace can
> schedule a re-examination of that value at the actual end of the idle
> time it is looking for?
That would require either polling or waking up a userspace application
on every disk access. Doing it in-kernel involves only a single timer
wakeup for every active/idle transition.
--
Matthew Garrett | [email protected]
On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote:
>
>> Wouldn't it be good enough, if we add a file "idle_since" which
>> contains the time of the actual disk idle time, and userspace can
>> schedule a re-examination of that value at the actual end of the idle
>> time it is looking for?
>
> That would require either polling or waking up a userspace application
> on every disk access. Doing it in-kernel involves only a single timer
> wakeup for every active/idle transition.
How would it? If you look for, like a 60 seconds timeout, and the file
contains 20, you schedule a wakeup in 40 seconds. If the file after
the 40 seconds contains 60, you reached your idle timeout exactly at
that moment, if it's less, then you re-calculate and start from the
beginning.
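Something like this untested sketch, assuming a hypothetical "idle_since"
attribute that reports the number of seconds the disk has been idle (and
0 while it is busy):

/* Untested sketch of the re-check scheme described above. The
 * "idle_since" attribute is hypothetical; it is assumed to report
 * seconds of idleness, or 0 while the disk is busy. */
#include <stdio.h>
#include <unistd.h>

static long read_idle_seconds(const char *path)
{
	long val = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	const long want = 60;	/* desired idle threshold in seconds */
	const char *path = "/sys/block/sda/idle_since";

	for (;;) {
		long idle = read_idle_seconds(path);

		if (idle >= want) {
			printf("disk idle for %ld seconds, apply policy\n", idle);
			idle = 0;	/* start over after acting */
		}
		/* sleep just long enough to reach the threshold, re-check */
		sleep(want - idle);
	}
}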
Kay
On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote:
> On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <[email protected]> wrote:
> > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote:
> >
> >> Wouldn't it be good enough, if we add a file "idle_since" which
> >> contains the time of the actual disk idle time, and userspace can
> >> schedule a re-examination of that value at the actual end of the idle
> >> time it is looking for?
> >
> > That would require either polling or waking up a userspace application
> > on every disk access. Doing it in-kernel involves only a single timer
> > wakeup for every active/idle transition.
>
> How would it? If you look for, like a 60 seconds timeout, and the file
> contains 20, you schedule a wakeup in 40 seconds. If the file after
> the 40 seconds contains 60, you reached your idle timeout exactly at
> that moment, if it's less, then you re-calculate and start from the
> beginning.
How is that not polling? You'll repeatedly read a file looking for a
value that may never appear - imagine the case where you're waiting for
60 seconds of idleness, but the disk always becomes active again after
50.
--
Matthew Garrett | [email protected]
On Wed, Nov 18, 2009 at 20:53, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 08:47:37PM +0100, Kay Sievers wrote:
>> On Wed, Nov 18, 2009 at 20:40, Matthew Garrett <[email protected]> wrote:
>> > On Wed, Nov 18, 2009 at 08:30:07PM +0100, Kay Sievers wrote:
>> >
>> >> Wouldn't it be good enough, if we add a file "idle_since" which
>> >> contains the time of the actual disk idle time, and userspace can
>> >> schedule a re-examination of that value at the actual end of the idle
>> >> time it is looking for?
>> >
>> > That would require either polling or waking up a userspace application
>> > on every disk access. Doing it in-kernel involves only a single timer
>> > wakeup for every active/idle transition.
>>
>> How would it? If you look for, like a 60 seconds timeout, and the file
>> contains 20, you schedule a wakeup in 40 seconds. If the file after
>> the 40 seconds contains 60, you reached your idle timeout exactly at
>> that moment, if it's less, then you re-calculate and start from the
>> beginning.
>
> How is that not polling? You'll repeatedly read a file looking for a
> value that may never appear - imagine the case where you're waiting for
> 60 seconds of idleness, but the disk always becomes active again after
> 50.
Sure, but what's wrong with reading that file every 50 seconds? Almost
all boxes poll for media changes of optical drives and usb card
readers anyway, so it's not that we are not doing stuff like this
already.
Kay
On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
> Sure, but what's wrong with reading that file every 50 seconds? Almost
> all boxes poll for media changes of optical drives and usb card
> readers anyway, so it's not that we are not doing stuff like this
> already.
We poll for media because there's no event-based way of avoiding it - in
this case there is.
--
Matthew Garrett | [email protected]
On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
>
>> Sure, but what's wrong with reading that file every 50 seconds? Almost
>> all boxes poll for media changes of optical drives and usb card
>> readers anyway, so it's not that we are not doing stuff like this
>> already.
>
> We poll for media because there's no event-based way of avoiding it - in
> this case there is.
That's true, but I think there is a significant difference between
polling every one or two seconds for media changes, and usually one or
two minutes for a disk idle. It's not that we poll in a rather high
frequency, in an arbitrary interval, and check if some condition is
met.
I still don't think that we should add new event interfaces which are
single-subscriber only, and use global values for a specific user.
What if there will be another independent user for this, which might
want a different timeout? They fight over the trigger value to set in
sysfs?
From my perspective, the once-at-timeout wakeup is more acceptable
than an in-kernel policy setting for a single-subscriber event
interface.
Thanks,
Kay
On Wed, Nov 18, 2009 at 22:06, Kay Sievers <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 21:07, Matthew Garrett <[email protected]> wrote:
>> On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
>>
>>> Sure, but what's wrong with reading that file every 50 seconds? Almost
>>> all boxes poll for media changes of optical drives and usb card
>>> readers anyway, so it's not that we are not doing stuff like this
>>> already.
>>
>> We poll for media because there's no event-based way of avoiding it - in
>> this case there is.
>
> That's true, but I think there is a significant difference between
> polling every one or two seconds for media changes, and usually one or
> two minutes for a disk idle. It's not that we poll in a rather high
> frequency, in an arbitrary interval, and check if some condition is
> met.
>
> I still don't think that we should add new event interfaces which are
> single-subscriber only, and use global values for a specific user.
> What if there will be another independent user for this, which might
> want a different timeout? They fight over the trigger value to set in
> sysfs?
>
> From my perspective, the once-at-timeout wakeup is more acceptable
> than an in-kernel policy setting for a single-subscriber event
> interface.
I guess, the "idle_since" file could be made poll()able, and throw an
event when the idle time is re-set to 0, so the value checking needs
only to happen as long as we wait for the disk to become idle. As long as
it's busy anyway, the rare wakeups should not matter much. :)
Kay
On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote:
> That's true, but I think there is a significant difference between
> polling every one or two seconds for media changes, and usually one or
> two minutes for a disk idle. It's not that we poll in a rather high
> frequency, in an arbitrary interval, and check if some condition is
> met.
My use cases are on the order of a second.
> I still don't think that we should add new event interfaces which are
> single-subscriber only, and use global values for a specific user.
> What if there will be another independent user for this, which might
> want a different timeout? They fight over the trigger value to set in
> sysfs?
You can trivially multiplex without any additional wakeups. Something
like devkit-disks can simply trigger on the lowest requested time and
then schedule wakeups for subscribers who want a different timeout.
> From my perspective, the once-at-timeout wakeup is more acceptable
> than an in-kernel policy setting for a single-subscriber event
> interface.
I'd be open to it being something for multiple subscribers, though that
would add to the complexity in the block code and I'm not sure that's
needed.
--
Matthew Garrett | [email protected]
On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote:
> I guess, the "idle_since" file could be made poll()able, and throw an
> event when the idle time is re-set to 0, so the value checking needs
> only to happen as long as we wait for the disk to become idle. As long as
> it's busy anyway, the rare wakeups should not matter much. :)
That'd be a userspace wakeup every time something gets submitted to the
block device, which sounds far from ideal...
--
Matthew Garrett | [email protected]
On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote:
>
>> I guess, the "idle_since" file could be made poll()able, and throw an
>> event when the idle time is re-set to 0, so the value checking needs
>> only to happen as long as we wait for the disk to become idle. As long as
>> it's busy anyway, the rare wakeups should not matter much. :)
>
> That'd be a userspace wakeup every time something gets submitted to the
> block device, which sounds far from ideal...
No, you would only poll() when you reached the timeout and the disk
entered the idle state. This can not happen more frequently than the
timeout itself.
Kay
On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote:
>
>> That's true, but I think there is a significant difference between
>> polling every one or two seconds for media changes, and usually one or
>> two minutes for a disk idle. It's not that we poll in a rather high
>> frequency, in an arbitrary interval, and check if some condition is
>> met.
>
> My use cases are on the order of a second.
>
>> I still don't think that we should add new event interfaces which are
>> single-subscriber only, and use global values for a specific user.
>> What if there will be another independent user for this, which might
>> want a different timeout? They fight over the trigger value to set in
>> sysfs?
>
> You can trivially multiplex without any additional wakeups. Something
> like devkit-disks can simply trigger on the lowest requested time and
> then schedule wakeups for subscribers who want a different timeout.
No, it can't do this race-free. And it's far from trivial. It can not
know when something changes the single global value.
Kay
On Wed, Nov 18, 2009 at 10:39:26PM +0100, Kay Sievers wrote:
> On Wed, Nov 18, 2009 at 22:35, Matthew Garrett <[email protected]> wrote:
> > On Wed, Nov 18, 2009 at 10:29:23PM +0100, Kay Sievers wrote:
> >
> >> I guess, the "idle_since" file could be made poll()able, and throw an
> >> event when the idle time is re-set to 0, so the value checking needs
> >> only to happen as long as we wait for the disk to become idle. As long as
> >> it's busy anyway, the rare wakeups should not matter much. :)
> >
> > That'd be a userspace wakeup every time something gets submitted to the
> > block device, which sounds far from ideal...
>
> No, you would only poll() when you reached the timeout and the disk
> entered the idle state. This can not happen more frequently than the
> timeout itself.
I don't understand. idle_since would be reset on every access to the
block device. The alternative is to generate an event when the disk goes
idle, but that goes back to requiring a timer in the kernel...
--
Matthew Garrett | [email protected]
On Tuesday 17 November 2009 15:37:39 Matthew Garrett wrote:
> @@ -1452,6 +1452,15 @@ static inline void __generic_make_request(struct bio *bio)
> if (should_fail_request(bio))
> goto end_io;
>
> + if (bio->bi_bdev->bd_disk->hysteresis_time &&
> + bio_has_data(bio) &&
> + !mod_timer(&bio->bi_bdev->bd_disk->hysteresis_timer,
> + jiffies+msecs_to_jiffies
> + (bio->bi_bdev->bd_disk->hysteresis_time))) {
> + bio->bi_bdev->bd_disk->idle = 0;
> + schedule_work(&bio->bi_bdev->bd_disk->idle_notify);
> + }
> +
Wouldn't it be more reliable to hook into places where block requests are
issued/completed instead of queued? This way you will not miss special
requests (some of them are quite common in practice and may take a relatively
long time to execute, e.g. FLUSH CACHE).
--
Bartlomiej Zolnierkiewicz
On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
> On Wed, Nov 18, 2009 at 10:06:33PM +0100, Kay Sievers wrote:
>
>> That's true, but I think there is a significant difference between
>> polling every one or two seconds for media changes, and usually one or
>> two minutes for a disk idle. It's not that we poll in a rather high
>> frequency, in an arbitrary interval, and check if some condition is
>> met.
>
> My use cases are on the order of a second.
Ok, what's the specific use case, which should be triggered after a
second? I thought you were thinking about disk spindown or similar.
Kay
On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote:
> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
> > My use cases are on the order of a second.
>
> Ok, what's the specific use case, which should be triggered after a
> second? I thought you were thinking about disk spindown or similar.
The first is altering ALPM policy. ALPM will be initiated by the host if
the number of queued requests hits zero - if there's no hysteresis
implemented, then that can result in a significant performance hit. We
don't need /much/ hysteresis, but it's the difference between a 50%
performance hit and not having that.
The other I'm currently looking at is monitoring disk usage in order to
be able to apply an adaptive policy to disk spindown. This doesn't need
the information to be as accurate, but it would still work better with
granularity closer to a second than to a minute.
--
Matthew Garrett | [email protected]
On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <[email protected]> wrote:
> On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote:
>> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
>> > My use cases are on the order of a second.
>>
>> Ok, what's the specific use case, which should be triggered after a
>> second? I thought you were thinking about disk spindown or similar.
>
> The first is altering ALPM policy. ALPM will be initiated by the host if
> the number of queued requests hits zero - if there's no hysteresis
> implemented, then that can result in a significant performance hit. We
> don't need /much/ hysteresis, but it's the difference between a 50%
> performance hit and not having that.
Can't that logic live entirely in the kernel, instead of being a
rather generic userspace event interface (with the current limitation
to a single user)?
Kay
On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote:
> On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <[email protected]> wrote:
> > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote:
> >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
> >> > My use cases are on the order of a second.
> >>
> >> Ok, what's the specific use case, which should be triggered after a
> >> second? I thought you were thinking about disk spindown or similar.
> >
> > The first is altering ALPM policy. ALPM will be initiated by the host if
> > the number of queued requests hits zero - if there's no hysteresis
> > implemented, then that can result in a significant performance hit. We
> > don't need /much/ hysteresis, but it's the difference between a 50%
> > performance hit and not having that.
>
> Can't that logic live entirely in the kernel, instead of being a
> rather generic userspace event interface (with the current limitation
> to a single user)?
It could, but it seems a bit of a hack. It'd still also require the
timer to be in the kernel, so we might as well expose that to userspace.
--
Matthew Garrett | [email protected]
On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <[email protected]> wrote:
> On Thu, Nov 19, 2009 at 02:29:29PM +0100, Kay Sievers wrote:
>> On Thu, Nov 19, 2009 at 14:01, Matthew Garrett <[email protected]> wrote:
>> > On Thu, Nov 19, 2009 at 12:09:30PM +0100, Kay Sievers wrote:
>> >> On Wed, Nov 18, 2009 at 22:33, Matthew Garrett <[email protected]> wrote:
>> >> > My use cases are on the order of a second.
>> >>
>> >> Ok, what's the specific use case, which should be triggered after a
>> >> second? I thought you were thinking about disk spindown or similar.
>> >
>> > The first is altering ALPM policy. ALPM will be initiated by the host if
>> > the number of queued requests hits zero - if there's no hysteresis
>> > implemented, then that can result in a significant performance hit. We
>> > don't need /much/ hysteresis, but it's the difference between a 50%
>> > performance hit and not having that.
>>
>> Can't that logic live entirely in the kernel, instead of being a
>> rather generic userspace event interface (with the current limitation
>> to a single user)?
>
> It could, but it seems a bit of a hack. It'd still also require the
> timer to be in the kernel, so we might as well expose that to userspace.
Sure, but a userspace configurable policy for an in-kernel disk-idle
power manager sounds fine, compared to a single-subscriber
userspace-only disk-idle event interface. :)
Kay
On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote:
> On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <[email protected]> wrote:
> > It could, but it seems a bit of a hack. It'd still also require the
> > timer to be in the kernel, so we might as well expose that to userspace.
>
> Sure, but a userspace configurable policy for an in-kernel disk-idle
>> power manager sounds fine, compared to a single-subscriber
> userspace-only disk-idle event interface. :)
Well, we still need to expose this for the access pattern modifying. I
really don't see the issue with the single subscriber being devkit-disks
- none of the operations involved are atomic, so we're inherently racy
here.
--
Matthew Garrett | [email protected]
On Thu, Nov 19, 2009 at 15:30, Matthew Garrett <[email protected]> wrote:
> On Thu, Nov 19, 2009 at 03:25:59PM +0100, Kay Sievers wrote:
>> On Thu, Nov 19, 2009 at 15:16, Matthew Garrett <[email protected]> wrote:
>> > It could, but it seems a bit of a hack. It'd still also require the
>> > timer to be in the kernel, so we might as well expose that to userspace.
>>
>> Sure, but a userspace configurable policy for an in-kernel disk-idle
>> power manager sounds fine, compared to a single-subscriber
>> userspace-only disk-idle event interface. :)
>
> Well, we still need to expose this for the access pattern modifying. I
> really don't see the issue with the single subscriber being devkit-disks
> - none of the operations involved are atomic, so we're inherently racy
> here.
Single-subscriber event interfaces are usually a no-go for generic
infrastructure like this. We still have the unmodified HAL running
until it is dead, and this works only because there are no such
awkward interfaces. In a few years we will probably have diskfoo
replacing dk-disks, and then ... :)
Kay
On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote:
> Single-subscriber event interfaces are usually a no-go for generic
> infrastructure like this. We still have the unmodified HAL running
> until it is dead, and this works only because there are no such
> awkward interfaces. In a few years we will probably have diskfoo
> replacing dk-disks, and then ... :)
If you've got any ideas for what a multi-subscriber interface would look
like, I'm happy to look at it. I don't think there's an especially
compelling use-case for one right now so I'm not enthusiastic about the
additional complexity that'd be required, but as long as there's basic
agreement that it's not practical to do this in userspace then we're at
least on the same page.
--
Matthew Garrett | [email protected]
On Thu, Nov 19, 2009 at 15:48, Matthew Garrett <[email protected]> wrote:
> On Thu, Nov 19, 2009 at 03:34:53PM +0100, Kay Sievers wrote:
>
>> Single-subscriber event interfaces are usually a no-go for generic
>> infrastructure like this. We still have the unmodified HAL running
>> until it is dead, and this works only because there are no such
>> awkward interfaces. In a few years we will probably have diskfoo
>> replacing dk-disks, and then ... :)
>
> If you've got any ideas for what a multi-subscriber interface would look
> like, I'm happy to look at it.
Yeah, it would not be as simple as your patch. It probably involves a
way to get a file descriptor per listener, to let the kernel know if
anybody is interested, and to auto-cleanup when the listener dies, and
to have per instance timers.
> I don't think there's an especially
> compelling use-case for one right now so I'm not enthusiastic about the
> additional complexity that'd be required,
Right, but we've been there, and it's a pain if you cannot subscribe
to an interface because something else is already using it and
expecting to be its only user ever. So there needs to be a good reason
for adding something like this as a new interface, which will very
likely hit us back some day.
> but as long as there's basic
> agreement that it's not practical to do this in userspace then we're at
> least on the same page.
I'm all for executing the policy inside the kernel and letting userspace
only enable/configure it. I think there is not much to disagree with
in such an approach.
Thanks,
Kay
On Thu, Nov 19, 2009 at 04:00:46PM +0100, Kay Sievers wrote:
> Yeah, it would not be as simple as your patch. It probably involves a
> way to get a file descriptor per listener, to let the kernel know if
> anybody is interested, and to auto-cleanup when the listener dies, and
> to have per instance timers.
This would seem to involve a lot of extra locking in the block
submission and completion code. I don't think it's ideal. How about
this:
* idle_hysteresis contains a value. If it's greater than 0, attempting
to increase it will give -EINVAL. It can be polled.
* On idle state transition, applications listening to the stat sysfs
node will get woken. The stat output will include the number of msecs
that the disk has been idle. If this is less than the application
requested, it can set a timer to wake it up again in the future and
recheck.
* When an application exits, if (and only if) it wrote a value to
idle_hysteresis, it should set this back to 0. This will notify any
other apps, which may then set their own wakeup time.
It's not beautiful but it satisfies the constraints. There's a minimum
of extra wakeups, it doesn't complicate the block path any further and
multiple applications can take advantage of it.
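For illustration, a subscriber following that protocol might look roughly
like the untested sketch below (stat parsing is simplified, the device
path is illustrative, and the exit-time reset of idle_hysteresis to "0"
is omitted):

/* Untested sketch of the protocol above. Assumes idle_hysteresis rejects
 * attempts to raise an already-set value with -EINVAL, and that the 12th
 * field of "stat" reports how many ms the disk has been idle, or -1
 * while it is active. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

#define MY_TIMEOUT_MS 5000L

static long stat_idle_ms(int fd)
{
	char buf[256];
	long f[12];
	ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0);

	if (n <= 0)
		return -1;
	buf[n] = '\0';
	if (sscanf(buf, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld",
		   &f[0], &f[1], &f[2], &f[3], &f[4], &f[5], &f[6], &f[7],
		   &f[8], &f[9], &f[10], &f[11]) != 12)
		return -1;
	return f[11];		/* ms idle, or -1 if active */
}

int main(void)
{
	int hyst = open("/sys/block/sda/idle_hysteresis", O_WRONLY);
	int statfd = open("/sys/block/sda/stat", O_RDONLY);
	struct pollfd pfd = { .fd = statfd, .events = POLLPRI | POLLERR };
	char buf[32];
	int n;

	if (hyst < 0 || statfd < 0)
		return 1;

	/* Program our timeout; -EINVAL means a smaller value is already
	 * set, in which case the wakeup simply arrives earlier and the
	 * loop below sleeps out the remainder. */
	n = snprintf(buf, sizeof(buf), "%ld\n", MY_TIMEOUT_MS);
	if (write(hyst, buf, n) < 0 && errno != EINVAL)
		return 1;

	for (;;) {
		long idle;

		stat_idle_ms(statfd);		/* arm the sysfs poll */
		poll(&pfd, 1, -1);
		idle = stat_idle_ms(statfd);
		while (idle >= 0 && idle < MY_TIMEOUT_MS) {
			usleep((MY_TIMEOUT_MS - idle) * 1000);
			idle = stat_idle_ms(statfd);
		}
		if (idle >= MY_TIMEOUT_MS)
			printf("disk idle for %ld ms\n", idle);
	}
}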
--
Matthew Garrett | [email protected]
On Wed 2009-11-18 20:07:12, Matthew Garrett wrote:
> On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
>
> > Sure, but what's wrong with reading that file every 50 seconds? Almost
> > all boxes poll for media changes of optical drives and usb card
> > readers anyway, so it's not that we are not doing stuff like this
> > already.
>
> We poll for media because there's no event-based way of avoiding it - in
> this case there is.
...when you add overhead to every disk operation. I'd say that polling
once in 50 seconds is preferable to that.
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Mon, Nov 23, 2009 at 12:37:49AM +0100, Pavel Machek wrote:
> On Wed 2009-11-18 20:07:12, Matthew Garrett wrote:
> > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
> >
> > > Sure, but what's wrong with reading that file every 50 seconds? Almost
> > > all boxes poll for media changes of optical drives and usb card
> > > readers anyway, so it's not that we are not doing stuff like this
> > > already.
> >
> > We poll for media because there's no event-based way of avoiding it - in
> > this case there is.
>
> ...when you add overhead to every disk operation. I'd say that polling
> once in 50 seconds is preferable to that.
Yeah, but 50 seconds isn't the timescale we're talking about here.
--
Matthew Garrett | [email protected]
On Mon, Nov 23 2009, Pavel Machek wrote:
> On Wed 2009-11-18 20:07:12, Matthew Garrett wrote:
> > On Wed, Nov 18, 2009 at 09:03:21PM +0100, Kay Sievers wrote:
> >
> > > Sure, but what's wrong with reading that file every 50 seconds? Almost
> > > all boxes poll for media changes of optical drives and usb card
> > > readers anyway, so it's not that we are not doing stuff like this
> > > already.
> >
> > We poll for media because there's no event-based way of avoiding it - in
> > this case there is.
>
> ...when you add overhead to every disk operation. I'd say that polling
> once in 50 seconds is preferable to that.
I have to agree, doing a mod_timer() on every single IO is going to suck
big time. I went to great lengths to avoid doing that even for timeout
detection. So that's pretty much a non-starter to begin with.
Additionally, as Bart also wrote, you are not doing this in the right
place. You want to do this post-merge, not for each incoming IO. Have
you looked at laptop mode? Looks like you are essentially re-inventing
that, but in a bad way.
--
Jens Axboe
On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote:
> I have to agree, doing a mod_timer() on every single IO is going to suck
> big time. I went to great lengths to avoid doing that even for timeout
> detection. So that's pretty much a non-starter to begin with.
It's conditional on a (default off) setting, so it's not a hit unless
the user requests it. But yeah, the performance hit is obviously a
concern. It may be that polling is the least bad way of doing this.
> Additionally, as Bart also wrote, you are not doing this in the right
> place. You want to do this post-merge, not for each incoming IO. Have
> you looked at laptop mode? Looks like you are essentially re-inventing
> that, but in a bad way.
Right, that's mostly down to my having no familiarity with the block
layer at all :) I can fix that up easily enough, but if a deferrable
timer is going to be too expensive then it'll need some rethinking
anyway.
--
Matthew Garrett | [email protected]
On Mon, Nov 23 2009, Matthew Garrett wrote:
> On Mon, Nov 23, 2009 at 03:17:54PM +0100, Jens Axboe wrote:
>
> > I have to agree, doing a mod_timer() on every single IO is going to suck
> > big time. I went to great lengths to avoid doing that even for timeout
> > detection. So that's pretty much a non-starter to begin with.
>
> It's conditional on a (default off) setting, so it's not a hit unless
> the user requests it. But yeah, the performance hit is obviously a
> concern. It may be that polling is the least bad way of doing this.
Even if it's off by default, that doesn't mean we shouldn't make the
implementation correct or fast :-)
> > Additionally, as Bart also wrote, you are not doing this in the right
> > place. You want to do this post-merge, not for each incoming IO. Have
> > you looked at laptop mode? Looks like you are essentially re-inventing
> > that, but in a bad way.
>
> Right, that's mostly down to my having no familiarity with the block
> layer at all :) I can fix that up easily enough, but if a deferrable
> timer is going to be too expensive then it'll need some rethinking
> anyway.
Well, take a look at laptop mode. A timer per-io is probably
unavoidable, but doing it at IO completion could mean a big decrease in
timer activity as opposed to doing it for each incoming IO. And since
you are looking at when the disk is idle, it makes a lot more sense to
me to do that when we complete a request (and have no further
pending IO) rather than on incoming IO.
Your biggest performance issue here is going to be sync IO, since the
disk will go idle very briefly before being kicked into action again.
--
Jens Axboe
On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote:
> Well, take a look at laptop mode. A timer per-io is probably
> unavoidable, but doing it at IO completion could mean a big decrease in
> timer activity as opposed to doing it for each incoming IO. And since
> you are looking at when the disk is idle, it makes a lot more sense to
> me to do that when the we complete a request (and have no further
> pending IO) rather than on incoming IO.
Right. The current implementation I have does a del_timer() at
submission (which should be moved to post-merge) - that should be cheap
in the common case of a new command being submitted when there's already
commands outstanding. There's then a mod_timer() at completion time.
That's still a certain amount of expense, but it should be less.
> Your biggest performance issue here is going to be sync IO, since the
> disk will go idle very briefly before being kicked into action again.
Ok, I'll try to benchmark that.
The alternative (polling) method would be something much like Kay
suggested - either add an extra field to stat or an extra sysfs file,
then invalidate that on submission and set to jiffies on completion.
It's not ideal from a wakeups perspective, but it's pretty low impact on
the kernel side.
--
Matthew Garrett | [email protected]
On Mon, Nov 23 2009, Matthew Garrett wrote:
> On Mon, Nov 23, 2009 at 03:31:40PM +0100, Jens Axboe wrote:
>
> > Well, take a look at laptop mode. A timer per-io is probably
> > unavoidable, but doing it at IO completion could mean a big decrease in
> > timer activity as opposed to doing it for each incoming IO. And since
> > you are looking at when the disk is idle, it makes a lot more sense to
> > me to do that when we complete a request (and have no further
> > pending IO) rather than on incoming IO.
>
> Right. The current implementation I have does a del_timer() at
> submission (which should be moved to post-merge) - that should be cheap
> in the common case of a new command being submitted when there's already
> commands outstanding. There's then a mod_timer() at completion time.
> That's still a certain amount of expense, but it should be less.
>
> > Your biggest performance issue here is going to be sync IO, since the
> > disk will go idle very briefly before being kicked into action again.
>
> Ok, I'll try to benchmark that.
>
> The alternative (polling) method would be something much like Kay
> suggested - either add an extra field to stat or an extra sysfs file,
> then invalidate that on submission and set to jiffies on completion.
> It's not ideal from a wakeups perspective, but it's pretty low impact on
> the kernel side.
If the polling works out, then yes that approach is certainly a lot
better from a performance impact pov.
What kind of time intervals are you targetting?
--
Jens Axboe
On Mon, Nov 23, 2009 at 08:50:00PM +0100, Jens Axboe wrote:
> If the polling works out, then yes that approach is certainly a lot
> better from a performance impact pov.
>
> What kind of time intervals are you targetting?
On the order of a second. I'm doing benchmarking with my current
implementation now, I'll let you know how it looks.
--
Matthew Garrett | [email protected]
I looked into polling and realised that it's not going to work - reducing
the performance hit requires the idle->active transition to be acknowledged
as quickly as possible, so there needs to be an event for that. And for
that to happen, we need to have an idea of whether the disk is idle or not.
That could be done by just performing a comparison of the last request time
against the current time, but it still ends up giving us several of the
same problems.
What I've done in this version is move the timer modification to the
completion, with request submission just performing the timer deletion. This
should make things less expensive than the previous versions of the patch.
I've also added updates to the idle_hysteresis file, which makes it possible
to implement a mechanism for multiple applications with different timeout
requirements. This is somewhat hacky, but avoids putting any more complexity
in fast path code. Thoughts?
---
Userspace may wish to know whether a given disk is active or idle, for
example to modify power management policy based on access patterns. This
patch adds a deferrable timer to the block layer which will fire if the
disk is idle for a user-definable period of time, generating a
notification event on the device's stat node. An event will also be
generated if an access is received while the disk is classified as idle.
Documentation/ABI/testing/sysfs-block | 49 +++++++++++++++++++++++++++++-
block/blk-core.c | 5 +++
block/elevator.c | 7 ++++
block/genhd.c | 54 +++++++++++++++++++++++++++++++++
fs/partitions/check.c | 9 ++++-
include/linux/genhd.h | 6 ++++
6 files changed, 127 insertions(+), 3 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 5f3beda..03b411b 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -3,7 +3,7 @@ Date: February 2008
Contact: Jerome Marchand <[email protected]>
Description:
The /sys/block/<disk>/stat files displays the I/O
- statistics of disk <disk>. They contain 11 fields:
+ statistics of disk <disk>. They contain 12 fields:
1 - reads completed succesfully
2 - reads merged
3 - sectors read
@@ -15,6 +15,14 @@ Description:
9 - I/Os currently in progress
10 - time spent doing I/Os (ms)
11 - weighted time spent doing I/Os (ms)
+ 12 - -1 if the disk is active, otherwise the length of time in
+ milliseconds since the disk became idle (determined by
+ the idle_hysteresis setting)
+
+ Applications that call poll() or select() on this attribute
+ will be woken when the block device undergoes a transition
+ between active and idle.
+
For more details refer Documentation/iostats.txt
@@ -128,3 +136,42 @@ Description:
preferred request size for workloads where sustained
throughput is desired. If no optimal I/O size is
reported this file contains 0.
+
+What: /sys/block/<disk>/idle_hysteresis
+Date: November 2009
+Contact: Matthew Garrett <[email protected]>
+Description:
+ Contains the number of milliseconds to wait after an
+ access before declaring that a disk is idle. Any
+ accesses to the block device during this time will
+ reset the timer. "0" (the default) indicates that no
+ events will be generated. If a value has already been
+ written to this file, then the only valid values are
+ either "0" (to disable notification) or a number
+ smaller than the current value. On write, a
+ notification will be sent to any userspace
+ applications poll()ing on this file.
+
+ The intended use of this interface is to allow
+ applications to change power management policy based
+ on disk activity patterns. The first application to
+ use this interface should write its timeout value and
+ continue monitoring this file along with
+ "stat". Notifications will be sent to the "stat" file
+ when the disk switches from active to idle or
+ vice-versa.
+
+ If more than one application uses this interface, the
+ second application should attempt to write its own
+ timeout. If this fails, it should read the current
+ timeout. It will then be woken on active to idle
+ transitions, at which point it should sleep for the
+ time difference between its desired timeout and the
+ programmed timeout. On waking, it should then re-read
+ the "state" file to determine if the disk has been
+ idle for long enough.
+
+ If the write is successful, then the first application
+ to use the interface will be woken instead. It should
+ then modify its wakeup code to match the above
+ description.
diff --git a/block/blk-core.c b/block/blk-core.c
index 10e305f..16b501a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2115,6 +2115,11 @@ static void blk_finish_request(struct request *req, int error)
if (unlikely(laptop_mode) && blk_fs_request(req))
laptop_io_completion(req);
+ if (blk_fs_request(req) && req->rq_disk->hysteresis_time)
+ mod_timer(&req->rq_disk->hysteresis_timer,
+ jiffies+msecs_to_jiffies
+ (req->rq_disk->hysteresis_time));
+
blk_delete_timer(req);
blk_account_io_done(req);
diff --git a/block/elevator.c b/block/elevator.c
index a847046..01f0bfb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -683,6 +683,13 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
BUG();
}
+ if (blk_fs_request(rq) && rq->rq_disk->hysteresis_time &&
+ rq->rq_disk->idle) {
+ rq->rq_disk->idle = 0;
+ del_timer(&rq->rq_disk->hysteresis_timer);
+ schedule_work(&rq->rq_disk->idle_notify);
+ }
+
if (unplug_it && blk_queue_plugged(q)) {
int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
- queue_in_flight(q);
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..802c142 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -504,6 +504,21 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
+static void disk_idle(unsigned long data)
+{
+ struct gendisk *gd = (struct gendisk *)data;
+
+ gd->idle = jiffies;
+ schedule_work(&gd->idle_notify);
+}
+
+static void disk_idle_notify_thread(struct work_struct *work)
+{
+ struct gendisk *gd = container_of(work, struct gendisk, idle_notify);
+
+ sysfs_notify(&disk_to_dev(gd)->kobj, NULL, "stat");
+}
+
/**
* add_disk - add partitioning information to kernel list
* @disk: per-device partitioning information
@@ -543,6 +558,10 @@ void add_disk(struct gendisk *disk)
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
+
+ init_timer(&disk->hysteresis_timer);
+ setup_timer(&disk->hysteresis_timer, disk_idle, (unsigned long)disk);
+
register_disk(disk);
blk_register_queue(disk);
@@ -861,6 +880,36 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}
+static ssize_t disk_idle_hysteresis_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", disk->hysteresis_time);
+}
+
+static ssize_t disk_idle_hysteresis_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ unsigned long timeout;
+ int res;
+
+ res = strict_strtoul(buf, 10, &timeout);
+ if (res)
+ return -EINVAL;
+
+ if (disk->hysteresis_time && timeout > disk->hysteresis_time)
+ return -EINVAL;
+
+ disk->hysteresis_time = timeout;
+ sysfs_notify(&dev->kobj, NULL, "idle_hysteresis");
+
+ return count;
+}
+
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -870,6 +919,8 @@ static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(idle_hysteresis, 0644, disk_idle_hysteresis_show,
+ disk_idle_hysteresis_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -890,6 +941,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
+ &dev_attr_idle_hysteresis.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1183,6 +1235,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
device_initialize(disk_to_dev(disk));
INIT_WORK(&disk->async_notify,
media_change_notify_thread);
+ INIT_WORK(&disk->idle_notify,
+ disk_idle_notify_thread);
}
return disk;
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..954dc32 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -230,6 +230,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
+ struct gendisk *gd = dev_to_disk(dev);
int cpu;
cpu = part_stat_lock();
@@ -238,7 +239,7 @@ ssize_t part_stat_show(struct device *dev,
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
- "%8u %8u %8u"
+ "%8u %8u %8u %8d"
"\n",
part_stat_read(p, ios[READ]),
part_stat_read(p, merges[READ]),
@@ -250,7 +251,8 @@ ssize_t part_stat_show(struct device *dev,
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
- jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+ jiffies_to_msecs(part_stat_read(p, time_in_queue)),
+ gd->idle ? jiffies_to_msecs(jiffies - gd->idle) : -1);
}
ssize_t part_inflight_show(struct device *dev,
@@ -652,6 +654,9 @@ void del_gendisk(struct gendisk *disk)
struct disk_part_iter piter;
struct hd_struct *part;
+ del_timer_sync(&disk->hysteresis_timer);
+ cancel_work_sync(&disk->idle_notify);
+
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45..b8e6158 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
+#include <linux/timer.h>
#ifdef CONFIG_BLOCK
@@ -163,10 +164,15 @@ struct gendisk {
atomic_t sync_io; /* RAID */
struct work_struct async_notify;
+ struct work_struct idle_notify;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;
+
+ unsigned long idle;
+ int hysteresis_time;
+ struct timer_list hysteresis_timer;
};
static inline struct gendisk *part_to_disk(struct hd_struct *part)
--
1.6.5.2