From: Yasuaki Ishimatsu <[email protected]>
PROBLEM:
/proc/diskstats would display a strange output as follows.
$ cat /proc/diskstats |grep sda
8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
~~~~~~~~~~
8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137
Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.
The detailed root cause is as follows.
Assuming that there are two partition, sda1 and sda2.
1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
is 0 and sda2's one is 1.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
2. A bio belongs to sda1 is issued and is merged into the request mentioned on
step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
from sda2 region to sda1 region. However the two partition's
hd_struct->in_flight are not changed.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
3. The request is finished and blk_account_io_done() is called. In this case,
sda2's hd_struct->in_flight, not a sda1's one, is decremented.
| hd_struct->in_flight
---------------------------
sda1 | -1
sda2 | 1
---------------------------
HOW TO FIX:
The patch fixes the problem by changing a merging rule.
The problem is caused by merging different partition's I/Os. So the patch
check whether a merging bio or request is a same partition as a request or not
by using a partition's start sector and size.
Signed-off-by: Yasuaki Ishimatsu <[email protected]>
---
block/blk-core.c | 2 ++
block/blk-merge.c | 11 +++++++++++
include/linux/blkdev.h | 14 ++++++++++++++
3 files changed, 27 insertions(+)
Index: linux-2.6.37-rc3/block/blk-core.c
===================================================================
--- linux-2.6.37-rc3.orig/block/blk-core.c 2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/block/blk-core.c 2010-12-03 17:15:50.000000000 +0900
@@ -71,6 +71,8 @@ static void drive_stat_acct(struct reque
else {
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ rq->__part_start_sect = part->start_sect;
+ rq->__part_size = part->nr_sects;
}
part_stat_unlock();
Index: linux-2.6.37-rc3/block/blk-merge.c
===================================================================
--- linux-2.6.37-rc3.orig/block/blk-merge.c 2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/block/blk-merge.c 2010-12-03 17:15:50.000000000 +0900
@@ -235,6 +235,9 @@ int ll_back_merge_fn(struct request_queu
else
max_sectors = queue_max_sectors(q);
+ if (blk_rq_part_sector(req) + blk_rq_part_size(req) <= bio->bi_sector)
+ return 0;
+
if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
@@ -259,6 +262,8 @@ int ll_front_merge_fn(struct request_que
else
max_sectors = queue_max_sectors(q);
+ if (bio->bi_sector < blk_rq_part_sector(req))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
req->cmd_flags |= REQ_NOMERGE;
@@ -382,6 +387,12 @@ static int attempt_merge(struct request_
return 0;
/*
+ * Don't merge different partition's request
+ */
+ if (blk_rq_part_sector(req) != blk_rq_part_sector(next))
+ return 0;
+
+ /*
* not contiguous
*/
if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
Index: linux-2.6.37-rc3/include/linux/blkdev.h
===================================================================
--- linux-2.6.37-rc3.orig/include/linux/blkdev.h 2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/include/linux/blkdev.h 2010-11-30 16:46:19.000000000 +0900
@@ -91,6 +91,8 @@ struct request {
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */
+ sector_t __part_start_sect; /* partition's start sector*/
+ sector_t __part_size; /* partition's size*/
struct bio *bio;
struct bio *biotail;
@@ -724,6 +726,8 @@ static inline struct request_queue *bdev
* blk_rq_err_bytes() : bytes left till the next error boundary
* blk_rq_sectors() : sectors left in the entire request
* blk_rq_cur_sectors() : sectors left in the current segment
+ * blk_rq_part_sector() : partition's start sector
+ * blk_rq_part_size() : partition's size
*/
static inline sector_t blk_rq_pos(const struct request *rq)
{
@@ -752,6 +756,16 @@ static inline unsigned int blk_rq_cur_se
return blk_rq_cur_bytes(rq) >> 9;
}
+static inline sector_t blk_rq_part_sector(const struct request *rq)
+{
+ return rq->__part_start_sect;
+}
+
+static inline sector_t blk_rq_part_size(const struct request *rq)
+{
+ return rq->__part_size;
+}
+
/*
* Request issue related functions.
*/
2010/12/6 Yasuaki Ishimatsu <[email protected]>:
>
> The problem is caused by merging different partition's I/Os. So the patch
> check whether a merging bio or request is a same partition as a request or not
> by using a partition's start sector and size.
I really think this is wrong.
We should just carry the partition information around in the req and
the bio, and just compare the pointers, rather than compare the range.
No need to even dereference the pointers, you should be able to just
do
/* don't merge if not on the same partition */
if (bio->part != req->part)
return 0;
or something.
This is doubly true since the accounting already does that horrible
partition lookup: rather than look it up, we should just _set_ it in
__generic_make_request(), where I think we already know it since we do
that whole blk_partition_remap().
So just something like the appended (TOTALLY UNTESTED) perhaps?
Note that this should get it right even for overlapping partitions etc.
Linus
Hi Linus, Yasuaki, and Jens
(2010/12/07 1:08), Linus Torvalds wrote:
> 2010/12/6 Yasuaki Ishimatsu<[email protected]>:
>>
>> The problem is caused by merging different partition's I/Os. So the patch
>> check whether a merging bio or request is a same partition as a request or not
>> by using a partition's start sector and size.
>
> I really think this is wrong.
>
> We should just carry the partition information around in the req and
> the bio, and just compare the pointers, rather than compare the range.
> No need to even dereference the pointers, you should be able to just
> do
>
> /* don't merge if not on the same partition */
> if (bio->part != req->part)
> return 0;
>
> or something.
>
> This is doubly true since the accounting already does that horrible
> partition lookup: rather than look it up, we should just _set_ it in
> __generic_make_request(), where I think we already know it since we do
> that whole blk_partition_remap().
>
> So just something like the appended (TOTALLY UNTESTED) perhaps?
>
> Note that this should get it right even for overlapping partitions etc.
>
> Linus
The problem can occur even if your patches are applied. Think about a case
like the following.
1) There are 2 partition, sda1 and sda2, on sda.
2) Open sda and issue an IO to sda2's first sector. Then sda2's in_flight
is incremented though you open not sda2 but sda. It is because of
partition lookup method. It is based on which partition rq->__sector
sector belongs to.
3) Issue an IO to sda1's last sector and it merged to the IO issued in
step (2) because their part are both sda. In addition, rq->__sector
is modified to the sda1's region.
4) After completing the IO, sda1's in_flight is decremented and diskstat
is corrupted here.
I think fixing this case is difficult and would cause more complexity.
I hit on another approach. Although it doesn'tprevent any merge as Linus
preferred, it can fix the problem anyway. In this idea, in_flight is
incremented and decremented for the partition which the request belonged
to in its creation. It has the following merits.
- It can fix the problem which Yasuaki reported, including the cases which
I mentioned above.
- It only append one extra field to request.
Although it would causes a bit gap, it doesn't have most influences because
merging requests beyond partitions is the rare case.
I confirmed the attached patch can be applied to 2.6.37-rc4 and succeeded
to compile. If you can accept this idea, I'll test it soon.
---
block/blk-core.c | 12 +++++++-----
block/blk-merge.c | 2 +-
include/linux/blkdev.h | 6 ++++++
3 files changed, 14 insertions(+), 6 deletions(-)
Index: linux-2.6.37-rc4/block/blk-core.c
===================================================================
--- linux-2.6.37-rc4.orig/block/blk-core.c 2010-11-30 13:42:04.000000000 +0900
+++ linux-2.6.37-rc4/block/blk-core.c 2010-12-07 14:31:55.000000000 +0900
@@ -64,11 +64,13 @@ static void drive_stat_acct(struct reque
return;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!new_io)
+ if (!new_io) {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_init_pos(rq));
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ rq->__initial_sector = rq->__sector;
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_init_pos(rq));
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
}
@@ -1776,7 +1778,7 @@ static void blk_account_io_completion(st
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
@@ -1796,7 +1798,7 @@ static void blk_account_io_done(struct r
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
Index: linux-2.6.37-rc4/block/blk-merge.c
===================================================================
--- linux-2.6.37-rc4.orig/block/blk-merge.c 2010-11-30 13:42:04.000000000 +0900
+++ linux-2.6.37-rc4/block/blk-merge.c 2010-12-07 14:14:55.000000000 +0900
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
Index: linux-2.6.37-rc4/include/linux/blkdev.h
===================================================================
--- linux-2.6.37-rc4.orig/include/linux/blkdev.h 2010-11-30 13:42:04.000000000 +0900
+++ linux-2.6.37-rc4/include/linux/blkdev.h 2010-12-07 14:13:11.000000000 +0900
@@ -91,6 +91,7 @@ struct request {
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */
+ sector_t __initial_sector;
struct bio *bio;
struct bio *biotail;
@@ -730,6 +731,11 @@ static inline sector_t blk_rq_pos(const
return rq->__sector;
}
+static inline sector_t blk_rq_init_pos(const struct request *rq)
+{
+ return rq->__initial_sector;
+}
+
static inline unsigned int blk_rq_bytes(const struct request *rq)
{
return rq->__data_len;
On Tue, Dec 07, 2010 at 04:18:27PM +0900, Satoru Takeuchi wrote:
> Hi Linus, Yasuaki, and Jens
>
> (2010/12/07 1:08), Linus Torvalds wrote:
> >2010/12/6 Yasuaki Ishimatsu<[email protected]>:
> >>
> >>The problem is caused by merging different partition's I/Os. So the patch
> >>check whether a merging bio or request is a same partition as a request or not
> >>by using a partition's start sector and size.
> >
> >I really think this is wrong.
> >
> >We should just carry the partition information around in the req and
> >the bio, and just compare the pointers, rather than compare the range.
> >No need to even dereference the pointers, you should be able to just
> >do
> >
> > /* don't merge if not on the same partition */
> > if (bio->part != req->part)
> > return 0;
> >
> >or something.
> >
> >This is doubly true since the accounting already does that horrible
> >partition lookup: rather than look it up, we should just _set_ it in
> >__generic_make_request(), where I think we already know it since we do
> >that whole blk_partition_remap().
> >
> >So just something like the appended (TOTALLY UNTESTED) perhaps?
> >
> >Note that this should get it right even for overlapping partitions etc.
> >
> > Linus
>
> The problem can occur even if your patches are applied. Think about a case
> like the following.
>
> 1) There are 2 partition, sda1 and sda2, on sda.
> 2) Open sda and issue an IO to sda2's first sector. Then sda2's in_flight
> is incremented though you open not sda2 but sda. It is because of
> partition lookup method. It is based on which partition rq->__sector
> sector belongs to.
> 3) Issue an IO to sda1's last sector and it merged to the IO issued in
> step (2) because their part are both sda. In addition, rq->__sector
> is modified to the sda1's region.
> 4) After completing the IO, sda1's in_flight is decremented and diskstat
> is corrupted here.
>
> I think fixing this case is difficult and would cause more complexity.
>
> I hit on another approach. Although it doesn'tprevent any merge as Linus
> preferred, it can fix the problem anyway. In this idea, in_flight is
> incremented and decremented for the partition which the request belonged
> to in its creation. It has the following merits.
>
> - It can fix the problem which Yasuaki reported, including the cases which
> I mentioned above.
> - It only append one extra field to request.
>
> Although it would causes a bit gap, it doesn't have most influences because
> merging requests beyond partitions is the rare case.
>
> I confirmed the attached patch can be applied to 2.6.37-rc4 and succeeded
> to compile. If you can accept this idea, I'll test it soon.
>
> ---
> block/blk-core.c | 12 +++++++-----
> block/blk-merge.c | 2 +-
> include/linux/blkdev.h | 6 ++++++
> 3 files changed, 14 insertions(+), 6 deletions(-)
>
> Index: linux-2.6.37-rc4/block/blk-core.c
> ===================================================================
> --- linux-2.6.37-rc4.orig/block/blk-core.c 2010-11-30 13:42:04.000000000 +0900
> +++ linux-2.6.37-rc4/block/blk-core.c 2010-12-07 14:31:55.000000000 +0900
> @@ -64,11 +64,13 @@ static void drive_stat_acct(struct reque
> return;
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> - if (!new_io)
> + if (!new_io) {
> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_init_pos(rq));
> part_stat_inc(cpu, part, merges[rw]);
> - else {
> + } else {
> + rq->__initial_sector = rq->__sector;
> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_init_pos(rq));
> part_round_stats(cpu, part);
> part_inc_in_flight(part, rw);
Ok, so idea seems to be that lets keep track of the sector number against
which we do the accounting. Even if we are doing merging later, accounting
sector of the request will not change so that in_flight will not go out
of the sync.
The only thing is that by allowing merging across partitions, one request
can have IO from multiple partitions and all of it being accounted to
only one partition. So it is more of little accounting error. Though I am
not sure how big a issue that is.
This sounds reasonable to me.
> }
> @@ -1776,7 +1778,7 @@ static void blk_account_io_completion(st
> int cpu;
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
> part_stat_add(cpu, part, sectors[rw], bytes >> 9);
> part_stat_unlock();
> }
> @@ -1796,7 +1798,7 @@ static void blk_account_io_done(struct r
> int cpu;
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
> part_stat_inc(cpu, part, ios[rw]);
> part_stat_add(cpu, part, ticks[rw], duration);
> Index: linux-2.6.37-rc4/block/blk-merge.c
> ===================================================================
> --- linux-2.6.37-rc4.orig/block/blk-merge.c 2010-11-30 13:42:04.000000000 +0900
> +++ linux-2.6.37-rc4/block/blk-merge.c 2010-12-07 14:14:55.000000000 +0900
> @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct
> int cpu;
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = disk_map_sector_rcu(req->rq_disk, blk_rq_init_pos(req));
> part_round_stats(cpu, part);
> part_dec_in_flight(part, rq_data_dir(req));
> Index: linux-2.6.37-rc4/include/linux/blkdev.h
> ===================================================================
> --- linux-2.6.37-rc4.orig/include/linux/blkdev.h 2010-11-30 13:42:04.000000000 +0900
> +++ linux-2.6.37-rc4/include/linux/blkdev.h 2010-12-07 14:13:11.000000000 +0900
> @@ -91,6 +91,7 @@ struct request {
> /* the following two fields are internal, NEVER access directly */
> unsigned int __data_len; /* total data len */
> sector_t __sector; /* sector cursor */
> + sector_t __initial_sector;
Would "acct_sector" be a better name. It just means this is the sector
which we will be using for accounting purposes of this rq.
Thanks
Vivek
On 2010-12-07 15:18, Satoru Takeuchi wrote:
> Hi Linus, Yasuaki, and Jens
>
> (2010/12/07 1:08), Linus Torvalds wrote:
>> 2010/12/6 Yasuaki Ishimatsu<[email protected]>:
>>>
>>> The problem is caused by merging different partition's I/Os. So the patch
>>> check whether a merging bio or request is a same partition as a request or not
>>> by using a partition's start sector and size.
>>
>> I really think this is wrong.
>>
>> We should just carry the partition information around in the req and
>> the bio, and just compare the pointers, rather than compare the range.
>> No need to even dereference the pointers, you should be able to just
>> do
>>
>> /* don't merge if not on the same partition */
>> if (bio->part != req->part)
>> return 0;
>>
>> or something.
>>
>> This is doubly true since the accounting already does that horrible
>> partition lookup: rather than look it up, we should just _set_ it in
>> __generic_make_request(), where I think we already know it since we do
>> that whole blk_partition_remap().
>>
>> So just something like the appended (TOTALLY UNTESTED) perhaps?
>>
>> Note that this should get it right even for overlapping partitions etc.
>>
>> Linus
>
> The problem can occur even if your patches are applied. Think about a case
> like the following.
>
> 1) There are 2 partition, sda1 and sda2, on sda.
> 2) Open sda and issue an IO to sda2's first sector. Then sda2's in_flight
> is incremented though you open not sda2 but sda. It is because of
> partition lookup method. It is based on which partition rq->__sector
> sector belongs to.
> 3) Issue an IO to sda1's last sector and it merged to the IO issued in
> step (2) because their part are both sda. In addition, rq->__sector
> is modified to the sda1's region.
> 4) After completing the IO, sda1's in_flight is decremented and diskstat
> is corrupted here.
>
> I think fixing this case is difficult and would cause more complexity.
>
> I hit on another approach. Although it doesn'tprevent any merge as Linus
> preferred, it can fix the problem anyway. In this idea, in_flight is
> incremented and decremented for the partition which the request belonged
> to in its creation. It has the following merits.
I really would prefer if we fixed up the patchset we ended up reverting.
At least that had a purpose with growing struct request, since we saved
on doing the partition lookups.
--
Jens Axboe
Hi Jens,
(2010/12/08 16:33), Jens Axboe wrote:
> On 2010-12-07 15:18, Satoru Takeuchi wrote:
>> I hit on another approach. Although it doesn'tprevent any merge as Linus
>> preferred, it can fix the problem anyway. In this idea, in_flight is
>> incremented and decremented for the partition which the request belonged
>> to in its creation. It has the following merits.
Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
contain Yasuaki's former logic.
https://lkml.org/lkml/2010/10/24/118
>
> I really would prefer if we fixed up the patchset we ended up reverting.
> At least that had a purpose with growing struct request, since we saved
> on doing the partition lookups.
>
Thanks,
Satoru
On 2010-12-08 15:59, Satoru Takeuchi wrote:
> Hi Jens,
>
> (2010/12/08 16:33), Jens Axboe wrote:
>> On 2010-12-07 15:18, Satoru Takeuchi wrote:
>
>>> I hit on another approach. Although it doesn'tprevent any merge as Linus
>>> preferred, it can fix the problem anyway. In this idea, in_flight is
>>> incremented and decremented for the partition which the request belonged
>>> to in its creation. It has the following merits.
>
> Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
> contain Yasuaki's former logic.
>
> https://lkml.org/lkml/2010/10/24/118
Yes I know, that is why I said:
>> I really would prefer if we fixed up the patchset we ended up reverting.
>> At least that had a purpose with growing struct request, since we saved
>> on doing the partition lookups.
That I prefer we fix that code up, since I think it's the best solution
to the problem.
--
Jens Axboe
Hi Jens,
(2010/12/08 17:06), Jens Axboe wrote:
>>>> I hit on another approach. Although it doesn'tprevent any merge as Linus
>>>> preferred, it can fix the problem anyway. In this idea, in_flight is
>>>> incremented and decremented for the partition which the request belonged
>>>> to in its creation. It has the following merits.
>>
>> Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
>> contain Yasuaki's former logic.
>>
>> https://lkml.org/lkml/2010/10/24/118
>
> Yes I know, that is why I said:
>
>>> I really would prefer if we fixed up the patchset we ended up reverting.
>>> At least that had a purpose with growing struct request, since we saved
>>> on doing the partition lookups.
>
> That I prefer we fix that code up, since I think it's the best solution
> to the problem.
>
I already postedit.
https://lkml.org/lkml/2010/12/8/12
I think it is OK without mail subject :-)
Thanks,
Satoru
On 2010-12-08 16:11, Satoru Takeuchi wrote:
> Hi Jens,
>
> (2010/12/08 17:06), Jens Axboe wrote:
>>>>> I hit on another approach. Although it doesn'tprevent any merge as Linus
>>>>> preferred, it can fix the problem anyway. In this idea, in_flight is
>>>>> incremented and decremented for the partition which the request belonged
>>>>> to in its creation. It has the following merits.
>>>
>>> Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
>>> contain Yasuaki's former logic.
>>>
>>> https://lkml.org/lkml/2010/10/24/118
>>
>> Yes I know, that is why I said:
>>
>>>> I really would prefer if we fixed up the patchset we ended up reverting..
>>>> At least that had a purpose with growing struct request, since we saved
>>>> on doing the partition lookups.
>>
>> That I prefer we fix that code up, since I think it's the best solution
>> to the problem.
>>
>
> I already postedit.
>
> https://lkml.org/lkml/2010/12/8/12
>
> I think it is OK without mail subject :-)
No, that's not it at all. What I mean (and what was reverted) was
caching the partition lookup, and using that for the stats. The problem
with that approach turned out to be the elevator queiscing logic not
being fully correct. One easier way to fix that would be to reference
count the part stats, instead of having to drain the queue.
--
Jens Axboe
On Wed, Dec 08, 2010 at 10:46:04PM +0800, Jens Axboe wrote:
> On 2010-12-08 16:11, Satoru Takeuchi wrote:
> > Hi Jens,
> >
> > (2010/12/08 17:06), Jens Axboe wrote:
> >>>>> I hit on another approach. Although it doesn'tprevent any merge as Linus
> >>>>> preferred, it can fix the problem anyway. In this idea, in_flight is
> >>>>> incremented and decremented for the partition which the request belonged
> >>>>> to in its creation. It has the following merits.
> >>>
> >>> Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
> >>> contain Yasuaki's former logic.
> >>>
> >>> https://lkml.org/lkml/2010/10/24/118
> >>
> >> Yes I know, that is why I said:
> >>
> >>>> I really would prefer if we fixed up the patchset we ended up reverting..
> >>>> At least that had a purpose with growing struct request, since we saved
> >>>> on doing the partition lookups.
> >>
> >> That I prefer we fix that code up, since I think it's the best solution
> >> to the problem.
> >>
> >
> > I already postedit.
> >
> > https://lkml.org/lkml/2010/12/8/12
> >
> > I think it is OK without mail subject :-)
>
> No, that's not it at all. What I mean (and what was reverted) was
> caching the partition lookup, and using that for the stats. The problem
> with that approach turned out to be the elevator queiscing logic not
> being fully correct. One easier way to fix that would be to reference
> count the part stats, instead of having to drain the queue.
Taking reference to hd_struct and storing it in rq, will definitely save
us 1 lookup while doing accounting on completion path. It does not save
on rq size though.
IIUC, current patch does not increase the number of existing lookups. So
current situation does not deteriorate with the patch.
But storing a reference in rq and avoiding 1 lookup in completion path
definitely sounds better.
Thanks
Vivek
On Wed, Dec 08, 2010 at 10:51:37AM -0500, Vivek Goyal wrote:
> On Wed, Dec 08, 2010 at 10:46:04PM +0800, Jens Axboe wrote:
> > On 2010-12-08 16:11, Satoru Takeuchi wrote:
> > > Hi Jens,
> > >
> > > (2010/12/08 17:06), Jens Axboe wrote:
> > >>>>> I hit on another approach. Although it doesn'tprevent any merge as Linus
> > >>>>> preferred, it can fix the problem anyway. In this idea, in_flight is
> > >>>>> incremented and decremented for the partition which the request belonged
> > >>>>> to in its creation. It has the following merits.
> > >>>
> > >>> Revert is already finished. 2.6.37-rc-5 and latest stable kernel doesn't
> > >>> contain Yasuaki's former logic.
> > >>>
> > >>> https://lkml.org/lkml/2010/10/24/118
> > >>
> > >> Yes I know, that is why I said:
> > >>
> > >>>> I really would prefer if we fixed up the patchset we ended up reverting..
> > >>>> At least that had a purpose with growing struct request, since we saved
> > >>>> on doing the partition lookups.
> > >>
> > >> That I prefer we fix that code up, since I think it's the best solution
> > >> to the problem.
> > >>
> > >
> > > I already postedit.
> > >
> > > https://lkml.org/lkml/2010/12/8/12
> > >
> > > I think it is OK without mail subject :-)
> >
> > No, that's not it at all. What I mean (and what was reverted) was
> > caching the partition lookup, and using that for the stats. The problem
> > with that approach turned out to be the elevator queiscing logic not
> > being fully correct. One easier way to fix that would be to reference
> > count the part stats, instead of having to drain the queue.
>
> Taking reference to hd_struct and storing it in rq, will definitely save
> us 1 lookup while doing accounting on completion path. It does not save
> on rq size though.
>
> IIUC, current patch does not increase the number of existing lookups. So
> current situation does not deteriorate with the patch.
>
> But storing a reference in rq and avoiding 1 lookup in completion path
> definitely sounds better.
>
Storing a pointer to partition in rq also got the advantage that we can
easily not allow merging of requests across partitions for better
accounting.
Satoru, so yes, if you can implement what jens is suggesting, would be
good.
Thanks
Vivek
On 12/08/2010 04:58 PM, Vivek Goyal wrote:
>
> Storing a pointer to partition in rq also got the advantage that we can
> easily not allow merging of requests across partitions for better
> accounting.
I don't think it really matters (as long as we keep in_flight consistent
of course). The requests that got merged across partition are very likely
rare enough to be neglectable.
Jerome
>
> Satoru, so yes, if you can implement what jens is suggesting, would be
> good.
>
> Thanks
> Vivek
On 12/08/2010 03:46 PM, Jens Axboe wrote:
>
> No, that's not it at all. What I mean (and what was reverted) was
> caching the partition lookup, and using that for the stats. The problem
> with that approach turned out to be the elevator queiscing logic not
> being fully correct. One easier way to fix that would be to reference
> count the part stats, instead of having to drain the queue.
>
The partition is already deleted in a rcu callback function. If I
understand RCU correctly, we just need to use rcu_dereference() each time
we use rq->part. Something like the following *untested* patch.
Jerome
---
diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f..70d23f9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
return;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!new_io)
+ if (!new_io) {
+ part = blk_rq_get_part(rq);
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ rq->part = part;
}
part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
+ rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);
@@ -1776,7 +1784,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = blk_rq_get_part(req);
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
@@ -1796,7 +1804,7 @@ static void blk_account_io_done(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = blk_rq_get_part(req);
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26..0ea5416 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = blk_rq_get_part(req);
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd..00a3b93 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
void *elevator_private3;
struct gendisk *rq_disk;
+ struct hd_struct __rcu *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
@@ -752,6 +753,12 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> 9;
}
+static inline struct hd_struct *blk_rq_get_part(const struct request *rq)
+{
+ struct hd_struct *part = rcu_dereference(rq->part);
+ return part;
+}
+
/*
* Request issue related functions.
*/
On Fri, Dec 10, 2010 at 05:12:04PM +0100, Jerome Marchand wrote:
> On 12/08/2010 03:46 PM, Jens Axboe wrote:
> >
> > No, that's not it at all. What I mean (and what was reverted) was
> > caching the partition lookup, and using that for the stats. The problem
> > with that approach turned out to be the elevator queiscing logic not
> > being fully correct. One easier way to fix that would be to reference
> > count the part stats, instead of having to drain the queue.
> >
>
> The partition is already deleted in a rcu callback function. If I
> understand RCU correctly, we just need to use rcu_dereference() each time
> we use rq->part. Something like the following *untested* patch.
Jerome,
I don't think that rcu_dereference() is going to solve the problem. The
partition table will be freed as soon as one rcu period is over. So to
make sure partition table is not freed one has to be holding
rcu_read_lock(). It is not a good idea to keep rcu period going till
request finishes so a better idea will to to reference count it.
Thanks
Vivek
>
> Jerome
>
> ---
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4ce953f..70d23f9 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
> return;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>
> - if (!new_io)
> + if (!new_io) {
> + part = blk_rq_get_part(rq);
> part_stat_inc(cpu, part, merges[rw]);
> - else {
> + } else {
> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> part_round_stats(cpu, part);
> part_inc_in_flight(part, rw);
> + rq->part = part;
> }
>
> part_stat_unlock();
> @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
> rq->ref_count = 1;
> rq->start_time = jiffies;
> set_start_time_ns(rq);
> + rq->part = NULL;
> }
> EXPORT_SYMBOL(blk_rq_init);
>
> @@ -1776,7 +1784,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = blk_rq_get_part(req);
> part_stat_add(cpu, part, sectors[rw], bytes >> 9);
> part_stat_unlock();
> }
> @@ -1796,7 +1804,7 @@ static void blk_account_io_done(struct request *req)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = blk_rq_get_part(req);
>
> part_stat_inc(cpu, part, ios[rw]);
> part_stat_add(cpu, part, ticks[rw], duration);
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 77b7c26..0ea5416 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -351,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = blk_rq_get_part(req);
>
> part_round_stats(cpu, part);
> part_dec_in_flight(part, rq_data_dir(req));
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index aae86fd..00a3b93 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -115,6 +115,7 @@ struct request {
> void *elevator_private3;
>
> struct gendisk *rq_disk;
> + struct hd_struct __rcu *part;
> unsigned long start_time;
> #ifdef CONFIG_BLK_CGROUP
> unsigned long long start_time_ns;
> @@ -752,6 +753,12 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
> return blk_rq_cur_bytes(rq) >> 9;
> }
>
> +static inline struct hd_struct *blk_rq_get_part(const struct request *rq)
> +{
> + struct hd_struct *part = rcu_dereference(rq->part);
> + return part;
> +}
> +
> /*
> * Request issue related functions.
> */
On 2010-12-10 17:55, Vivek Goyal wrote:
> On Fri, Dec 10, 2010 at 05:12:04PM +0100, Jerome Marchand wrote:
>> On 12/08/2010 03:46 PM, Jens Axboe wrote:
>>>
>>> No, that's not it at all. What I mean (and what was reverted) was
>>> caching the partition lookup, and using that for the stats. The problem
>>> with that approach turned out to be the elevator queiscing logic not
>>> being fully correct. One easier way to fix that would be to reference
>>> count the part stats, instead of having to drain the queue.
>>>
>>
>> The partition is already deleted in a rcu callback function. If I
>> understand RCU correctly, we just need to use rcu_dereference() each time
>> we use rq->part. Something like the following *untested* patch.
>
> Jerome,
>
> I don't think that rcu_dereference() is going to solve the problem. The
> partition table will be freed as soon as one rcu period is over. So to
> make sure partition table is not freed one has to be holding
> rcu_read_lock(). It is not a good idea to keep rcu period going till
> request finishes so a better idea will to to reference count it.
Exactly. The only change you would do to partition handling is ensure
that each io grabs a reference to it and drops it at the end. You need
not even do that in the core bits outside each IO, we just need to
ensure that the partition struct persists in memory even if it is no
longer the valid partition table. The rcu call to free the memory would
happen when the ref drops to zero.
What Vivek says it completely correct, and rcu_dereference() would only
help if you also had rcu_read_lock() over each IO. That is not feasible
at all.
--
Jens Axboe
/proc/diskstats would display a strange output as follows.
$ cat /proc/diskstats |grep sda
8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
~~~~~~~~~~
8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137
Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.
The detailed root cause is as follows.
Assuming that there are two partition, sda1 and sda2.
1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
is 0 and sda2's one is 1.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
2. A bio belongs to sda1 is issued and is merged into the request mentioned on
step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
from sda2 region to sda1 region. However the two partition's
hd_struct->in_flight are not changed.
| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------
3. The request is finished and blk_account_io_done() is called. In this case,
sda2's hd_struct->in_flight, not a sda1's one, is decremented.
| hd_struct->in_flight
---------------------------
sda1 | -1
sda2 | 1
---------------------------
The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.
Also add a refcount to struct hd_struct to keep the partition in
memory as long as users exist.
Signed-off-by: Jerome Marchand <[email protected]>
Signed-off-by: Yasuaki Ishimatsu <[email protected]>
---
block/blk-core.c | 15 ++++++++++-----
block/blk-merge.c | 3 ++-
block/genhd.c | 1 +
fs/partitions/check.c | 11 ++++++++++-
include/linux/blkdev.h | 1 +
include/linux/genhd.h | 2 ++
6 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f..064921d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,16 @@ static void drive_stat_acct(struct request *rq, int new_io)
return;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!new_io)
+ if (!new_io) {
+ part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ kref_get(&part->ref);
+ rq->part = part;
}
part_stat_unlock();
@@ -128,6 +131,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
+ rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);
@@ -1776,7 +1780,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
@@ -1796,13 +1800,14 @@ static void blk_account_io_done(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
part_dec_in_flight(part, rw);
+ kref_put(&part->ref, __delete_partition);
part_stat_unlock();
}
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26..b06b83b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
+ kref_put(&part->ref, __delete_partition);
part_stat_unlock();
}
}
diff --git a/block/genhd.c b/block/genhd.c
index 5fa2b44..0c55eae 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
return NULL;
}
disk->part_tbl->part[0] = &disk->part0;
+ kref_init(&disk->part0.ref);
disk->minors = minors;
rand_initialize_disk(disk);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad..4b5aa9d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -372,6 +372,14 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
put_device(part_to_dev(part));
}
+void __delete_partition(struct kref *ref)
+{
+ struct hd_struct *part =
+ container_of(ref, struct hd_struct, ref);
+
+ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +398,7 @@ void delete_partition(struct gendisk *disk, int partno)
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+ kref_put(&part->ref, __delete_partition);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +497,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
+ kref_init(&p->ref);
return p;
out_free_info:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd..482a7fd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
void *elevator_private3;
struct gendisk *rq_disk;
+ struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7a7b9c1..2ba2792 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
struct disk_stats dkstats;
#endif
struct rcu_head rcu_head;
+ struct kref ref;
};
#define GENHD_FL_REMOVABLE 1
@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
sector_t len, int flags,
struct partition_meta_info
*info);
+extern void __delete_partition(struct kref *ref);
extern void delete_partition(struct gendisk *, int);
extern void printk_all_partitions(void);
On 2010-12-17 14:42, Jerome Marchand wrote:
>
> /proc/diskstats would display a strange output as follows.
[snip]
This looks a lot better! One comment:
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4ce953f..064921d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -64,13 +64,16 @@ static void drive_stat_acct(struct request *rq, int new_io)
> return;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>
> - if (!new_io)
> + if (!new_io) {
> + part = rq->part;
> part_stat_inc(cpu, part, merges[rw]);
> - else {
> + } else {
> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> part_round_stats(cpu, part);
> part_inc_in_flight(part, rw);
> + kref_get(&part->ref);
> + rq->part = part;
> }
>
> part_stat_unlock();
I don't think this is completely safe. The rcu lock is held due to the
part_stat_lock(), but that only prevents the __delete_partition()
callback from happening. Lets say you have this:
CPU0 CPU1
part = disk_map_sector_rcu()
kref_put(part); <- now 0
part_stat_unlock()
__delete_partition();
...
delete_partition_rcu_cb();
merge, or endio, boom
Now rq has ->part pointing to freed memory, later merges or end
accounting will touch freed memory.
I think we can fix this by just having delete_partition_rcu_rb() check
the reference count and return if non-zero. Since someone holds a
reference to the table, they will drop it and we'll re-schedule the rcu
callback.
--
Jens Axboe
On Fri, Dec 17, 2010 at 08:06:09PM +0100, Jens Axboe wrote:
> On 2010-12-17 14:42, Jerome Marchand wrote:
> >
> > /proc/diskstats would display a strange output as follows.
>
> [snip]
>
> This looks a lot better! One comment:
>
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 4ce953f..064921d 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -64,13 +64,16 @@ static void drive_stat_acct(struct request *rq, int new_io)
> > return;
> >
> > cpu = part_stat_lock();
> > - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> >
> > - if (!new_io)
> > + if (!new_io) {
> > + part = rq->part;
> > part_stat_inc(cpu, part, merges[rw]);
> > - else {
> > + } else {
> > + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> > part_round_stats(cpu, part);
> > part_inc_in_flight(part, rw);
> > + kref_get(&part->ref);
> > + rq->part = part;
> > }
> >
> > part_stat_unlock();
>
> I don't think this is completely safe. The rcu lock is held due to the
> part_stat_lock(), but that only prevents the __delete_partition()
> callback from happening. Lets say you have this:
>
> CPU0 CPU1
> part = disk_map_sector_rcu()
> kref_put(part); <- now 0
> part_stat_unlock()
> __delete_partition();
> ...
> delete_partition_rcu_cb();
> merge, or endio, boom
>
> Now rq has ->part pointing to freed memory, later merges or end
> accounting will touch freed memory.
>
> I think we can fix this by just having delete_partition_rcu_rb() check
> the reference count and return if non-zero. Since someone holds a
> reference to the table, they will drop it and we'll re-schedule the rcu
> callback.
This is interesting. Using RCU with kref(). So even if somebody has done
a kref_put() and this is last reference, but rcu period is not over, somebody
can still go and take reference again and set it to 1 again and then
partition will not be freed as delete_partition_rcu_cb() will find it set.
I guess read shall have to be atomic_read() and struct kref is opaque so
one might have to introduce kref_read() or something like that and
possibly update Documentation/kref.txt for this usage of with RCU. I would
also recommend it to get it reviewed from Paul McKenney to make sure this
usage of RCU is fine.
Thanks
Vivek
On 12/17/2010 08:06 PM, Jens Axboe wrote:
> On 2010-12-17 14:42, Jerome Marchand wrote:
>>
>> /proc/diskstats would display a strange output as follows.
>
> [snip]
>
> This looks a lot better! One comment:
>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 4ce953f..064921d 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -64,13 +64,16 @@ static void drive_stat_acct(struct request *rq, int new_io)
>> return;
>>
>> cpu = part_stat_lock();
>> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>>
>> - if (!new_io)
>> + if (!new_io) {
>> + part = rq->part;
>> part_stat_inc(cpu, part, merges[rw]);
>> - else {
>> + } else {
>> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>> part_round_stats(cpu, part);
>> part_inc_in_flight(part, rw);
>> + kref_get(&part->ref);
>> + rq->part = part;
>> }
>>
>> part_stat_unlock();
>
> I don't think this is completely safe. The rcu lock is held due to the
> part_stat_lock(), but that only prevents the __delete_partition()
> callback from happening. Lets say you have this:
>
> CPU0 CPU1
> part = disk_map_sector_rcu()
> kref_put(part); <- now 0
A kref_get(part) happens here, or am I missing something?
If we use an atomic kref_test_and_get() function, which only increment the
refcount if it's not zero, we would be safe. If kref_test_and_get() fails,
we can cache rq->rq_disk->part0 instead. See my drafty patch below.
> part_stat_unlock()
> __delete_partition();
> ...
> delete_partition_rcu_cb();
> merge, or endio, boom
>
> Now rq has ->part pointing to freed memory, later merges or end
> accounting will touch freed memory.
>
> I think we can fix this by just having delete_partition_rcu_rb() check
> the reference count and return if non-zero. Since someone holds a
> reference to the table, they will drop it and we'll re-schedule the rcu
> callback.
>
>
---
diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f..72d12d2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,21 @@ static void drive_stat_acct(struct request *rq, int new_io)
return;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!new_io)
+ if (!new_io) {
+ part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+ if(part->partno && !kref_test_and_get(&part->ref))
+ /*
+ * The partition is already being removed,
+ * the request will be accounted on the disk only
+ */
+ part = &rq->rq_disk->part0;
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ rq->part = part;
}
part_stat_unlock();
@@ -128,6 +136,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
+ rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);
@@ -1776,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
@@ -1796,13 +1805,15 @@ static void blk_account_io_done(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
part_dec_in_flight(part, rw);
+ if (part->partno)
+ kref_put(&part->ref, __delete_partition);
part_stat_unlock();
}
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 77b7c26..3334578 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -351,11 +351,13 @@ static void blk_account_io_merge(struct request *req)
int cpu;
cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
+ if (part->partno)
+ kref_put(&part->ref, __delete_partition);
part_stat_unlock();
}
}
diff --git a/block/genhd.c b/block/genhd.c
index 5fa2b44..0c55eae 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
return NULL;
}
disk->part_tbl->part[0] = &disk->part0;
+ kref_init(&disk->part0.ref);
disk->minors = minors;
rand_initialize_disk(disk);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad..0123717 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -372,6 +372,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
put_device(part_to_dev(part));
}
+void __delete_partition(struct kref *ref)
+{
+ struct hd_struct *part = container_of(ref, struct hd_struct, ref);
+
+ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +397,7 @@ void delete_partition(struct gendisk *disk, int partno)
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+ kref_put(&part->ref, __delete_partition);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +496,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
+ kref_init(&p->ref);
return p;
out_free_info:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd..482a7fd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
void *elevator_private3;
struct gendisk *rq_disk;
+ struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7a7b9c1..2ba2792 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
struct disk_stats dkstats;
#endif
struct rcu_head rcu_head;
+ struct kref ref;
};
#define GENHD_FL_REMOVABLE 1
@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
sector_t len, int flags,
struct partition_meta_info
*info);
+extern void __delete_partition(struct kref *ref);
extern void delete_partition(struct gendisk *, int);
extern void printk_all_partitions(void);
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 6cc38fc..90b9e44 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -23,6 +23,7 @@ struct kref {
void kref_init(struct kref *kref);
void kref_get(struct kref *kref);
+int kref_test_and_get(struct kref *kref);
int kref_put(struct kref *kref, void (*release) (struct kref *kref));
#endif /* _KREF_H_ */
diff --git a/lib/kref.c b/lib/kref.c
index d3d227a..5f663b9 100644
--- a/lib/kref.c
+++ b/lib/kref.c
@@ -37,6 +37,22 @@ void kref_get(struct kref *kref)
}
/**
+ * kref_test_and_get - increment refcount for object only if refcount is not
+ * zero.
+ * @kref: object.
+ *
+ * Return non-zero if the refcount was incremented, 0 otherwise
+ */
+int kref_test_and_get(struct kref *kref)
+{
+ int ret;
+ smp_mb__before_atomic_inc();
+ ret = atomic_inc_not_zero(&kref->refcount);
+ smp_mb__after_atomic_inc();
+ return ret;
+}
+
+/**
* kref_put - decrement refcount for object.
* @kref: object.
* @release: pointer to the function that will clean up the object when the
On Thu, Dec 23, 2010 at 04:10:04PM +0100, Jerome Marchand wrote:
> On 12/17/2010 08:06 PM, Jens Axboe wrote:
> > On 2010-12-17 14:42, Jerome Marchand wrote:
> >>
> >> /proc/diskstats would display a strange output as follows.
> >
> > [snip]
> >
> > This looks a lot better! One comment:
> >
> >> diff --git a/block/blk-core.c b/block/blk-core.c
> >> index 4ce953f..064921d 100644
> >> --- a/block/blk-core.c
> >> +++ b/block/blk-core.c
> >> @@ -64,13 +64,16 @@ static void drive_stat_acct(struct request *rq, int new_io)
> >> return;
> >>
> >> cpu = part_stat_lock();
> >> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> >>
> >> - if (!new_io)
> >> + if (!new_io) {
> >> + part = rq->part;
> >> part_stat_inc(cpu, part, merges[rw]);
> >> - else {
> >> + } else {
> >> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> >> part_round_stats(cpu, part);
> >> part_inc_in_flight(part, rw);
> >> + kref_get(&part->ref);
> >> + rq->part = part;
> >> }
> >>
> >> part_stat_unlock();
> >
> > I don't think this is completely safe. The rcu lock is held due to the
> > part_stat_lock(), but that only prevents the __delete_partition()
> > callback from happening. Lets say you have this:
> >
> > CPU0 CPU1
> > part = disk_map_sector_rcu()
> > kref_put(part); <- now 0
>
> A kref_get(part) happens here, or am I missing something?
It might happen that kref_get(part) is called after kref_put() has been
called and reference has reached 0 and and delete_parition() call has
been scheduled as soon as rcu grace period is over. So if you do
kref_get() after that, it is not going to help.
> If we use an atomic kref_test_and_get() function, which only increment the
> refcount if it's not zero, we would be safe. If kref_test_and_get() fails,
> we can cache rq->rq_disk->part0 instead. See my drafty patch below.
Conceptually it kind of makes sense to me. So if even if we get the
pointer to partition under rcu_read_lock(), we will not account the IO
to partition if it is going away.
>
> > part_stat_unlock()
> > __delete_partition();
> > ...
> > delete_partition_rcu_cb();
> > merge, or endio, boom
> >
> > Now rq has ->part pointing to freed memory, later merges or end
> > accounting will touch freed memory.
> >
> > I think we can fix this by just having delete_partition_rcu_rb() check
> > the reference count and return if non-zero. Since someone holds a
> > reference to the table, they will drop it and we'll re-schedule the rcu
> > callback.
> >
> >
>
>
> ---
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4ce953f..72d12d2 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -64,13 +64,21 @@ static void drive_stat_acct(struct request *rq, int new_io)
> return;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>
> - if (!new_io)
> + if (!new_io) {
> + part = rq->part;
> part_stat_inc(cpu, part, merges[rw]);
> - else {
> + } else {
> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> + if(part->partno && !kref_test_and_get(&part->ref))
> + /*
Do we have to check this part->partno both while taking and releasing
reference. Can't we take one extra reference for disk->part0, at
alloc_disk_node() time so that it is never freed and only freed when
disk is going away and gendisk is being freed.
That way, you don't have to differentiate between disk->part0 and rest
of the partitions while taking or dropping references.
Thanks
Vivek
> + * The partition is already being removed,
> + * the request will be accounted on the disk only
> + */
> + part = &rq->rq_disk->part0;
> part_round_stats(cpu, part);
> part_inc_in_flight(part, rw);
> + rq->part = part;
> }
>
> part_stat_unlock();
> @@ -128,6 +136,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
> rq->ref_count = 1;
> rq->start_time = jiffies;
> set_start_time_ns(rq);
> + rq->part = NULL;
> }
> EXPORT_SYMBOL(blk_rq_init);
>
> @@ -1776,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = req->part;
> part_stat_add(cpu, part, sectors[rw], bytes >> 9);
> part_stat_unlock();
> }
> @@ -1796,13 +1805,15 @@ static void blk_account_io_done(struct request *req)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = req->part;
>
> part_stat_inc(cpu, part, ios[rw]);
> part_stat_add(cpu, part, ticks[rw], duration);
> part_round_stats(cpu, part);
> part_dec_in_flight(part, rw);
>
> + if (part->partno)
> + kref_put(&part->ref, __delete_partition);
> part_stat_unlock();
> }
> }
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 77b7c26..3334578 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -351,11 +351,13 @@ static void blk_account_io_merge(struct request *req)
> int cpu;
>
> cpu = part_stat_lock();
> - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
> + part = req->part;
>
> part_round_stats(cpu, part);
> part_dec_in_flight(part, rq_data_dir(req));
>
> + if (part->partno)
> + kref_put(&part->ref, __delete_partition);
> part_stat_unlock();
> }
> }
> diff --git a/block/genhd.c b/block/genhd.c
> index 5fa2b44..0c55eae 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
> return NULL;
> }
> disk->part_tbl->part[0] = &disk->part0;
> + kref_init(&disk->part0.ref);
>
> disk->minors = minors;
> rand_initialize_disk(disk);
> diff --git a/fs/partitions/check.c b/fs/partitions/check.c
> index 0a8b0ad..0123717 100644
> --- a/fs/partitions/check.c
> +++ b/fs/partitions/check.c
> @@ -372,6 +372,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
> put_device(part_to_dev(part));
> }
>
> +void __delete_partition(struct kref *ref)
> +{
> + struct hd_struct *part = container_of(ref, struct hd_struct, ref);
> +
> + call_rcu(&part->rcu_head, delete_partition_rcu_cb);
> +}
> +
> void delete_partition(struct gendisk *disk, int partno)
> {
> struct disk_part_tbl *ptbl = disk->part_tbl;
> @@ -390,7 +397,7 @@ void delete_partition(struct gendisk *disk, int partno)
> kobject_put(part->holder_dir);
> device_del(part_to_dev(part));
>
> - call_rcu(&part->rcu_head, delete_partition_rcu_cb);
> + kref_put(&part->ref, __delete_partition);
> }
>
> static ssize_t whole_disk_show(struct device *dev,
> @@ -489,6 +496,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
> if (!dev_get_uevent_suppress(ddev))
> kobject_uevent(&pdev->kobj, KOBJ_ADD);
>
> + kref_init(&p->ref);
> return p;
>
> out_free_info:
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index aae86fd..482a7fd 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -115,6 +115,7 @@ struct request {
> void *elevator_private3;
>
> struct gendisk *rq_disk;
> + struct hd_struct *part;
> unsigned long start_time;
> #ifdef CONFIG_BLK_CGROUP
> unsigned long long start_time_ns;
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 7a7b9c1..2ba2792 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -116,6 +116,7 @@ struct hd_struct {
> struct disk_stats dkstats;
> #endif
> struct rcu_head rcu_head;
> + struct kref ref;
> };
>
> #define GENHD_FL_REMOVABLE 1
> @@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
> sector_t len, int flags,
> struct partition_meta_info
> *info);
> +extern void __delete_partition(struct kref *ref);
> extern void delete_partition(struct gendisk *, int);
> extern void printk_all_partitions(void);
>
> diff --git a/include/linux/kref.h b/include/linux/kref.h
> index 6cc38fc..90b9e44 100644
> --- a/include/linux/kref.h
> +++ b/include/linux/kref.h
> @@ -23,6 +23,7 @@ struct kref {
>
> void kref_init(struct kref *kref);
> void kref_get(struct kref *kref);
> +int kref_test_and_get(struct kref *kref);
> int kref_put(struct kref *kref, void (*release) (struct kref *kref));
>
> #endif /* _KREF_H_ */
> diff --git a/lib/kref.c b/lib/kref.c
> index d3d227a..5f663b9 100644
> --- a/lib/kref.c
> +++ b/lib/kref.c
> @@ -37,6 +37,22 @@ void kref_get(struct kref *kref)
> }
>
> /**
> + * kref_test_and_get - increment refcount for object only if refcount is not
> + * zero.
> + * @kref: object.
> + *
> + * Return non-zero if the refcount was incremented, 0 otherwise
> + */
> +int kref_test_and_get(struct kref *kref)
> +{
> + int ret;
> + smp_mb__before_atomic_inc();
> + ret = atomic_inc_not_zero(&kref->refcount);
> + smp_mb__after_atomic_inc();
> + return ret;
> +}
> +
> +/**
> * kref_put - decrement refcount for object.
> * @kref: object.
> * @release: pointer to the function that will clean up the object when the
On 12/23/2010 04:39 PM, Vivek Goyal wrote:
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 4ce953f..72d12d2 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -64,13 +64,21 @@ static void drive_stat_acct(struct request *rq, int new_io)
>> return;
>>
>> cpu = part_stat_lock();
>> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>>
>> - if (!new_io)
>> + if (!new_io) {
>> + part = rq->part;
>> part_stat_inc(cpu, part, merges[rw]);
>> - else {
>> + } else {
>> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
>> + if(part->partno && !kref_test_and_get(&part->ref))
>> + /*
>
> Do we have to check this part->partno both while taking and releasing
> reference. Can't we take one extra reference for disk->part0, at
> alloc_disk_node() time so that it is never freed and only freed when
> disk is going away and gendisk is being freed.
>
> That way, you don't have to differentiate between disk->part0 and rest
> of the partitions while taking or dropping references.
We could do it in any way, as long as we don't end up trying to free
disk->part0. I choose not to touch part0->ref at all, but we could also
drop all the part->partno test, and get a reference on part0 when we use
it as a backup. I have no strong opinion about a way or an other.
Jerome
>
> Thanks
> Vivek
On Thu, Dec 23, 2010 at 06:04:11PM +0100, Jerome Marchand wrote:
> On 12/23/2010 04:39 PM, Vivek Goyal wrote:
> >> diff --git a/block/blk-core.c b/block/blk-core.c
> >> index 4ce953f..72d12d2 100644
> >> --- a/block/blk-core.c
> >> +++ b/block/blk-core.c
> >> @@ -64,13 +64,21 @@ static void drive_stat_acct(struct request *rq, int new_io)
> >> return;
> >>
> >> cpu = part_stat_lock();
> >> - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> >>
> >> - if (!new_io)
> >> + if (!new_io) {
> >> + part = rq->part;
> >> part_stat_inc(cpu, part, merges[rw]);
> >> - else {
> >> + } else {
> >> + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> >> + if(part->partno && !kref_test_and_get(&part->ref))
> >> + /*
> >
> > Do we have to check this part->partno both while taking and releasing
> > reference. Can't we take one extra reference for disk->part0, at
> > alloc_disk_node() time so that it is never freed and only freed when
> > disk is going away and gendisk is being freed.
> >
> > That way, you don't have to differentiate between disk->part0 and rest
> > of the partitions while taking or dropping references.
>
> We could do it in any way, as long as we don't end up trying to free
> disk->part0. I choose not to touch part0->ref at all, but we could also
> drop all the part->partno test, and get a reference on part0 when we use
> it as a backup. I have no strong opinion about a way or an other.
Ok, I personally like taking an extra reference to disk->part0 and put
a proper comment there so that at rest of the places we don't try to
differentiate between part0 and other partitions.
Having said that I am not too concerned about it. It is just one of the
minor details. So post the new version of patch after testing.
Thanks
Vivek