blk_queue_start_tag and blk_queue_end_tag are called to tag
I/O to a SCSI device that is capable of TCQ. blk_queue_find_tag is
a function that utilizes the tag information built up on every I/O.
However, there aren't many consumers of blk_queue_find_tag, except
NCR53c700 and tekram-dc390. The vast majority of SCSI drivers don't
use these tags currently. So why bother building them at the beginning
of an I/O and then tearing them all down at the end, doing all that
hard work when no other function in the kernel appears to care?
Is there another big scheme in the works to use these tags? If not,
I'd like to propose we add a sysfs attribute to dynamically control
whether the kernel maintains blk request tags or not. This has the
performance advantage that we don't needlessly waste CPU cycles on
things we throw away without ever using them. Would the following
patch be acceptable? Comments?
Signed-off-by: Ken Chen <[email protected]>
--- linux-2.6.14-rc3/drivers/block/ll_rw_blk.c.orig 2005-10-06 19:11:28.452697852 -0700
+++ linux-2.6.14-rc3/drivers/block/ll_rw_blk.c 2005-10-06 19:11:43.198791421 -0700
@@ -757,7 +757,7 @@ static void __blk_queue_free_tags(reques
}
q->queue_tags = NULL;
- q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
+ q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED | 1 << QUEUE_FLAG_TAGGED);
}
/**
@@ -771,6 +771,7 @@ static void __blk_queue_free_tags(reques
void blk_queue_free_tags(request_queue_t *q)
{
clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+ clear_bit(QUEUE_FLAG_TAGGED, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_free_tags);
@@ -838,6 +839,7 @@ int blk_queue_init_tags(request_queue_t
if ((rc = blk_queue_resize_tags(q, depth)))
return rc;
set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
+ set_bit(QUEUE_FLAG_TAGGED, &q->queue_flags);
return 0;
} else
atomic_inc(&tags->refcnt);
@@ -846,7 +848,7 @@ int blk_queue_init_tags(request_queue_t
* assign it, all done
*/
q->queue_tags = tags;
- q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
+ q->queue_flags |= (1 << QUEUE_FLAG_QUEUED | 1 << QUEUE_FLAG_TAGGED);
return 0;
fail:
kfree(tags);
@@ -3589,6 +3591,26 @@ static ssize_t queue_max_hw_sectors_show
return queue_var_show(max_hw_sectors_kb, (page));
}
+static ssize_t
+queue_tag_maint_store(struct request_queue *q, const char *page, size_t count)
+{
+ unsigned long tag_maint;
+ ssize_t ret = queue_var_store(&tag_maint, page, count);
+
+ if (blk_queue_queued(q)) {
+ if (tag_maint)
+ set_bit(QUEUE_FLAG_TAGGED, &q->queue_flags);
+ else
+ clear_bit(QUEUE_FLAG_TAGGED, &q->queue_flags);
+ }
+
+ return ret;
+}
+
+static ssize_t queue_tag_maint_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(blk_queue_tagged(q), (page));
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -3619,12 +3641,19 @@ static struct queue_sysfs_entry queue_io
.store = elv_iosched_store,
};
+static struct queue_sysfs_entry queue_tag_maint_entry = {
+ .attr = {.name = "tag_maint", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_tag_maint_show,
+ .store = queue_tag_maint_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_iosched_entry.attr,
+ &queue_tag_maint_entry.attr,
NULL,
};
--- linux-2.6.14-rc3/include/linux/blkdev.h.orig 2005-10-06 19:14:45.193906379 -0700
+++ linux-2.6.14-rc3/include/linux/blkdev.h 2005-10-06 19:18:17.228083470 -0700
@@ -436,9 +436,11 @@ enum {
#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
#define QUEUE_FLAG_DRAIN 8 /* draining queue for sched switch */
#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */
+#define QUEUE_FLAG_TAGGED 10 /* maintain per request tag */
#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
-#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
+#define blk_queue_queued(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
+#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_TAGGED, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_flushing(q) test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags)
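For reference, if a patch along these lines were applied, the new attribute
could be flipped from userspace. A minimal sketch (the tag_maint name comes
from the RFC above; the device name "sda" is only an example):

	/*
	 * Hypothetical userspace helper for the proposed knob: write "0" to
	 * .../queue/tag_maint to stop block layer tag maintenance, "1" to
	 * resume it.  Assumes the RFC patch above is applied.
	 */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/block/sda/queue/tag_maint", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fputs("0\n", f);	/* disable per-request tag maintenance */
		fclose(f);
		return 0;
	}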
On Thu, Oct 06 2005, Chen, Kenneth W wrote:
> blk_queue_start_tag and blk_queue_end_tag are called to tag
> I/O to a SCSI device that is capable of TCQ. blk_queue_find_tag is
> a function that utilizes the tag information built up on every I/O.
>
> However, there aren't many consumers of blk_queue_find_tag, except
> NCR53c700 and tekram-dc390. The vast majority of SCSI drivers don't
> use these tags currently. So why bother building them at the beginning
> of an I/O and then tearing them all down at the end, doing all that
> hard work when no other function in the kernel appears to care?
>
> Is there another big scheme in the works to use these tags? If not,
> I'd like to propose we add a sysfs attribute to dynamically control
> whether the kernel maintains blk request tags or not. This has the
> performance advantage that we don't needlessly waste CPU cycles on
> things we throw away without ever using them. Would the following
> patch be acceptable? Comments?
I don't understand the need for this patch - the generic tagging is only
used if the SCSI LLD indicated it wanted it by issuing a
scsi_activate_tcq(). So blk_queue_start_tag() is only called if the LLD
already did a scsi_activate_tcq(), and blk_queue_end_tag() is only
called if the rq is block layer tagged. blk_queue_find_tag() is only
used with direct use of scsi_find_tag(), a function that should (and is)
only usable by users of the generic tagging already.
So please, a description of what problem you are trying to solve would
be appreciated :-)
--
Jens Axboe
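To illustrate the gating Jens describes: a completion path only touches the
generic tagging when the request was actually tagged, and a request only gets
a tag if the LLD opted in earlier via scsi_activate_tcq(). A simplified
sketch of that shape (not the literal 2.6.14 midlayer code):

	static void example_end_io(request_queue_t *q, struct request *rq)
	{
		/* only set for requests that went through blk_queue_start_tag() */
		if (blk_rq_tagged(rq))
			blk_queue_end_tag(q, rq);	/* return the tag to the map */

		/* ...the rest of normal completion processing... */
	}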
Jens Axboe wrote on Friday, October 07, 2005 12:24 AM
> I don't understand the need for this patch - the generic tagging is only
> used if the SCSI LLD indicated it wanted it by issuing a
> scsi_activate_tcq(). So blk_queue_start_tag() is only called if the LLD
> already did a scsi_activate_tcq(), and blk_queue_end_tag() is only
> called if the rq is block layer tagged. blk_queue_find_tag() is only
> used with direct use of scsi_find_tag(), a function that should (and is)
> only usable by users of the generic tagging already.
>
You beat me by a couple of minutes. I was about to say that the culprit
is the qla2x00 driver, which unnecessarily activates generic blk tag
queuing by calling scsi_activate_tcq() but never uses the tags.
> So please, a description of what problem you are trying to solve would
> be appreciated :-)
It starts out with scsi_end_request being a fairly hot function in the
execution profile, then I noticed blk_queue_start/end_tag() being
called but with no actual consumer of the tag. I'm trying to find a
way to avoid making these blk_queue_start/end_tag calls. I got the
answer now. The proper way is to fix it in the SCSI LLDD. Scratch this
patch, new patch to follow :-)
- Ken
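For context, scsi_activate_tcq() is what arms the generic block layer tagging
in the first place. Roughly (paraphrased from the 2.6.x include/scsi/scsi_tcq.h,
so treat this as a sketch rather than the exact source):

	static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth)
	{
		if (!sdev->tagged_supported)
			return;

		/* this is the part that turns on block layer tag maintenance */
		if (!blk_queue_tagged(sdev->request_queue))
			blk_queue_init_tags(sdev->request_queue, depth, NULL);

		scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG, depth);
	}

So any LLD that calls it, as qla2x00 did, pays for blk_queue_start_tag() and
blk_queue_end_tag() on every request whether or not it ever looks at rq->tag.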
On Fri, Oct 07 2005, Chen, Kenneth W wrote:
> Jens Axboe wrote on Friday, October 07, 2005 12:24 AM
> > I don't understand the need for this patch - the generic tagging is only
> > used if the SCSI LLD indicated it wanted it by issuing a
> > scsi_activate_tcq(). So blk_queue_start_tag() is only called if the LLD
> > already did a scsi_activate_tcq(), and blk_queue_end_tag() is only
> > called if the rq is block layer tagged. blk_queue_find_tag() is only
> > used with direct use of scsi_find_tag(), a function that should (and is)
> > only usable by users of the generic tagging already.
> >
>
> You beat me by a couple of minutes. I was about to say that the culprit
> is the qla2x00 driver, which unnecessarily activates generic blk tag
> queuing by calling scsi_activate_tcq() but never uses the tags.
>
> > So please, a description of what problem you are trying to solve would
> > be appreciated :-)
>
> It starts out with scsi_end_request being a fairly hot function in the
> execution profile, then I noticed blk_queue_start/end_tag() being
> called but with no actual consumer of the tag. I'm trying to find a
> way to avoid making these blk_queue_start/end_tag calls. I got the
> answer now. The proper way is to fix it in the SCSI LLDD. Scratch this
> patch, new patch to follow :-)
Ok that makes more sense! But it's a little worrying that
blk_queue_end_tag() would show up as hot in the profile, it is actually
quite lean.
--
Jens Axboe
On Fri, 2005-10-07 at 09:41 +0200, Jens Axboe wrote:
> Ok that makes more sense! But it's a little worrying that
> blk_queue_end_tag() would show up as hot in the profile, it is actually
> quite lean.
it probably is just the first one to touch the IO structures after the
completion, and thus gets the penalty for the cache miss. Something has
to take that hit after IO completion (the IO usually started > 10 msec ago
after all, and usually on another CPU at that), and my experience is that
it's one of those jello elephants; you can only move it around but not
really avoid it.
Jens Axboe wrote on Friday, October 07, 2005 12:42 AM
> > It starts out with scsi_end_request being a fairly hot function in the
> > execution profile, then I noticed blk_queue_start/end_tag() being
> > called but with no actual consumer of the tag. I'm trying to find a
> > way to avoid making these blk_queue_start/end_tag calls. I got the
> > answer now. The proper way is to fix it in the SCSI LLDD. Scratch this
> > patch, new patch to follow :-)
>
> Ok that makes more sense! But it's a little worrying that
> blk_queue_end_tag() would show up as hot in the profile, it is actually
> quite lean.
It's probably a very small number that I'm chasing by avoiding blk
layer tagging. Nevertheless, any number, no matter how small, is a gold
mine to me :-)
Latest execution profile taken with the 2.6.14-rc2 kernel under an
"industry standard transaction processing database workload". The first
column is clock ticks (a direct measure of time), the 2nd column is
instructions retired, and the 3rd column is the number of L3 misses
that occurred inside the function.
Symbol Clockticks Inst. Retired L3 Misses
scsi_request_fn 8.12% 9.27% 11.18%
Schedule 6.52% 4.93% 7.26%
scsi_end_request 4.44% 3.59% 6.76%
__blockdev_direct_IO 4.28% 4.38% 3.98%
__make_request 3.59% 4.16% 3.47%
__wake_up 2.46% 1.56% 3.33%
dio_bio_end_io 2.14% 1.67% 3.18%
aio_complete 2.05% 1.27% 3.56%
kmem_cache_free 1.95% 1.70% 0.71%
kmem_cache_alloc 1.45% 1.84% 0.45%
put_page 1.42% 0.60% 1.27%
follow_hugetlb_page 1.41% 0.75% 1.27%
__generic_file_aio_read 1.37% 0.36% 1.68%
On Fri, Oct 07 2005, Arjan van de Ven wrote:
> On Fri, 2005-10-07 at 09:41 +0200, Jens Axboe wrote:
> > Ok that makes more sense! But it's a little worrying that
> > blk_queue_end_tag() would show up as hot in the profile, it is actually
> > quite lean.
>
> it probably is just the first one to touch the IO structures after the
> completion, and thus gets the penalty for the cache miss. Something has
> to take that hit after IO completion (the IO usually started > 10 msec ago
> after all, and usually on another CPU at that), and my experience is that
> it's one of those jello elephants; you can only move it around but not
> really avoid it.
That thought did occur to me, but I don't really see how that can be the
case. The ->queue_tags should be cache hot if you repeatedly call that
function, since that will never change. The request itself has been
touched by scsi_end_request() already, so unless the layout is really
bad we shouldn't need to fetch a lot of cache lines there. That leaves
__test_and_clear_bit(), I guess that must be it.
--
Jens Axboe
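For readers without the 2.6.14 source handy, blk_queue_end_tag() really is
short. Roughly (simplified from drivers/block/ll_rw_blk.c of that era, with
the error paths omitted, so treat this as a sketch):

	void blk_queue_end_tag(request_queue_t *q, struct request *rq)
	{
		struct blk_queue_tag *bqt = q->queue_tags;	/* usually cache hot */
		int tag = rq->tag;

		/* the tag map word is the one line likely to be cold here */
		if (!__test_and_clear_bit(tag, bqt->tag_map))
			return;

		list_del_init(&rq->queuelist);
		rq->flags &= ~REQ_QUEUED;
		rq->tag = -1;
		bqt->tag_index[tag] = NULL;
		bqt->busy--;
	}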
Crap... (my silly mail client doesn't line up tabs correctly).
Reposting the profile.
Latest execution profile taken with the 2.6.14-rc2 kernel under an
"industry standard transaction processing database workload". The first
column is clock ticks (a direct measure of time), the 2nd column is
instructions retired, and the 3rd column is the number of L3 misses
that occurred inside the function.
Symbol                    Clockticks    Inst. Retired    L3 Misses
scsi_request_fn                8.12%            9.27%       11.18%
Schedule                       6.52%            4.93%        7.26%
scsi_end_request               4.44%            3.59%        6.76%
__blockdev_direct_IO           4.28%            4.38%        3.98%
__make_request                 3.59%            4.16%        3.47%
__wake_up                      2.46%            1.56%        3.33%
dio_bio_end_io                 2.14%            1.67%        3.18%
aio_complete                   2.05%            1.27%        3.56%
kmem_cache_free                1.95%            1.70%        0.71%
kmem_cache_alloc               1.45%            1.84%        0.45%
put_page                       1.42%            0.60%        1.27%
follow_hugetlb_page            1.41%            0.75%        1.27%
__generic_file_aio_read        1.37%            0.36%        1.68%
On Fri, Oct 07 2005, Chen, Kenneth W wrote:
> Jens Axboe wrote on Friday, October 07, 2005 12:42 AM
> > > It starts out with scsi_end_request being a fairly hot function in the
> > > execution profile, then I noticed blk_queue_start/end_tag() being
> > > called but with no actual consumer of the tag. I'm trying to find a
> > > way to avoid making these blk_queue_start/end_tag calls. I got the
> > > answer now. The proper way is to fix it in the SCSI LLDD. Scratch this
> > > patch, new patch to follow :-)
> >
> > Ok that makes more sense! But it's a little worrying that
> > blk_queue_end_tag() would show up as hot in the profile, it is actually
> > quite lean.
>
> It's probably a very small number that I'm chasing by avoiding blk
> layer tagging. Nevertheless, any number, no matter how small, is a gold
> mine to me :-)
>
> Latest execution profile taken with the 2.6.14-rc2 kernel under an
> "industry standard transaction processing database workload". The first
> column is clock ticks (a direct measure of time), the 2nd column is
> instructions retired, and the 3rd column is the number of L3 misses
> that occurred inside the function.
>
> Symbol Clockticks Inst. Retired L3 Misses
> scsi_request_fn 8.12% 9.27% 11.18%
> Schedule 6.52% 4.93% 7.26%
> scsi_end_request 4.44% 3.59% 6.76%
> __blockdev_direct_IO 4.28% 4.38% 3.98%
> __make_request 3.59% 4.16% 3.47%
> __wake_up 2.46% 1.56% 3.33%
> dio_bio_end_io 2.14% 1.67% 3.18%
> aio_complete 2.05% 1.27% 3.56%
> kmem_cache_free 1.95% 1.70% 0.71%
> kmem_cache_alloc 1.45% 1.84% 0.45%
> put_page 1.42% 0.60% 1.27%
> follow_hugetlb_page 1.41% 0.75% 1.27%
> __generic_file_aio_read 1.37% 0.36% 1.68%
The above looks pretty much as expected. What change in profile did you
see when eliminating the blk_queue_end_tag() call?
--
Jens Axboe
Jens Axboe wrote on Friday, October 07, 2005 1:08 AM
> > It's probably a very small number that I'm chasing by avoiding blk
> > layer tagging. Nevertheless, any number, no matter how small, is a gold
> > mine to me :-)
> >
> > Latest execution profile taken with the 2.6.14-rc2 kernel under an
> > "industry standard transaction processing database workload". The first
> > column is clock ticks (a direct measure of time), the 2nd column is
> > instructions retired, and the 3rd column is the number of L3 misses
> > that occurred inside the function.
> >
> > Symbol Clockticks Inst. Retired L3 Misses
> > scsi_request_fn 8.12% 9.27% 11.18%
> > Schedule 6.52% 4.93% 7.26%
> > scsi_end_request 4.44% 3.59% 6.76%
> > __blockdev_direct_IO 4.28% 4.38% 3.98%
> > __make_request 3.59% 4.16% 3.47%
> > __wake_up 2.46% 1.56% 3.33%
> > dio_bio_end_io 2.14% 1.67% 3.18%
> > aio_complete 2.05% 1.27% 3.56%
> > kmem_cache_free 1.95% 1.70% 0.71%
> > kmem_cache_alloc 1.45% 1.84% 0.45%
> > put_page 1.42% 0.60% 1.27%
> > follow_hugetlb_page 1.41% 0.75% 1.27%
> > __generic_file_aio_read 1.37% 0.36% 1.68%
>
> The above looks pretty much as expected. What change in profile did you
> see when eliminating the blk_queue_end_tag() call?
I haven't made any measurements yet. The original patch was an RFC, and
I wanted to hear opinions from the experts first. I will do a measurement
with the change in the qla2x00 driver and let you know how much
difference it makes. Most likely, clock ticks in scsi_request_fn and
scsi_end_request should be reduced. It will be interesting to see the L3
miss stats as well.
- Ken
> The request itself has been
> touched by scsi_end_request() already, so unless the layout is really
> bad we shouldn't need to fetch a lot of cache lines there. That leaves
> __test_and_clear_bit(), I guess that must be it.
__test_and_clear_bit() is cheap; it's a single non-locked instruction.
Arguably it should be converted into a C variant so that the compiler
can pick the best code sequence (I'm not so sure the asm we picked for
this is optimal for all processors, I bet it'll be microcoded) but I'd
be surprised if it'd be more than a handful of cycles no matter what.
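As an illustration of the kind of plain-C variant Arjan mentions, something
along these lines would let the compiler pick the code sequence (a sketch
only, not the actual asm-generic implementation):

	/* One possible plain-C (non-atomic) test-and-clear, for illustration. */
	static inline int my_test_and_clear_bit(int nr, volatile unsigned long *addr)
	{
		unsigned long mask = 1UL << (nr % BITS_PER_LONG);
		unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
		unsigned long old = *p;

		*p = old & ~mask;		/* clear the bit */
		return (old & mask) != 0;	/* report whether it was set */
	}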
On Fri, 07 Oct 2005, Chen, Kenneth W wrote:
> Jens Axboe wrote on Friday, October 07, 2005 1:08 AM
> > > It's probably a very small number that I'm chasing by avoiding blk
> > > layer tagging. Nevertheless, any number, no matter how small, is a gold
> > > mine to me :-)
> > >
> > > Latest execution profile taken with the 2.6.14-rc2 kernel under an
> > > "industry standard transaction processing database workload". The first
> > > column is clock ticks (a direct measure of time), the 2nd column is
> > > instructions retired, and the 3rd column is the number of L3 misses
> > > that occurred inside the function.
> > >
> > > Symbol Clockticks Inst. Retired L3 Misses
> > > scsi_request_fn 8.12% 9.27% 11.18%
> > > Schedule 6.52% 4.93% 7.26%
> > > scsi_end_request 4.44% 3.59% 6.76%
> > > __blockdev_direct_IO 4.28% 4.38% 3.98%
> > > __make_request 3.59% 4.16% 3.47%
> > > __wake_up 2.46% 1.56% 3.33%
> > > dio_bio_end_io 2.14% 1.67% 3.18%
> > > aio_complete 2.05% 1.27% 3.56%
> > > kmem_cache_free 1.95% 1.70% 0.71%
> > > kmem_cache_alloc 1.45% 1.84% 0.45%
> > > put_page 1.42% 0.60% 1.27%
> > > follow_hugetlb_page 1.41% 0.75% 1.27%
> > > __generic_file_aio_read 1.37% 0.36% 1.68%
> >
> > The above looks pretty much as expected. What change in profile did you
> > see when eliminating the blk_queue_end_tag() call?
>
> I haven't made any measurements yet. The original patch was an RFC, and
> I wanted to hear opinions from the experts first. I will do a measurement
> with the change in the qla2x00 driver and let you know how much
> difference it makes. Most likely, clock ticks in scsi_request_fn and
> scsi_end_request should be reduced. It will be interesting to see the L3
> miss stats as well.
Yes, please let us know what your benchmarking measurements show. I
take it you are planning on doing something like:
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1101,10 +1101,7 @@ qla2xxx_slave_configure(struct scsi_devi
scsi_qla_host_t *ha = to_qla_host(sdev->host);
struct fc_rport *rport = starget_to_rport(sdev->sdev_target);
- if (sdev->tagged_supported)
- scsi_activate_tcq(sdev, 32);
- else
- scsi_deactivate_tcq(sdev, 32);
+ scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), depth);
rport->dev_loss_tmo = ha->port_down_retry_count + 5;
Thanks,
Andrew Vasquez
On Fri, Oct 07 2005, Andrew Vasquez wrote:
> On Fri, 07 Oct 2005, Chen, Kenneth W wrote:
>
> > Jens Axboe wrote on Friday, October 07, 2005 1:08 AM
> > > > It's probably a very small number that I'm chasing by avoiding blk
> > > > layer tagging. Nevertheless, any number, no matter how small, is a gold
> > > > mine to me :-)
> > > >
> > > > Latest execution profile taken with the 2.6.14-rc2 kernel under an
> > > > "industry standard transaction processing database workload". The first
> > > > column is clock ticks (a direct measure of time), the 2nd column is
> > > > instructions retired, and the 3rd column is the number of L3 misses
> > > > that occurred inside the function.
> > > >
> > > > Symbol Clockticks Inst. Retired L3 Misses
> > > > scsi_request_fn 8.12% 9.27% 11.18%
> > > > Schedule 6.52% 4.93% 7.26%
> > > > scsi_end_request 4.44% 3.59% 6.76%
> > > > __blockdev_direct_IO 4.28% 4.38% 3.98%
> > > > __make_request 3.59% 4.16% 3.47%
> > > > __wake_up 2.46% 1.56% 3.33%
> > > > dio_bio_end_io 2.14% 1.67% 3.18%
> > > > aio_complete 2.05% 1.27% 3.56%
> > > > kmem_cache_free 1.95% 1.70% 0.71%
> > > > kmem_cache_alloc 1.45% 1.84% 0.45%
> > > > put_page 1.42% 0.60% 1.27%
> > > > follow_hugetlb_page 1.41% 0.75% 1.27%
> > > > __generic_file_aio_read 1.37% 0.36% 1.68%
> > >
> > > The above looks pretty much as expected. What change in profile did you
> > > see when eliminating the blk_queue_end_tag() call?
> >
> > I haven't made any measurements yet. The original patch was an RFC, and
> > I wanted to hear opinions from the experts first. I will do a measurement
> > with the change in the qla2x00 driver and let you know how much
> > difference it makes. Most likely, clock ticks in scsi_request_fn and
> > scsi_end_request should be reduced. It will be interesting to see the L3
> > miss stats as well.
>
> Yes, please let us know what your benchmarking measurements show. I
> take it you are planning on doing something like:
>
> --- a/drivers/scsi/qla2xxx/qla_os.c
> +++ b/drivers/scsi/qla2xxx/qla_os.c
> @@ -1101,10 +1101,7 @@ qla2xxx_slave_configure(struct scsi_devi
> scsi_qla_host_t *ha = to_qla_host(sdev->host);
> struct fc_rport *rport = starget_to_rport(sdev->sdev_target);
>
> - if (sdev->tagged_supported)
> - scsi_activate_tcq(sdev, 32);
> - else
> - scsi_deactivate_tcq(sdev, 32);
> + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), depth);
>
> rport->dev_loss_tmo = ha->port_down_retry_count + 5;
Ken later posted the patch to linux-scsi; it's very similar to yours.
I doubt the profile will show a significant change, but it is indeed
a little silly to set up the block layer queueing if it's not going to
be used. Doing the work twice in different code paths is never going to
be a win :-)
--
Jens Axboe