2009-12-21 02:28:30

by Shaohua Li

[permalink] [raw]
Subject: [RFC]block: add a new flag to make request complete on submitted cpu

We already have a QUEUE_FLAG_SAME_COMP, which makes request complete
on the first cpu of a mc/ht, but this isn't sufficient. In a system
with fast block devices (intel SSD), it turns out the first cpu is
bottleneck. Add a flag to make request complete on cpu where request
is submitted. The flag implies QUEUE_FLAG_SAME_COMP. By default, it is off.

My test machine has two CPUs and 4 intel SSD. Without the new flag,
the io throughput is about 400m/s; with it, the throughput is about 500m/s.

Signed-off-by: Shaohua Li <[email protected]>
---
block/blk-core.c | 2 +-
block/blk-softirq.c | 2 +-
block/blk-sysfs.c | 33 +++++++++++++++++++++++++++++++++
block/blk.h | 9 +++++++--
include/linux/blkdev.h | 3 ++-
5 files changed, 44 insertions(+), 5 deletions(-)

Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c
+++ linux-2.6/block/blk-sysfs.c
@@ -233,6 +233,32 @@ queue_rq_affinity_store(struct request_q
return ret;
}

+static ssize_t queue_rq_samecpu_show(struct request_queue *q, char *page)
+{
+ bool set = test_bit(QUEUE_FLAG_SAME_CPU, &q->queue_flags);
+
+ return queue_var_show(set, page);
+}
+
+static ssize_t
+queue_rq_samecpu_store(struct request_queue *q, const char *page, size_t count)
+{
+ ssize_t ret = -EINVAL;
+#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+ unsigned long val;
+
+ ret = queue_var_store(&val, page, count);
+ spin_lock_irq(q->queue_lock);
+ if (val) {
+ queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+ queue_flag_set(QUEUE_FLAG_SAME_CPU, q);
+ } else
+ queue_flag_clear(QUEUE_FLAG_SAME_CPU, q);
+ spin_unlock_irq(q->queue_lock);
+#endif
+ return ret;
+}
+
static ssize_t queue_iostats_show(struct request_queue *q, char *page)
{
return queue_var_show(blk_queue_io_stat(q), page);
@@ -341,6 +367,12 @@ static struct queue_sysfs_entry queue_rq
.store = queue_rq_affinity_store,
};

+static struct queue_sysfs_entry queue_rq_samecpu_entry = {
+ .attr = {.name = "rq_samecpu", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_rq_samecpu_show,
+ .store = queue_rq_samecpu_store,
+};
+
static struct queue_sysfs_entry queue_iostats_entry = {
.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
.show = queue_iostats_show,
@@ -365,6 +397,7 @@ static struct attribute *default_attrs[]
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
+ &queue_rq_samecpu_entry.attr,
NULL,
};

Index: linux-2.6/block/blk.h
===================================================================
--- linux-2.6.orig/block/blk.h
+++ linux-2.6/block/blk.h
@@ -140,10 +140,15 @@ static inline int queue_congestion_off_t

#endif /* BLK_DEV_INTEGRITY */

-static inline int blk_cpu_to_group(int cpu)
+static inline int blk_cpu_to_group(struct request_queue *q, int cpu)
{
+ const struct cpumask *mask;
+
+ if (test_bit(QUEUE_FLAG_SAME_CPU, &q->queue_flags))
+ return cpu;
+
#ifdef CONFIG_SCHED_MC
- const struct cpumask *mask = cpu_coregroup_mask(cpu);
+ mask = cpu_coregroup_mask(cpu);
return cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
return cpumask_first(topology_thread_cpumask(cpu));
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h
+++ linux-2.6/include/linux/blkdev.h
@@ -455,7 +455,7 @@ struct request_queue
#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
#define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
#define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */
+#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU group */
#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */
#define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */
#define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */
@@ -463,6 +463,7 @@ struct request_queue
#define QUEUE_FLAG_IO_STAT 15 /* do IO stats */
#define QUEUE_FLAG_CQ 16 /* hardware does queuing */
#define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */
+#define QUEUE_FLAG_SAME_CPU 18 /* force complete on same CPU */

#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_CLUSTER) | \
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -1267,7 +1267,7 @@ get_rq:
spin_lock_irq(q->queue_lock);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
bio_flagged(bio, BIO_CPU_AFFINE))
- req->cpu = blk_cpu_to_group(smp_processor_id());
+ req->cpu = blk_cpu_to_group(q, smp_processor_id());
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
Index: linux-2.6/block/blk-softirq.c
===================================================================
--- linux-2.6.orig/block/blk-softirq.c
+++ linux-2.6/block/blk-softirq.c
@@ -111,7 +111,7 @@ void __blk_complete_request(struct reque

local_irq_save(flags);
cpu = smp_processor_id();
- group_cpu = blk_cpu_to_group(cpu);
+ group_cpu = blk_cpu_to_group(q, cpu);

/*
* Select completion CPU


2009-12-21 09:10:33

by Jens Axboe

[permalink] [raw]
Subject: Re: [RFC]block: add a new flag to make request complete on submitted cpu

On Mon, Dec 21 2009, Shaohua Li wrote:
> We already have a QUEUE_FLAG_SAME_COMP, which makes request complete
> on the first cpu of a mc/ht, but this isn't sufficient. In a system
> with fast block devices (intel SSD), it turns out the first cpu is
bottleneck. Add a flag to make request complete on cpu where request
> is submitted. The flag implies QUEUE_FLAG_SAME_COMP. By default, it is off.

It was a lazy trick to avoid doing any round robin work in there.

> My test machine has two CPUs and 4 intel SSD. Without the new flag,
the io throughput is about 400m/s; with it, the throughput is about 500m/s.

So I think we should just fix it, I still think the group logic makes
sense. But instead of always going for the first one, let it complete
locally if part of the group, if not send to specific submitter CPU.

--
Jens Axboe

2009-12-21 12:12:32

by Jens Axboe

[permalink] [raw]
Subject: Re: [RFC]block: add a new flag to make request complete on submitted cpu

On Mon, Dec 21 2009, Jens Axboe wrote:
> On Mon, Dec 21 2009, Shaohua Li wrote:
> > We already have a QUEUE_FLAG_SAME_COMP, which makes request complete
> > on the first cpu of a mc/ht, but this isn't sufficient. In a system
> > with fast block devices (intel SSD), it turns out the first cpu is
> > bottleneck. Add a flag to make request complete on cpu where request
> > is submitted. The flag implies QUEUE_FLAG_SAME_COMP. By default, it is off.
>
> It was a lazy trick to avoid doing any round robin work in there.
>
> > My test machine has two CPUs and 4 intel SSD. Without the new flag,
> > the io throughput is about 400m/s; with it, the throughput is about 500m/s.
>
> So I think we should just fix it, I still think the group logic makes
> sense. But instead of always going for the first one, let it complete
> locally if part of the group, if not send to specific submitter CPU.

Is this enough? It renames cpu to local_cpu and ccpu to target_cpu to
make things clearer to read, the real change is that we allow local
completion if the cpu matches OR the group matches.

We just want to keep it as cache local as we can, I don't think we
should add a new sysfs flag to control group vs specific CPU completion.
That's exposing too much detail.

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ee9c216..300dc90 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -105,23 +105,28 @@ void __blk_complete_request(struct request *req)
{
struct request_queue *q = req->q;
unsigned long flags;
- int ccpu, cpu, group_cpu;
+ int target_cpu, local_cpu, group_cpu;

BUG_ON(!q->softirq_done_fn);

local_irq_save(flags);
- cpu = smp_processor_id();
- group_cpu = blk_cpu_to_group(cpu);
+ local_cpu = smp_processor_id();
+ group_cpu = blk_cpu_to_group(local_cpu);

/*
* Select completion CPU
*/
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
- ccpu = req->cpu;
+ target_cpu = req->cpu;
else
- ccpu = cpu;
+ target_cpu = local_cpu;

- if (ccpu == cpu || ccpu == group_cpu) {
+ /*
+ * If the target_cpu is same as the local one or from the same group,
+ * complete locally
+ */
+ if (target_cpu == local_cpu ||
+ blk_cpu_to_group(target_cpu) == group_cpu) {
struct list_head *list;
do_local:
list = &__get_cpu_var(blk_cpu_done);
@@ -135,7 +140,7 @@ do_local:
*/
if (list->next == &req->csd.list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
- } else if (raise_blk_irq(ccpu, req))
+ } else if (raise_blk_irq(target_cpu, req))
goto do_local;

local_irq_restore(flags);

--
Jens Axboe

2009-12-22 01:38:53

by Shaohua Li

[permalink] [raw]
Subject: Re: [RFC]block: add a new flag to make request complete on submitted cpu

On Mon, Dec 21, 2009 at 08:12:28PM +0800, Jens Axboe wrote:
> On Mon, Dec 21 2009, Jens Axboe wrote:
> > On Mon, Dec 21 2009, Shaohua Li wrote:
> > > We already have a QUEUE_FLAG_SAME_COMP, which makes request complete
> > > on the first cpu of a mc/ht, but this isn't sufficient. In a system
> > > with fast block devices (intel SSD), it turns out the first cpu is
> > > bottleneck. Add a flag to make request complete on cpu where request
> > > is submitted. The flag implies QUEUE_FLAG_SAME_COMP. By default, it is off.
> >
> > It was a lazy trick to avoid doing any round robin work in there.
> >
> > > My test machine has two CPUs and 4 intel SSD. Without the new flag,
> > > the io throughput is about 400m/s; with it, the throughput is about 500m/s.
> >
> > So I think we should just fix it, I still think the group logic makes
> > sense. But instead of always going for the first one, let it complete
> > locally if part of the group, if not send to specific submitter CPU.
>
> Is this enough? It renames cpu to local_cpu and ccpu to target_cpu to
> make things clearer to read, the real change is that we allow local
> completion if the cpu matches OR the group matches.
No. The interrupt is fired on one CPU, so the change is a nop. All requests
are handled by the CPU to which the interrupt is directed.
AHCI supports multiple MSI vectors but does not support per-vector masking,
which makes it hard to add multiple MSI support.

Thanks,
Shaohua