Hi,
Since I'll be on vacation next week, I thought I'd send this out in
case people wanted to play with it. It works here, but I haven't done
any performance numbers at all.
Patches 1-7 are all preparation patches for #8, which contains the
real changes. I'm not particularly happy with the arch implementation
for raising a softirq on another CPU, but it should be fast enough
to suffice for testing.
Anyway, this patchset is mainly meant as a playground for testing IO
affinity. It allows you to set three values per queue, see the files
in the /sys/block/<dev>/queue directory:
completion_affinity
        Only allow completions to happen on the defined CPU mask.
queue_affinity
        Only allow queuing to happen on the defined CPU mask.
rq_affinity
        Always complete a request on the same CPU that queued it.
As you can tell, there's some overlap to allow for experimentation.
rq_affinity will override completion_affinity, so it's possible to
have completions on a CPU that isn't set in that mask. The interface
is currently limited to all CPUs or a specific CPU, but the implementation
supports (and works with) CPU masks. The logic is in
blk_queue_set_cpumask(); it should be easy enough to change this to
echo a full mask, or allow OR'ing of CPU masks when a new CPU is passed in.
For now, echo a CPU number to set that CPU, or use -1 to set all CPUs.
The default is all CPUs, so there is no change in behaviour.
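To illustrate, assuming a device called sdb (the name is just an example),
the knobs are driven like this:

    # queue and complete IO for sdb on CPU 1 only
    echo 1 > /sys/block/sdb/queue/queue_affinity
    echo 1 > /sys/block/sdb/queue/completion_affinity

    # or leave both masks at all CPUs and just complete each request
    # on the CPU that queued it
    echo -1 > /sys/block/sdb/queue/queue_affinity
    echo -1 > /sys/block/sdb/queue/completion_affinity
    echo 1 > /sys/block/sdb/queue/rq_affinity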
Patch set is against current git as of this morning. The code is also in
the block git repo, branch is io-cpu-affinity.
git://git.kernel.dk/linux-2.6-block.git io-cpu-affinity
--
Jens Axboe
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/workqueue.h | 1 +
kernel/workqueue.c | 15 +++++++++++++++
2 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 7f28c32..4c46944 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -179,6 +179,7 @@ __create_workqueue_key(const char *name, int singlethread,
extern void destroy_workqueue(struct workqueue_struct *wq);
extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
+extern int FASTCALL(queue_work_on_cpu(struct workqueue_struct *wq, struct work_struct *work, int cpu));
extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *work, unsigned long delay));
extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52db48e..551ad7e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -175,6 +175,21 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
}
EXPORT_SYMBOL_GPL(queue_work);
+int fastcall queue_work_on_cpu(struct workqueue_struct *wq,
+ struct work_struct *work, int cpu)
+{
+ int ret = 0;
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+ BUG_ON(!list_empty(&work->entry));
+ __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ ret = 1;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_on_cpu);
+
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
--
1.5.4.22.g7a20
Signed-off-by: Jens Axboe <[email protected]>
---
block/Makefile | 3 +-
block/blk-core.c | 88 -------------------------------------------
block/blk-softirq.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 105 insertions(+), 89 deletions(-)
create mode 100644 block/blk-softirq.c
diff --git a/block/Makefile b/block/Makefile
index 5a43c7d..b064190 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,7 +4,8 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o
+ blk-exec.o blk-merge.o blk-softirq.o ioctl.o genhd.o \
+ scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 4afb39c..11d79f6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,8 +26,6 @@
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
@@ -50,8 +48,6 @@ struct kmem_cache *blk_requestq_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
static void drive_stat_acct(struct request *rq, int new_io)
{
int rw = rq_data_dir(rq);
@@ -1605,82 +1601,6 @@ static int __end_that_request_first(struct request *req, int error,
}
/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, donelist);
- list_del_init(&rq->donelist);
- rq->q->softirq_done_fn(rq);
- }
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- &__get_cpu_var(blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
- }
-
- return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
- .notifier_call = blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions,
- * unless the driver actually implements this in its completion callback
- * through requeueing. The actual completion happens out-of-order,
- * through a softirq handler. The user must have registered a completion
- * callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
- struct list_head *cpu_list;
- unsigned long flags;
-
- BUG_ON(!req->q->softirq_done_fn);
-
- local_irq_save(flags);
-
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_add_tail(&req->donelist, cpu_list);
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
-/*
* queue lock must be held
*/
static void end_that_request_last(struct request *req, int error)
@@ -2004,8 +1924,6 @@ EXPORT_SYMBOL(kblockd_flush_work);
int __init blk_dev_init(void)
{
- int i;
-
kblockd_workqueue = create_workqueue("kblockd");
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
@@ -2016,12 +1934,6 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("blkdev_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
- register_hotcpu_notifier(&blk_cpu_notifier);
-
return 0;
}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
new file mode 100644
index 0000000..05f9451
--- /dev/null
+++ b/block/blk-softirq.c
@@ -0,0 +1,103 @@
+/*
+ * Functions related to softirq rq completions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include "blk.h"
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ &__get_cpu_var(blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+
+static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+ .notifier_call = blk_cpu_notify,
+};
+
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+ struct list_head *cpu_list, local_list;
+
+ local_irq_disable();
+ cpu_list = &__get_cpu_var(blk_cpu_done);
+ list_replace_init(cpu_list, &local_list);
+ local_irq_enable();
+
+ while (!list_empty(&local_list)) {
+ struct request *rq;
+
+ rq = list_entry(local_list.next, struct request, donelist);
+ list_del_init(&rq->donelist);
+ rq->q->softirq_done_fn(rq);
+ }
+}
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req: the request being processed
+ *
+ * Description:
+ * Ends all I/O on a request. It does not handle partial completions,
+ * unless the driver actually implements this in its completion callback
+ * through requeueing. The actual completion happens out-of-order,
+ * through a softirq handler. The user must have registered a completion
+ * callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+ struct list_head *cpu_list;
+ unsigned long flags;
+
+ BUG_ON(!req->q->softirq_done_fn);
+
+ local_irq_save(flags);
+
+ cpu_list = &__get_cpu_var(blk_cpu_done);
+ list_add_tail(&req->donelist, cpu_list);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_complete_request);
+
+int __init blk_softirq_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+ register_hotcpu_notifier(&blk_cpu_notifier);
+ return 0;
+}
+subsys_initcall(blk_softirq_init);
--
1.5.4.22.g7a20
Signed-off-by: Jens Axboe <[email protected]>
---
arch/x86/kernel/smp_32.c | 15 +++++++++++++++
arch/x86/kernel/smpboot_32.c | 3 +++
include/asm-x86/hw_irq_32.h | 1 +
include/asm-x86/mach-default/entry_arch.h | 1 +
include/asm-x86/mach-default/irq_vectors.h | 1 +
5 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index dc0cde9..668b8a4 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -672,6 +672,21 @@ void smp_call_function_interrupt(struct pt_regs *regs)
}
}
+fastcall void smp_raise_block_softirq(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ ack_APIC_irq();
+ local_irq_save(flags);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void arch_raise_softirq_on_cpu(int cpu, unsigned int nr)
+{
+ send_IPI_mask(cpumask_of_cpu(cpu), BLOCK_SOFTIRQ_VECTOR);
+}
+
static int convert_apicid_to_cpu(int apic_id)
{
int i;
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
index 579b9b7..ca35697 100644
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -1304,6 +1304,9 @@ void __init smp_intr_init(void)
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+ /* IPI for scheduling block softirq */
+ set_intr_gate(BLOCK_SOFTIRQ_VECTOR, raise_block_softirq);
}
/*
diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h
index ea88054..0a053d8 100644
--- a/include/asm-x86/hw_irq_32.h
+++ b/include/asm-x86/hw_irq_32.h
@@ -32,6 +32,7 @@ extern void (*const interrupt[NR_IRQS])(void);
void reschedule_interrupt(void);
void invalidate_interrupt(void);
void call_function_interrupt(void);
+void raise_block_softirq(void);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h
index bc86146..e96ed61 100644
--- a/include/asm-x86/mach-default/entry_arch.h
+++ b/include/asm-x86/mach-default/entry_arch.h
@@ -13,6 +13,7 @@
BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(raise_block_softirq,BLOCK_SOFTIRQ_VECTOR);
#endif
/*
diff --git a/include/asm-x86/mach-default/irq_vectors.h b/include/asm-x86/mach-default/irq_vectors.h
index 881c63c..f7535a3 100644
--- a/include/asm-x86/mach-default/irq_vectors.h
+++ b/include/asm-x86/mach-default/irq_vectors.h
@@ -48,6 +48,7 @@
#define INVALIDATE_TLB_VECTOR 0xfd
#define RESCHEDULE_VECTOR 0xfc
#define CALL_FUNCTION_VECTOR 0xfb
+#define BLOCK_SOFTIRQ_VECTOR 0xfa
#define THERMAL_APIC_VECTOR 0xf0
/*
--
1.5.4.22.g7a20
Signed-off-by: Jens Axboe <[email protected]>
---
arch/x86/kernel/entry_64.S | 3 +++
arch/x86/kernel/i8259_64.c | 3 +++
arch/x86/kernel/smp_64.c | 15 +++++++++++++++
include/asm-x86/hw_irq_64.h | 2 ++
4 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c7341e8..753834b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -704,6 +704,9 @@ END(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
END(call_function_interrupt)
+ENTRY(raise_block_softirq)
+ apicinterrupt BLOCK_SOFTIRQ_VECTOR,smp_raise_block_softirq
+END(raise_block_softirq);
ENTRY(irq_move_cleanup_interrupt)
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
END(irq_move_cleanup_interrupt)
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index fa57a15..cd7675b 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -494,6 +494,9 @@ void __init native_init_IRQ(void)
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ /* IPI for raising block softirq on another CPU */
+ set_intr_gate(BLOCK_SOFTIRQ_VECTOR, raise_block_softirq);
+
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
#endif
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
index 2fd74b0..fe4c2bf 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/smp_64.c
@@ -460,6 +460,21 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
}
EXPORT_SYMBOL(smp_call_function);
+asmlinkage void smp_raise_block_softirq(void)
+{
+ unsigned long flags;
+
+ ack_APIC_irq();
+ local_irq_save(flags);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void arch_raise_softirq_on_cpu(int cpu, unsigned int nr)
+{
+ send_IPI_mask(cpumask_of_cpu(cpu), BLOCK_SOFTIRQ_VECTOR);
+}
+
static void stop_this_cpu(void *dummy)
{
local_irq_disable();
diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h
index 312a58d..98ed167 100644
--- a/include/asm-x86/hw_irq_64.h
+++ b/include/asm-x86/hw_irq_64.h
@@ -68,6 +68,7 @@
#define ERROR_APIC_VECTOR 0xfe
#define RESCHEDULE_VECTOR 0xfd
#define CALL_FUNCTION_VECTOR 0xfc
+#define BLOCK_SOFTIRQ_VECTOR 0xfb
/* fb free - please don't readd KDB here because it's useless
(hint - think what a NMI bit does to a vector) */
#define THERMAL_APIC_VECTOR 0xfa
@@ -102,6 +103,7 @@ void spurious_interrupt(void);
void error_interrupt(void);
void reschedule_interrupt(void);
void call_function_interrupt(void);
+void raise_block_softirq(void);
void irq_move_cleanup_interrupt(void);
void invalidate_interrupt0(void);
void invalidate_interrupt1(void);
--
1.5.4.22.g7a20
Signed-off-by: Jens Axboe <[email protected]>
---
arch/ia64/kernel/smp.c | 14 ++++++++++++++
1 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index 4e446aa..96f8ffa 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -80,6 +80,7 @@ static volatile struct call_data_struct *call_data;
#define IPI_CALL_FUNC 0
#define IPI_CPU_STOP 1
#define IPI_KDUMP_CPU_STOP 3
+#define IPI_BLOCK_SOFTIRQ 4
/* This needs to be cacheline aligned because it is written to by *other* CPUs. */
static DEFINE_PER_CPU_SHARED_ALIGNED(u64, ipi_operation);
@@ -174,6 +175,14 @@ handle_IPI (int irq, void *dev_id)
unw_init_running(kdump_cpu_freeze, NULL);
break;
#endif
+ case IPI_BLOCK_SOFTIRQ: {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_restore(flags);
+ break;
+ }
default:
printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
break;
@@ -461,6 +470,11 @@ smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wai
}
EXPORT_SYMBOL(smp_call_function);
+void arch_raise_softirq_on_cpu(int cpuid, unsigned int nr)
+{
+ send_IPI_single(cpuid, IPI_BLOCK_SOFTIRQ);
+}
+
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
--
1.5.4.22.g7a20
Preparatory patch for checking queuing affinity.
Signed-off-by: Jens Axboe <[email protected]>
---
block/as-iosched.c | 6 +++---
block/blk-core.c | 8 ++++----
block/cfq-iosched.c | 2 +-
include/linux/blkdev.h | 2 +-
4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 8c39467..6ef766f 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -450,7 +450,7 @@ static void as_antic_stop(struct as_data *ad)
del_timer(&ad->antic_timer);
ad->antic_status = ANTIC_FINISHED;
/* see as_work_handler */
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(ad->q, &ad->antic_work);
}
}
@@ -471,7 +471,7 @@ static void as_antic_timeout(unsigned long data)
aic = ad->io_context->aic;
ad->antic_status = ANTIC_FINISHED;
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(q, &ad->antic_work);
if (aic->ttime_samples == 0) {
/* process anticipated on has exited or timed out*/
@@ -831,7 +831,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
}
if (ad->changed_batch && ad->nr_dispatched == 1) {
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(q, &ad->antic_work);
ad->changed_batch = 0;
if (ad->batch_data_dir == REQ_SYNC)
diff --git a/block/blk-core.c b/block/blk-core.c
index 11d79f6..34a475b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -282,7 +282,7 @@ void blk_unplug_timeout(unsigned long data)
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
- kblockd_schedule_work(&q->unplug_work);
+ kblockd_schedule_work(q, &q->unplug_work);
}
void blk_unplug(struct request_queue *q)
@@ -323,7 +323,7 @@ void blk_start_queue(struct request_queue *q)
clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
} else {
blk_plug_device(q);
- kblockd_schedule_work(&q->unplug_work);
+ kblockd_schedule_work(q, &q->unplug_work);
}
}
EXPORT_SYMBOL(blk_start_queue);
@@ -391,7 +391,7 @@ void blk_run_queue(struct request_queue *q)
clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
} else {
blk_plug_device(q);
- kblockd_schedule_work(&q->unplug_work);
+ kblockd_schedule_work(q, &q->unplug_work);
}
}
@@ -1910,7 +1910,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
rq->rq_disk = bio->bi_bdev->bd_disk;
}
-int kblockd_schedule_work(struct work_struct *work)
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ca198e6..91d1126 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -235,7 +235,7 @@ static inline int cfq_bio_sync(struct bio *bio)
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues)
- kblockd_schedule_work(&cfqd->unplug_work);
+ kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
}
static int cfq_queue_empty(struct request_queue *q)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90392a9..5e43772 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -803,7 +803,7 @@ static inline void put_dev_sector(Sector p)
}
struct work_struct;
-int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
void kblockd_flush_work(struct work_struct *work);
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
--
1.5.4.22.g7a20
This only works for the block softirq, due to the hackish method of
the arch implementations.
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/smp.h | 5 +++++
kernel/softirq.c | 7 +++++++
2 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 55232cc..2b546c0 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -61,6 +61,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
* Call a function on all processors
*/
int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
+extern void FASTCALL(raise_softirq_on_cpu(int cpu, unsigned int nr));
#define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */
#define MSG_ALL 0x8001
@@ -112,6 +113,10 @@ static inline void smp_send_reschedule(int cpu) { }
})
#define smp_call_function_mask(mask, func, info, wait) \
(up_smp_call_function(func, info))
+#define raise_softirq_on_cpu(cpu, nr) do { \
+ WARN_ON(!irqs_disabled()); \
+ raise_softirq_irqoff((nr)); \
+} while (0)
#endif /* !SMP */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7837d4..0bad5c3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -663,4 +663,11 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
return ret;
}
EXPORT_SYMBOL(on_each_cpu);
+
+extern void arch_raise_softirq_on_cpu(int, unsigned int);
+void fastcall raise_softirq_on_cpu(int cpu, unsigned int nr)
+{
+ BUG_ON(nr != BLOCK_SOFTIRQ);
+ arch_raise_softirq_on_cpu(cpu, nr);
+}
#endif
--
1.5.4.22.g7a20
Supports several modes:
- Force IO queue affinity to a specific mask of CPUs
- Force IO completion affinity to a specific mask of CPUs
- Force completion of a request on the same CPU that queued it
Test code so far.
Signed-off-by: Jens Axboe <[email protected]>
---
block/blk-core.c | 80 ++++++++++++++++++++++++-------------
block/blk-settings.c | 47 +++++++++++++++++++++
block/blk-softirq.c | 105 ++++++++++++++++++++++++++++++++++++++----------
block/blk-sysfs.c | 85 ++++++++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 8 ++++
5 files changed, 275 insertions(+), 50 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 34a475b..bda4623 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -103,6 +103,7 @@ void rq_init(struct request_queue *q, struct request *rq)
INIT_LIST_HEAD(&rq->queuelist);
INIT_LIST_HEAD(&rq->donelist);
+ rq->cpu = -1;
rq->errors = 0;
rq->bio = rq->biotail = NULL;
INIT_HLIST_NODE(&rq->hash);
@@ -180,6 +181,11 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
+static inline int blk_is_io_cpu(struct request_queue *q)
+{
+ return cpu_isset(smp_processor_id(), q->queue_cpu);
+}
+
/*
* "plug" the device if there are no outstanding requests: this will
* force the transfer to start only after we have put all the requests
@@ -200,6 +206,10 @@ void blk_plug_device(struct request_queue *q)
return;
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
+ /*
+ * no need to care about the io cpu here, since the
+ * timeout handler needs to punt to kblockd anyway
+ */
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
}
@@ -299,6 +309,22 @@ void blk_unplug(struct request_queue *q)
}
EXPORT_SYMBOL(blk_unplug);
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+ /*
+ * one level of recursion is ok and is much faster than kicking
+ * the unplug handling
+ */
+ if (blk_is_io_cpu(q) &&
+ !test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
+ q->request_fn(q);
+ clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
+ } else {
+ set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+ kblockd_schedule_work(q, &q->unplug_work);
+ }
+}
+
/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
@@ -313,18 +339,7 @@ void blk_start_queue(struct request_queue *q)
WARN_ON(!irqs_disabled());
clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
-
- /*
- * one level of recursion is ok and is much faster than kicking
- * the unplug handling
- */
- if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
- q->request_fn(q);
- clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
- } else {
- blk_plug_device(q);
- kblockd_schedule_work(q, &q->unplug_work);
- }
+ blk_invoke_request_fn(q);
}
EXPORT_SYMBOL(blk_start_queue);
@@ -381,19 +396,8 @@ void blk_run_queue(struct request_queue *q)
spin_lock_irqsave(q->queue_lock, flags);
blk_remove_plug(q);
- /*
- * Only recurse once to avoid overrunning the stack, let the unplug
- * handling reinvoke the handler shortly if we already got there.
- */
- if (!elv_queue_empty(q)) {
- if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
- q->request_fn(q);
- clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
- } else {
- blk_plug_device(q);
- kblockd_schedule_work(q, &q->unplug_work);
- }
- }
+ if (!elv_queue_empty(q))
+ blk_invoke_request_fn(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
@@ -453,6 +457,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
+ cpus_setall(q->queue_cpu);
+ cpus_setall(q->complete_cpu);
+ q->force_same_complete = 0;
q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
q->backing_dev_info.unplug_io_data = q;
err = bdi_init(&q->backing_dev_info);
@@ -857,7 +864,10 @@ EXPORT_SYMBOL(blk_get_request);
*/
void blk_start_queueing(struct request_queue *q)
{
- if (!blk_queue_plugged(q))
+ if (!blk_is_io_cpu(q)) {
+ set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+ kblockd_schedule_work(q, &q->unplug_work);
+ } else if (!blk_queue_plugged(q))
q->request_fn(q);
else
__generic_unplug_device(q);
@@ -1160,13 +1170,18 @@ get_rq:
init_request_from_bio(req, bio);
spin_lock_irq(q->queue_lock);
+ if (q->force_same_complete)
+ req->cpu = smp_processor_id();
if (elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
if (sync)
__generic_unplug_device(q);
-
+ if (cpu_isset(smp_processor_id(), q->queue_cpu))
+ q->local_queue++;
+ else
+ q->remote_queue++;
spin_unlock_irq(q->queue_lock);
return 0;
@@ -1912,7 +1927,16 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
{
- return queue_work(kblockd_workqueue, work);
+ int cpu;
+
+ if (blk_is_io_cpu(q))
+ return queue_work(kblockd_workqueue, work);
+
+ /*
+ * would need to be improved, of course...
+ */
+ cpu = first_cpu(q->queue_cpu);
+ return queue_work_on_cpu(kblockd_workqueue, work, cpu);
}
EXPORT_SYMBOL(kblockd_schedule_work);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c8d0c57..2126139 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -386,6 +386,53 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
+static int blk_queue_set_cpumask(cpumask_t *cpumask, int cpu)
+{
+ if (cpu == -1)
+ cpus_setall(*cpumask);
+ else if (!cpu_isset(cpu, cpu_possible_map)) {
+ cpus_setall(*cpumask);
+ return -EINVAL;
+ } else {
+ cpus_clear(*cpumask);
+ cpu_set(cpu, *cpumask);
+ }
+
+ return 0;
+}
+
+/**
+ * blk_queue_set_completion_cpu - Set IO CPU for completions
+ * @q: the request queue for the device
+ * @cpu: cpu
+ *
+ * Description:
+ * This function allows a driver to set a CPU that should handle completions
+ * for this device.
+ *
+ **/
+int blk_queue_set_completion_cpu(struct request_queue *q, int cpu)
+{
+ return blk_queue_set_cpumask(&q->complete_cpu, cpu);
+}
+EXPORT_SYMBOL(blk_queue_set_completion_cpu);
+
+/**
+ * blk_queue_set_queue_cpu - Set IO CPU for queuing
+ * @q: the request queue for the device
+ * @cpu: cpu
+ *
+ * Description:
+ * This function allows a driver to set a CPU that should handle queuing
+ * for this device.
+ *
+ **/
+int blk_queue_set_queue_cpu(struct request_queue *q, int cpu)
+{
+ return blk_queue_set_cpumask(&q->queue_cpu, cpu);
+}
+EXPORT_SYMBOL(blk_queue_set_queue_cpu);
+
int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 05f9451..5ae8345 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -11,29 +11,51 @@
#include "blk.h"
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+struct blk_comp {
+ spinlock_t lock;
+ struct list_head list;
+ int dead;
+};
+static DEFINE_PER_CPU(struct blk_comp, blk_cpu_done);
static int __cpuinit blk_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
+ int cpu = (unsigned long) hcpu;
+ struct blk_comp *bc;
+
+ switch (action) {
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN: {
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ struct blk_comp *dead;
local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- &__get_cpu_var(blk_cpu_done));
+ bc = &__get_cpu_var(blk_cpu_done);
+ dead = &per_cpu(blk_cpu_done, cpu);
+ double_spin_lock(&bc->lock, &dead->lock, bc < dead);
+ list_splice_init(&dead->list, &bc->list);
+ dead->dead = 1;
+ double_spin_unlock(&bc->lock, &dead->lock, bc < dead);
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
+ break;
+ }
+ case CPU_ONLINE: {
+ local_irq_disable();
+ bc = &per_cpu(blk_cpu_done, cpu);
+ bc->dead = 0;
+ local_irq_enable();
+ break;
+ }
}
return NOTIFY_OK;
}
-
static struct notifier_block blk_cpu_notifier __cpuinitdata = {
.notifier_call = blk_cpu_notify,
};
@@ -44,11 +66,14 @@ static struct notifier_block blk_cpu_notifier __cpuinitdata = {
*/
static void blk_done_softirq(struct softirq_action *h)
{
- struct list_head *cpu_list, local_list;
+ struct blk_comp *bc;
+ struct list_head local_list;
local_irq_disable();
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
+ bc = &__get_cpu_var(blk_cpu_done);
+ spin_lock(&bc->lock);
+ list_replace_init(&bc->list, &local_list);
+ spin_unlock(&bc->lock);
local_irq_enable();
while (!list_empty(&local_list)) {
@@ -71,30 +96,66 @@ static void blk_done_softirq(struct softirq_action *h)
* through a softirq handler. The user must have registered a completion
* callback through blk_queue_softirq_done().
**/
-
void blk_complete_request(struct request *req)
{
- struct list_head *cpu_list;
+ int ccpu, cpu, was_empty;
+ struct request_queue *q = req->q;
+ struct blk_comp *bc;
unsigned long flags;
- BUG_ON(!req->q->softirq_done_fn);
+ BUG_ON(!q->softirq_done_fn);
local_irq_save(flags);
+ cpu = smp_processor_id();
+
+ if (q->force_same_complete && req->cpu != -1)
+ ccpu = req->cpu;
+ else if (cpu_isset(cpu, q->complete_cpu))
+ ccpu = cpu;
+ else
+ ccpu = first_cpu(q->complete_cpu);
+
+ if (ccpu == cpu) {
+is_dead:
+ bc = &__get_cpu_var(blk_cpu_done);
+ q->local_complete++;
+ } else {
+ bc = &per_cpu(blk_cpu_done, ccpu);
+ if (unlikely(bc->dead)) {
+ ccpu = cpu;
+ goto is_dead;
+ }
+
+ q->remote_complete++;
+ }
+
+ spin_lock(&bc->lock);
+ was_empty = list_empty(&bc->list);
+ list_add_tail(&req->donelist, &bc->list);
+ spin_unlock(&bc->lock);
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_add_tail(&req->donelist, cpu_list);
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ if (was_empty) {
+ if (cpu == ccpu)
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ else
+ raise_softirq_on_cpu(ccpu, BLOCK_SOFTIRQ);
+ }
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_complete_request);
-int __init blk_softirq_init(void)
+static int __init blk_softirq_init(void)
{
int i;
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+ for_each_possible_cpu(i) {
+ struct blk_comp *bc = &per_cpu(blk_cpu_done, i);
+
+ spin_lock_init(&bc->lock);
+ INIT_LIST_HEAD(&bc->list);
+ bc->dead = 0;
+ }
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
register_hotcpu_notifier(&blk_cpu_notifier);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 54d0db1..870e837 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -135,6 +135,70 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
return queue_var_show(max_hw_sectors_kb, (page));
}
+static ssize_t queue_complete_affinity_show(struct request_queue *q, char *page)
+{
+ ssize_t len;
+
+ len = cpumask_scnprintf(page, PAGE_SIZE, q->complete_cpu);
+ len += sprintf(page+len, "\nlocal\t%d\nremote\t%d\n", q->local_complete, q->remote_complete);
+ return len;
+}
+
+static ssize_t queue_complete_affinity_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ char *p = (char *) page;
+ long val;
+
+ val = simple_strtol(p, &p, 10);
+ spin_lock_irq(q->queue_lock);
+ blk_queue_set_completion_cpu(q, val);
+ q->local_complete = q->remote_complete = 0;
+ spin_unlock_irq(q->queue_lock);
+ return count;
+}
+
+static ssize_t queue_queue_affinity_show(struct request_queue *q, char *page)
+{
+ ssize_t len;
+
+ len = cpumask_scnprintf(page, PAGE_SIZE, q->queue_cpu);
+ len += sprintf(page+len, "\nlocal\t%d\nremote\t%d\n", q->local_queue, q->remote_queue);
+ return len;
+}
+
+static ssize_t queue_queue_affinity_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ char *p = (char *) page;
+ long val;
+
+ val = simple_strtol(p, &p, 10);
+ spin_lock_irq(q->queue_lock);
+ blk_queue_set_queue_cpu(q, val);
+ q->local_queue = q->remote_queue = 0;
+ spin_unlock_irq(q->queue_lock);
+ return count;
+}
+
+static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(q->force_same_complete, page);
+}
+
+static ssize_t
+queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = queue_var_store(&val, page, count);
+ spin_lock_irq(q->queue_lock);
+ q->force_same_complete = !!val;
+ spin_unlock_irq(q->queue_lock);
+
+ return ret;
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -170,6 +234,24 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.show = queue_hw_sector_size_show,
};
+static struct queue_sysfs_entry queue_complete_affinity_entry = {
+ .attr = {.name = "completion_affinity", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_complete_affinity_show,
+ .store = queue_complete_affinity_store,
+};
+
+static struct queue_sysfs_entry queue_queue_affinity_entry = {
+ .attr = {.name = "queue_affinity", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_queue_affinity_show,
+ .store = queue_queue_affinity_store,
+};
+
+static struct queue_sysfs_entry queue_rq_affinity_entry = {
+ .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_rq_affinity_show,
+ .store = queue_rq_affinity_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -177,6 +259,9 @@ static struct attribute *default_attrs[] = {
&queue_max_sectors_entry.attr,
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
+ &queue_complete_affinity_entry.attr,
+ &queue_queue_affinity_entry.attr,
+ &queue_rq_affinity_entry.attr,
NULL,
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e43772..7db3bdf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -142,6 +142,7 @@ enum rq_flag_bits {
struct request {
struct list_head queuelist;
struct list_head donelist;
+ int cpu;
struct request_queue *q;
@@ -387,6 +388,11 @@ struct request_queue
#if defined(CONFIG_BLK_DEV_BSG)
struct bsg_class_device bsg_dev;
#endif
+ cpumask_t queue_cpu;
+ cpumask_t complete_cpu;
+ int local_queue, remote_queue;
+ int local_complete, remote_complete;
+ int force_same_complete;
};
#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
@@ -702,6 +708,8 @@ extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
extern void blk_queue_dma_alignment(struct request_queue *, int);
+extern int blk_queue_set_queue_cpu(struct request_queue *, int);
+extern int blk_queue_set_completion_cpu(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
--
1.5.4.22.g7a20
On Thu, 7 Feb 2008 10:18:59 +0100 Jens Axboe <[email protected]> wrote:
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> include/linux/workqueue.h | 1 +
> kernel/workqueue.c | 15 +++++++++++++++
> 2 files changed, 16 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 7f28c32..4c46944 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -179,6 +179,7 @@ __create_workqueue_key(const char *name, int singlethread,
> extern void destroy_workqueue(struct workqueue_struct *wq);
>
> extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
> +extern int FASTCALL(queue_work_on_cpu(struct workqueue_struct *wq, struct work_struct *work, int cpu));
-ETOOWIDE
please don't add fastcall - we're trying to remove it.
> extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
> struct delayed_work *work, unsigned long delay));
> extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 52db48e..551ad7e 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -175,6 +175,21 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
> }
> EXPORT_SYMBOL_GPL(queue_work);
>
> +int fastcall queue_work_on_cpu(struct workqueue_struct *wq,
> + struct work_struct *work, int cpu)
> +{
> + int ret = 0;
> +
> + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> + BUG_ON(!list_empty(&work->entry));
> + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
> + ret = 1;
> + }
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(queue_work_on_cpu);
Might as well change queue_work() to call this?
Oleg cc'ed as queue_work() maintainer ;)
On Thu, Feb 07 2008, Andrew Morton wrote:
> On Thu, 7 Feb 2008 10:18:59 +0100 Jens Axboe <[email protected]> wrote:
>
> > Signed-off-by: Jens Axboe <[email protected]>
> > ---
> > include/linux/workqueue.h | 1 +
> > kernel/workqueue.c | 15 +++++++++++++++
> > 2 files changed, 16 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> > index 7f28c32..4c46944 100644
> > --- a/include/linux/workqueue.h
> > +++ b/include/linux/workqueue.h
> > @@ -179,6 +179,7 @@ __create_workqueue_key(const char *name, int singlethread,
> > extern void destroy_workqueue(struct workqueue_struct *wq);
> >
> > extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
> > +extern int FASTCALL(queue_work_on_cpu(struct workqueue_struct *wq, struct work_struct *work, int cpu));
>
> -ETOOWIDE
>
> please don't add fastcall - we're trying to remove it.
Oops yes, I'll kill the fastcall and fix the > 80 char line!
> > extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
> > struct delayed_work *work, unsigned long delay));
> > extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index 52db48e..551ad7e 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -175,6 +175,21 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
> > }
> > EXPORT_SYMBOL_GPL(queue_work);
> >
> > +int fastcall queue_work_on_cpu(struct workqueue_struct *wq,
> > + struct work_struct *work, int cpu)
> > +{
> > + int ret = 0;
> > +
> > + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> > + BUG_ON(!list_empty(&work->entry));
> > + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
> > + ret = 1;
> > + }
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(queue_work_on_cpu);
>
> Might as well change queue_work() to call this?
>
> Oleg cc'ed as queue_work() maintainer ;)
Good point, I'll do that locally at least.
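Something along these lines, presumably (untested sketch, with the fastcall
gone as per the other mail and glossing over the single threaded workqueue
case):

    int queue_work(struct workqueue_struct *wq, struct work_struct *work)
    {
            int ret;

            /*
             * queue on the local CPU; queue_work_on_cpu() does the
             * WORK_STRUCT_PENDING test-and-set itself
             */
            ret = queue_work_on_cpu(wq, work, get_cpu());
            put_cpu();
            return ret;
    }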
--
Jens Axboe
* Jens Axboe <[email protected]> wrote:
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> arch/x86/kernel/smp_32.c | 15 +++++++++++++++
> arch/x86/kernel/smpboot_32.c | 3 +++
> include/asm-x86/hw_irq_32.h | 1 +
> include/asm-x86/mach-default/entry_arch.h | 1 +
> include/asm-x86/mach-default/irq_vectors.h | 1 +
> 5 files changed, 21 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
> index dc0cde9..668b8a4 100644
> --- a/arch/x86/kernel/smp_32.c
> +++ b/arch/x86/kernel/smp_32.c
> @@ -672,6 +672,21 @@ void smp_call_function_interrupt(struct pt_regs *regs)
> }
> }
>
> +fastcall void smp_raise_block_softirq(struct pt_regs *regs)
small detail: there's no fastcall used in arch/x86 anymore.
> +{
> + unsigned long flags;
> +
> + ack_APIC_irq();
> + local_irq_save(flags);
> + raise_softirq_irqoff(BLOCK_SOFTIRQ);
> + local_irq_restore(flags);
> +}
if so, then this should be a general facility to trigger any softirq - not
just the block one.
> #define CALL_FUNCTION_VECTOR 0xfb
> +#define BLOCK_SOFTIRQ_VECTOR 0xfa
this wastes another irq vector and is very special-purpose. Why not make
the smp_call_function() one more scalable instead?
on the more conceptual level, shouldn't we just move to threads instead
of softirqs? That way you can become affine to any CPU and can do
cross-CPU wakeups anytime - which will be nice and fast via the
smp_reschedule_interrupt() facility.
Ingo
On Thu, Feb 07 2008, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > Signed-off-by: Jens Axboe <[email protected]>
> > ---
> > arch/x86/kernel/smp_32.c | 15 +++++++++++++++
> > arch/x86/kernel/smpboot_32.c | 3 +++
> > include/asm-x86/hw_irq_32.h | 1 +
> > include/asm-x86/mach-default/entry_arch.h | 1 +
> > include/asm-x86/mach-default/irq_vectors.h | 1 +
> > 5 files changed, 21 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
> > index dc0cde9..668b8a4 100644
> > --- a/arch/x86/kernel/smp_32.c
> > +++ b/arch/x86/kernel/smp_32.c
> > @@ -672,6 +672,21 @@ void smp_call_function_interrupt(struct pt_regs *regs)
> > }
> > }
> >
> > +fastcall void smp_raise_block_softirq(struct pt_regs *regs)
>
> small detail: there's no fastcall used in arch/x86 anymore.
Yeah, andrew already complained about that, fixed up.
> > +{
> > + unsigned long flags;
> > +
> > + ack_APIC_irq();
> > + local_irq_save(flags);
> > + raise_softirq_irqoff(BLOCK_SOFTIRQ);
> > + local_irq_restore(flags);
> > +}
>
> if so, then this should be a general facility to trigger any softirq - not
> just the block one.
Oh yeah, definitely agree, I wrote that in the intro as well. The
interface is horrible, not meant to go anywhere, just serve for testing.
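A general version would probably keep a per-CPU mask of softirqs to raise
and have a single IPI handler flush it - roughly like the below (untested
sketch, the vector name and the per-CPU pending mask are made up):

    static DEFINE_PER_CPU(unsigned long, softirq_raise_pending);

    void arch_raise_softirq_on_cpu(int cpu, unsigned int nr)
    {
            set_bit(nr, &per_cpu(softirq_raise_pending, cpu));
            send_IPI_mask(cpumask_of_cpu(cpu), RAISE_SOFTIRQ_VECTOR);
    }

    void smp_raise_softirq_interrupt(struct pt_regs *regs)
    {
            unsigned long pending, flags;
            unsigned int nr;

            ack_APIC_irq();
            local_irq_save(flags);
            pending = xchg(&__get_cpu_var(softirq_raise_pending), 0);
            for (nr = 0; pending; nr++, pending >>= 1)
                    if (pending & 1)
                            raise_softirq_irqoff(nr);
            local_irq_restore(flags);
    }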
> > #define CALL_FUNCTION_VECTOR 0xfb
> > +#define BLOCK_SOFTIRQ_VECTOR 0xfa
>
> this wastes another irq vector and is very special-purpose. Why not make
> the smp_call_function() one more scalable instead?
That's definitely a possibility, Nick had something like that. I just
didn't like having to allocate a cookie object to store the function and
data.
> on the more conceptual level, shouldn't we just move to threads instead
> of softirqs? That way you can become affine to any CPU and can do
> cross-CPU wakeups anytime - which will be nice and fast via the
> smp_reschedule_interrupt() facility.
That would indeed be nicer and not require any arch changes. I was
afraid it would be more costly than massaging the softirqs a bit though,
perhaps that is unfounded.
--
Jens Axboe
* Jens Axboe <[email protected]> wrote:
> > on the more conceptual level, shouldn't we just move to threads
> > instead of softirqs? That way you can become affine to any CPU and
> > can do cross-CPU wakeups anytime - which will be nice and fast via
> > the smp_reschedule_interrupt() facility.
>
> That would indeed be nicer and not require any arch changes. I was
> afraid it would be more costly than massaging the softirqs a bit
> though, perhaps that is unfounded.
pick up the threaded softirq patches from -rt, those move all softirqs
processing into kernel threads. I'd suggest to extend those via
wakeup-from-remote functionality - it fits the construct quite
naturally. You should also be able to directly observe any performance
impact of threaded softirq handlers. (and if you find any, let me know
so that we can make it faster :-)
Ingo
On Thu, Feb 07 2008, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > > on the more conceptual level, shouldn't we just move to threads
> > > instead of softirqs? That way you can become affine to any CPU and
> > > can do cross-CPU wakeups anytime - which will be nice and fast via
> > > the smp_reschedule_interrupt() facility.
> >
> > That would indeed be nicer and not require any arch changes. I was
> > afraid it would be more costly than massaging the softirqs a bit
> > though, perhaps that is unfounded.
>
> pick up the threaded softirq patches from -rt, those move all softirqs
> processing into kernel threads. I'd suggest to extend those via
> wakeup-from-remote functionality - it fits the construct quite
> naturally. You should also be able to directly observe any performance
> impact of threaded softirq handlers. (and if you find any, let me know
> so that we can make it faster :-)
I was just considering that, since I knew -rt moved the softirqs into
threads. I'll look into it, but may not post anything until after my
vacation.
--
Jens Axboe
* Jens Axboe <[email protected]> wrote:
> > pick up the threaded softirq patches from -rt, those move all
> > softirqs processing into kernel threads. I'd suggest to extend those
> > via wakeup-from-remote functionality - it fits the construct quite
> > naturally. You should also be able to directly observe any
> > performance impact of threaded softirq handlers. (and if you find
> > any, let me know so that we can make it faster :-)
>
> I was just considering that, since I knew -rt moved the softirqs into
> threads. I'll look into it, but may not post anything until after my
> vacation.
we should more seriously investigate kernel thread scalability for
another reason as well: besides -rt, any generic async IO facility we
pick up will likely heavily rely on them. Kernel thread scheduling is
quite a bit lighter than user task scheduling [no TLB flushes, etc.] -
and if it is still not good enough we could probably accelerate them
some more. (and everyone will benefit) irq-context softirqs on the other
hand are quite rigid and bring in many atomicity assumptions so they are
not as natural to program for.
Ingo
* Jens Axboe <[email protected]> wrote:
> [...] but may not post anything until after my vacation.
oh, you're going on a vacation. I am sitting on a few block layer patches
you might be interested in :-)
i am playing with Vegard Nossum's kmemcheck on x86 (with much help from
Pekka Enberg for the SLUB details) and it's getting quite promising.
Kmemcheck is in essence Valgrind for the native kernel - it is "Da Bomb"
in terms of kernel object lifetime and access validation debugging
helpers.
it promptly triggered a few uninitialized accesses in the block layer
(amongst others), resulting in the following 4 fixes (find them below):
block: restructure rq_init() to make it safer
block: fix access to uninitialized field
block: initialize rq->tag
block: rq->cmd* initialization
i _think_ the actual uninitialized memory accesses are only latent bugs
not actual runtime bugs because they relate to SCSI init sequences that
do not truly need the elevator's attention - but rq_init() sure looked a
bit dangerous in this regard and the elv_next_request() access to those
uninitialized fields is not nice.
Do these fixes look good to you? I had them in testing for a few days
already.
Ingo
--------------------->
Subject: block: restructure rq_init() to make it safer
From: Ingo Molnar <[email protected]>
reorder the initializations done in rq_init() to both align them
to memory order, and to document them better. They now match
the field ordering of "struct request" in blkdev.h.
No change in functionality:
text data bss dec hex filename
8426 0 20 8446 20fe blk-core.o.before
8426 0 20 8446 20fe blk-core.o.after
Signed-off-by: Ingo Molnar <[email protected]>
---
block/blk-core.c | 51 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 35 insertions(+), 16 deletions(-)
Index: linux/block/blk-core.c
===================================================================
--- linux.orig/block/blk-core.c
+++ linux/block/blk-core.c
@@ -106,24 +106,43 @@ void rq_init(struct request_queue *q, st
{
INIT_LIST_HEAD(&rq->queuelist);
INIT_LIST_HEAD(&rq->donelist);
-
- rq->errors = 0;
- rq->bio = rq->biotail = NULL;
+ rq->q = q;
+ /* rq->cmd_flags */
+ /* rq->cmd_type */
+ /* rq->sector */
+ /* rq->hard_sector */
+ /* rq->nr_sectors */
+ /* rq->hard_nr_sectors */
+ /* rq->current_nr_sectors */
+ /* rq->hard_cur_sectors */
+ rq->bio = NULL;
+ rq->biotail = NULL;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- rq->ioprio = 0;
- rq->buffer = NULL;
- rq->ref_count = 1;
- rq->q = q;
- rq->special = NULL;
- rq->data_len = 0;
- rq->data = NULL;
- rq->nr_phys_segments = 0;
- rq->sense = NULL;
- rq->end_io = NULL;
- rq->end_io_data = NULL;
- rq->completion_data = NULL;
- rq->next_rq = NULL;
+ rq->completion_data = NULL;
+ /* rq->elevator_private */
+ /* rq->elevator_private2 */
+ /* rq->rq_disk */
+ /* rq->start_time */
+ rq->nr_phys_segments = 0;
+ /* rq->nr_hw_segments */
+ rq->ioprio = 0;
+ rq->special = NULL;
+ rq->buffer = NULL;
+ /* rq->tag */
+ rq->errors = 0;
+ rq->ref_count = 1;
+ /* rq->cmd_len */
+ /* rq->cmd[] */
+ rq->data_len = 0;
+ /* rq->sense_len */
+ rq->data = NULL;
+ rq->sense = NULL;
+ /* rq->timeout */
+ /* rq->retries */
+ rq->end_io = NULL;
+ rq->end_io_data = NULL;
+ rq->next_rq = NULL;
}
static void req_bio_endio(struct request *rq, struct bio *bio,
Subject: block: fix access to uninitialized field
From: Ingo Molnar <[email protected]>
kmemcheck caught the following uninitialized memory access in the
block layer:
kmemcheck: Caught uninitialized read:
EIP = c020e596 (elv_next_request+0x63/0x154), address f74985dc, size 32
Pid: 1, comm: swapper Not tainted (2.6.24 #5)
EIP: 0060:[<c020e596>] EFLAGS: 00010046 CPU: 0
EIP is at elv_next_request+0x63/0x154
EAX: 00000000 EBX: f74b83b0 ECX: 00000002 EDX: 00000000
ESI: f74985c0 EDI: f7c5bbf0 EBP: f7c5bc00 ESP: f7c5bbf0
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 8005003b CR2: f74985dc CR3: 0060c000 CR4: 000006c0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff4ff0 DR7: 00000400
[<c02d27e8>] scsi_request_fn+0x74/0x28e
[<c020fd2e>] __generic_unplug_device+0x1d/0x20
[<c0211bb4>] blk_execute_rq_nowait+0x50/0x5c
[<c0211c26>] blk_execute_rq+0x66/0x83
[<c0211c43>] ? blk_end_sync_rq+0x0/0x29
[<c01151fd>] ? hide_addr+0x32/0x72
[<c0115275>] ? kmemcheck_hide+0x38/0x67
[<c0431650>] ? do_debug+0x3d/0x105
[<c04312cf>] ? debug_stack_correct+0x27/0x2c
[<c02d1e51>] scsi_execute+0xc0/0xed
[<c02d1ece>] scsi_execute_req+0x50/0x9d
[<c02d33dd>] scsi_probe_and_add_lun+0x1a3/0x7d1
[<c0431650>] ? do_debug+0x3d/0x105
[<c0270024>] ? hwrng_register+0xc3/0x147
[<c02d465a>] __scsi_add_device+0x8a/0xb7
[<c031e52d>] ata_scsi_scan_host+0x9d/0x2c3
[<c031b577>] ata_host_register+0x21b/0x239
[<c03201b2>] ata_pci_activate_sff_host+0x17c/0x1a6
[<c031c764>] ? ata_interrupt+0x0/0x214
[<c032061c>] ata_pci_init_one+0x9b/0xef
[<c032e3d7>] amd_init_one+0x171/0x179
[<c0226e82>] pci_device_probe+0x39/0x63
[<c027d966>] driver_probe_device+0xb8/0x14d
[<c027dafe>] __driver_attach+0x59/0x88
[<c027ce3b>] bus_for_each_dev+0x41/0x64
[<c027d7f3>] driver_attach+0x17/0x19
[<c027daa5>] ? __driver_attach+0x0/0x88
[<c027d5db>] bus_add_driver+0xa8/0x1ed
[<c027dcbb>] driver_register+0x55/0xc4
[<c0227036>] __pci_register_driver+0x2e/0x5c
[<c05e110d>] amd_init+0x17/0x19
[<c05c66c8>] kernel_init+0xba/0x1fa
[<c05c660e>] ? kernel_init+0x0/0x1fa
[<c010578f>] kernel_thread_helper+0x7/0x10
=======================
kmemcheck: Caught uninitialized read from EIP = c02d16aa (scsi_get_cmd_from_req+0x28/0x3c), address f7498630, size 32
which corresponds to:
0xc020e596 is in elv_next_request (block/elevator.c:746).
741 rq->cmd_flags |= REQ_STARTED;
742 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
743 }
744
745 if (!q->boundary_rq || q->boundary_rq == rq) {
746 q->end_sector = rq_end_sector(rq);
747 q->boundary_rq = NULL;
748 }
749
750 if (rq->cmd_flags & REQ_DONTPREP)
the problem is that during ATA probing, we pass in a half-initialized
request structure to the block layer, which processes it. While this is
probably harmless in itself (probe requests often have no real 'sector'
field), it could be more fatal in other cases.
so be defensive and initialize the sector fields. We use -1000LL,
because that's better than an accidental IO to sector 0 ...
Signed-off-by: Ingo Molnar <[email protected]>
---
block/blk-core.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
Index: linux/block/blk-core.c
===================================================================
--- linux.orig/block/blk-core.c
+++ linux/block/blk-core.c
@@ -102,6 +102,8 @@ struct backing_dev_info *blk_get_backing
}
EXPORT_SYMBOL(blk_get_backing_dev_info);
+#define ILLEGAL_SECTOR -1000LL
+
void rq_init(struct request_queue *q, struct request *rq)
{
INIT_LIST_HEAD(&rq->queuelist);
@@ -109,12 +111,12 @@ void rq_init(struct request_queue *q, st
rq->q = q;
/* rq->cmd_flags */
/* rq->cmd_type */
- /* rq->sector */
- /* rq->hard_sector */
- /* rq->nr_sectors */
- /* rq->hard_nr_sectors */
- /* rq->current_nr_sectors */
- /* rq->hard_cur_sectors */
+ rq->sector = ILLEGAL_SECTOR;
+ rq->hard_sector = ILLEGAL_SECTOR;
+ rq->nr_sectors = 0;
+ rq->hard_nr_sectors = 0;
+ rq->current_nr_sectors = 0;
+ rq->hard_cur_sectors = 0;
rq->bio = NULL;
rq->biotail = NULL;
INIT_HLIST_NODE(&rq->hash);
Subject: block: initialize rq->tag
From: Ingo Molnar <[email protected]>
kmemcheck (valgrind for the native Linux kernel) caught a 32-bit read
to an uninitialized piece of memory in the block/SCSI layer:
ata2.01: configured for UDMA/33
kmemcheck: Caught uninitialized 32-bit read:
=> from EIP = c02d3386 (scsi_get_cmd_from_req+0x28/0x3c), address f7dc0d38
Pid: 1, comm: swapper Not tainted (2.6.24 #6)
EIP: 0060:[<c02d3386>] EFLAGS: 00010086 CPU: 0
EIP is at scsi_get_cmd_from_req+0x28/0x3c
EAX: f749a800 EBX: f7dc0cc0 ECX: f74b801c EDX: f749a800
ESI: f74b8000 EDI: f7c5bbf0 EBP: f7c5bbbc ESP: f7c5bbb8
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 8005003b CR2: f7dc0d38 CR3: 00610000 CR4: 000006c0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff4ff0 DR7: 00000400
[<c02d36ca>] scsi_setup_blk_pc_cmnd+0x26/0x101
[<c02d37c6>] scsi_prep_fn+0x21/0x30
[<c020fc77>] elv_next_request+0xb3/0x15f
[<c02d44d6>] scsi_request_fn+0x74/0x28e
[<c0211541>] __generic_unplug_device+0x1d/0x20
[<c02133d4>] blk_execute_rq_nowait+0x50/0x5c
[<c0213449>] blk_execute_rq+0x69/0x86
[<c0213466>] ? blk_end_sync_rq+0x0/0x2a
[<c011520b>] ? hide_addr+0x32/0x72
[<c0115283>] ? kmemcheck_hide+0x38/0x67
[<c0433da0>] ? do_debug+0x3d/0x105
[<c0433a1f>] ? debug_stack_correct+0x27/0x2c
[<c02d3b33>] scsi_execute+0xc3/0xf3
[<c02d3bb3>] scsi_execute_req+0x50/0x9d
[<c02d50c9>] scsi_probe_and_add_lun+0x1a3/0x7d1
[<c0433da0>] ? do_debug+0x3d/0x105
[<c0270024>] ? rtc_do_ioctl+0x66d/0x7a2
[<c02d6346>] __scsi_add_device+0x8a/0xb7
[<c0320838>] ata_scsi_scan_host+0x9d/0x2c3
[<c031d837>] ata_host_register+0x21b/0x239
[<c03224be>] ata_pci_activate_sff_host+0x17c/0x1a6
[<c031ea24>] ? ata_interrupt+0x0/0x214
[<c0322928>] ata_pci_init_one+0x9b/0xef
[<c03306e3>] amd_init_one+0x171/0x179
[<c0228a42>] pci_device_probe+0x39/0x63
[<c027f526>] driver_probe_device+0xb8/0x14d
[<c027f6be>] __driver_attach+0x59/0x88
[<c027e9fb>] bus_for_each_dev+0x41/0x64
[<c027f3b3>] driver_attach+0x17/0x19
[<c027f665>] ? __driver_attach+0x0/0x88
[<c027f19b>] bus_add_driver+0xa8/0x1ed
[<c027f87b>] driver_register+0x55/0xc4
[<c0228bf6>] __pci_register_driver+0x2e/0x5c
[<c05e4df2>] amd_init+0x17/0x19
[<c05ca6cb>] kernel_init+0xba/0x1fa
[<c05ca611>] ? kernel_init+0x0/0x1fa
[<c010578f>] kernel_thread_helper+0x7/0x10
=======================
while it's probably harmless for probing functions, be defensive and
initialize rq->tag to -1 explicitly.
Signed-off-by: Ingo Molnar <[email protected]>
---
block/blk-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux/block/blk-core.c
===================================================================
--- linux.orig/block/blk-core.c
+++ linux/block/blk-core.c
@@ -131,7 +131,7 @@ void rq_init(struct request_queue *q, st
rq->ioprio = 0;
rq->special = NULL;
rq->buffer = NULL;
- /* rq->tag */
+ rq->tag = -1;
rq->errors = 0;
rq->ref_count = 1;
/* rq->cmd_len */
Subject: block: rq->cmd* initialization
From: Ingo Molnar <[email protected]>
kmemcheck found the following uninitialized 32-bit read:
kmemcheck: Caught uninitialized 32-bit read:
=> from EIP = c02d3747 (scsi_setup_blk_pc_cmnd+0xa3/0x101), address f7dc0d4c
Pid: 1, comm: swapper Not tainted (2.6.24 #7)
EIP: 0060:[<c02d3747>] EFLAGS: 00010082 CPU: 0
EIP is at scsi_setup_blk_pc_cmnd+0xa3/0x101
EAX: 00000000 EBX: f7dc0cc0 ECX: f7fd0300 EDX: f7fd0300
ESI: f7dc0d4c EDI: f7496838 EBP: f7c5bbd8 ESP: f7c5bbc4
DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
CR0: 8005003b CR2: f7dc0d4c CR3: 00610000 CR4: 000006c0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff4ff0 DR7: 00000400
[<c02d37c6>] scsi_prep_fn+0x21/0x30
[<c020fc77>] elv_next_request+0xb3/0x15f
[<c02d44d6>] scsi_request_fn+0x74/0x28e
[<c0211548>] __generic_unplug_device+0x1d/0x20
[<c02133d8>] blk_execute_rq_nowait+0x50/0x5c
[<c021344d>] blk_execute_rq+0x69/0x86
[<c021346a>] ? blk_end_sync_rq+0x0/0x2a
[<c011520b>] ? hide_addr+0x32/0x72
[<c0115283>] ? kmemcheck_hide+0x38/0x67
[<c0433da0>] ? do_debug+0x3d/0x105
[<c0433a1f>] ? debug_stack_correct+0x27/0x2c
[<c02d3b33>] scsi_execute+0xc3/0xf3
[<c02d3bb3>] scsi_execute_req+0x50/0x9d
[<c02d50c9>] scsi_probe_and_add_lun+0x1a3/0x7d1
[<c0433da0>] ? do_debug+0x3d/0x105
[<c0270024>] ? rtc_do_ioctl+0x66d/0x7a2
[<c02d6346>] __scsi_add_device+0x8a/0xb7
[<c0320838>] ata_scsi_scan_host+0x9d/0x2c3
[<c031d837>] ata_host_register+0x21b/0x239
[<c03224be>] ata_pci_activate_sff_host+0x17c/0x1a6
[<c031ea24>] ? ata_interrupt+0x0/0x214
[<c0322928>] ata_pci_init_one+0x9b/0xef
[<c03306e3>] amd_init_one+0x171/0x179
[<c0228a42>] pci_device_probe+0x39/0x63
[<c027f526>] driver_probe_device+0xb8/0x14d
[<c027f6be>] __driver_attach+0x59/0x88
[<c027e9fb>] bus_for_each_dev+0x41/0x64
[<c027f3b3>] driver_attach+0x17/0x19
[<c027f665>] ? __driver_attach+0x0/0x88
[<c027f19b>] bus_add_driver+0xa8/0x1ed
[<c027f87b>] driver_register+0x55/0xc4
[<c0228bf6>] __pci_register_driver+0x2e/0x5c
[<c05e5136>] amd_init+0x17/0x19
[<c05ca6c8>] kernel_init+0xba/0x1fa
[<c05ca60e>] ? kernel_init+0x0/0x1fa
[<c010578f>] kernel_thread_helper+0x7/0x10
=======================
while it's harmless here, initialize rq->cmd* properly nevertheless.
Signed-off-by: Ingo Molnar <[email protected]>
---
block/blk-core.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: linux/block/blk-core.c
===================================================================
--- linux.orig/block/blk-core.c
+++ linux/block/blk-core.c
@@ -134,8 +134,8 @@ void rq_init(struct request_queue *q, st
rq->tag = -1;
rq->errors = 0;
rq->ref_count = 1;
- /* rq->cmd_len */
- /* rq->cmd[] */
+ rq->cmd_len = 0;
+ memset(rq->cmd, 0, BLK_MAX_CDB);
rq->data_len = 0;
/* rq->sense_len */
rq->data = NULL;
On Thu, Feb 07 2008, Ingo Molnar wrote:
>
> * Jens Axboe <[email protected]> wrote:
>
> > > pick up the threaded softirq patches from -rt, those move all
> > > softirqs processing into kernel threads. I'd suggest to extend those
> > > via wakeup-from-remote functionality - it fits the construct quite
> > > naturally. You should also be able to directly observe any
> > > performance impact of threaded softirq handlers. (and if you find
> > > any, let me know so that we can make it faster :-)
> >
> > I was just considering that, since I knew -rt moved the softirqs into
> > threads. I'll look into it, but may not post anything until after my
> > vacation.
>
> we should more seriously investigate kernel thread scalability for
> another reason as well: besides -rt, any generic async IO facility we
> pick up will likely heavily rely on them. Kernel thread scheduling is
> quite a bit lighter than user task scheduling [no TLB flushes, etc.] -
> and if it is still not good enough we could probably accelerate them
> some more. (and everyone will benefit) irq-context softirqs on the other
> hand are quite rigid and bring in many atomicity assumptions so they are
> not as natural to program for.
Here's a patch on top of the other patchset that switches blk-softirq.c
to using kernel threads instead. It's pretty simple. Boot tested,
works for me :-)
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 5ae8345..73c71c7 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -7,6 +7,8 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
#include <linux/cpu.h>
#include "blk.h"
@@ -14,41 +16,136 @@
struct blk_comp {
spinlock_t lock;
struct list_head list;
- int dead;
+ struct task_struct *task;
};
-static DEFINE_PER_CPU(struct blk_comp, blk_cpu_done);
+static DEFINE_PER_CPU(struct blk_comp, blk_irq_data);
+
+static void raise_blk_irq(int cpu)
+{
+ struct blk_comp *bc = &per_cpu(blk_irq_data, cpu);
+
+ wake_up_process(bc->task);
+}
+
+static void blk_done_softirq(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_entry(list->next, struct request, donelist);
+ list_del_init(&rq->donelist);
+ rq->q->softirq_done_fn(rq);
+ }
+}
+
+static int blk_irq_thread(void *data)
+{
+ struct blk_comp *bc = data;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+ LIST_HEAD(local_list);
+
+ preempt_disable();
+ smp_mb();
+ if (list_empty(&bc->list)) {
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ smp_mb();
+ while (!list_empty(&bc->list)) {
+ spin_lock_irq(&bc->lock);
+ list_replace_init(&bc->list, &local_list);
+ spin_unlock_irq(&bc->lock);
+
+
+ blk_done_softirq(&local_list);
+ }
+ preempt_enable();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static int create_blk_irq_thread(int cpu)
+{
+ struct blk_comp *bc = &per_cpu(blk_irq_data, cpu);
+ struct task_struct *task;
+
+ spin_lock_init(&bc->lock);
+ INIT_LIST_HEAD(&bc->list);
+ bc->task = NULL;
+
+ task = kthread_create(blk_irq_thread, bc, "blk-irq-%d", cpu);
+ if (IS_ERR(task)) {
+ printk("block: irq thread creation failed\n");
+ return 1;
+ }
+
+ kthread_bind(task, cpu);
+ bc->task = task;
+ return 0;
+}
static int __cpuinit blk_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (unsigned long) hcpu;
+ struct task_struct *task;
struct blk_comp *bc;
switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ bc = &per_cpu(blk_irq_data, cpu);
+ if (bc->task) {
+ printk("blk: task for %d already there\n", cpu);
+ break;
+ }
+ if (create_blk_irq_thread(cpu))
+ return NOTIFY_BAD;
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ raise_blk_irq(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ bc = &per_cpu(blk_irq_data, cpu);
+ if (!bc->task)
+ break;
+ kthread_bind(bc->task, any_online_cpu(cpu_online_map));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
+ struct sched_param param;
struct blk_comp *dead;
local_irq_disable();
- bc = &__get_cpu_var(blk_cpu_done);
- dead = &per_cpu(blk_cpu_done, cpu);
+ bc = &__get_cpu_var(blk_irq_data);
+ dead = &per_cpu(blk_irq_data, cpu);
double_spin_lock(&bc->lock, &dead->lock, bc < dead);
list_splice_init(&dead->list, &bc->list);
- dead->dead = 1;
double_spin_unlock(&bc->lock, &dead->lock, bc < dead);
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
- break;
- }
- case CPU_ONLINE: {
- local_irq_disable();
- bc = &per_cpu(blk_cpu_done, cpu);
- bc->dead = 0;
+
+ param.sched_priority = MAX_RT_PRIO - 1;
+ task = dead->task;
+ sched_setscheduler(task, SCHED_FIFO, &param);
+ dead->task = NULL;
+ raise_blk_irq(smp_processor_id());
local_irq_enable();
+ kthread_stop(task);
break;
}
}
@@ -56,35 +153,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
.notifier_call = blk_cpu_notify,
};
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
- struct blk_comp *bc;
- struct list_head local_list;
-
- local_irq_disable();
- bc = &__get_cpu_var(blk_cpu_done);
- spin_lock(&bc->lock);
- list_replace_init(&bc->list, &local_list);
- spin_unlock(&bc->lock);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, donelist);
- list_del_init(&rq->donelist);
- rq->q->softirq_done_fn(rq);
- }
-}
-
/**
* blk_complete_request - end I/O on a request
* @req: the request being processed
@@ -117,11 +189,11 @@ void blk_complete_request(struct request *req)
if (ccpu == cpu) {
is_dead:
- bc = &__get_cpu_var(blk_cpu_done);
+ bc = &__get_cpu_var(blk_irq_data);
q->local_complete++;
} else {
- bc = &per_cpu(blk_cpu_done, ccpu);
- if (unlikely(bc->dead)) {
+ bc = &per_cpu(blk_irq_data, ccpu);
+ if (unlikely(!bc->task)) {
ccpu = cpu;
goto is_dead;
}
@@ -134,30 +206,27 @@ is_dead:
list_add_tail(&req->donelist, &bc->list);
spin_unlock(&bc->lock);
- if (was_empty) {
- if (cpu == ccpu)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- else
- raise_softirq_on_cpu(ccpu, BLOCK_SOFTIRQ);
- }
+ if (was_empty)
+ raise_blk_irq(ccpu);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_complete_request);
-static int __init blk_softirq_init(void)
+__init int blk_softirq_init(void)
{
int i;
for_each_possible_cpu(i) {
- struct blk_comp *bc = &per_cpu(blk_cpu_done, i);
+ void *cpu;
+ int err;
- spin_lock_init(&bc->lock);
- INIT_LIST_HEAD(&bc->list);
- bc->dead = 0;
+ cpu = (void *) (long) i;
+ err = blk_cpu_notify(&blk_cpu_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
+ blk_cpu_notify(&blk_cpu_notifier, CPU_ONLINE, cpu);
}
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
register_hotcpu_notifier(&blk_cpu_notifier);
return 0;
}
--
Jens Axboe
Jens Axboe wrote:
> Hi,
>
> Since I'll be on vacation next week, I thought I'd send this out in
> case people wanted to play with it. It works here, but I haven't done
> any performance numbers at all.
>
> Patches 1-7 are all preparation patches for #8, which contains the
> real changes. I'm not particularly happy with the arch implementation
> for raising a softirq on another CPU, but it should be fast enough
> so suffice for testing.
>
> Anyway, this patchset is mainly meant as a playground for testing IO
> affinity. It allows you to set three values per queue, see the files
> in the /sys/block/<dev>/queue directory:
>
> completion_affinity
> Only allow completions to happen on the defined CPU mask.
> queue_affinity
> Only allow queuing to happen on the defined CPU mask.
> rq_affinity
> Always complete a request on the same CPU that queued it.
>
> As you can tell, there's some overlap to allow for experimentation.
> rq_affinity will override completion_affinity, so it's possible to
> have completions on a CPU that isn't set in that mask. The interface
> is currently limited to all CPUs or a specific CPU, but the implementation
> is supports (and works with) cpu masks. The logic is in
> blk_queue_set_cpumask(), it should be easy enough to change this to
> echo a full mask, or allow OR'ing of CPU masks when a new CPU is passed in.
> For now, echo a CPU number to set that CPU, or use -1 to set all CPUs.
> The default is all CPUs for no change in behaviour.
>
> Patch set is against current git as of this morning. The code is also in
> the block git repo, branch is io-cpu-affinity.
>
> git://git.kernel.dk/linux-2.6-block.git io-cpu-affinity
>
FYI: on a kernel with this patch set, running on a 4-way ia64 (non-NUMA) w/ a FC disk, I crafted a test with 135 combinations:
o Having the issuing application pegged on each CPU - or - left alone (run on any CPU), yields 5 possibilities
o Having the queue affinity on each CPU, or any (-1), yields 5 possibilities
o Having the completion affinity on each CPU, or any (-1), yields 5 possibilities
and
o Having the issuing application pegged on each CPU - or - left alone (run on any CPU), yields 5 possibilities
o Having rq_affinity set to 0 or 1, yields 2 possibilities.
Each test was for 10 minutes, and ran overnight just fine. The difference amongst the 135 resulting values (based upon latency per-IO seen at the application layer) was <<1% (0.32% to be exact). This would seem to indicate that there isn't a penalty for running with this code, and it seems relatively stable given this.
The application used was doing 64KiB asynchronous direct reads, and had a minimum average per-IO latency of 42.426310 milliseconds, an average of 42.486557 milliseconds (std dev of 0.0041561), and a max of 42.561360 milliseconds.
I'm going to do some runs on a 16-way NUMA box, w/ a lot of disks today, to see if we see gains in that environment.
Alan D. Brunelle
HP OSLO S&P
On Thu, 7 Feb 2008, Ingo Molnar wrote:
> INIT_HLIST_NODE(&rq->hash);
> RB_CLEAR_NODE(&rq->rb_node);
> - rq->ioprio = 0;
> - rq->buffer = NULL;
> - rq->ref_count = 1;
> - rq->q = q;
> - rq->special = NULL;
> - rq->data_len = 0;
> - rq->data = NULL;
> - rq->nr_phys_segments = 0;
> - rq->sense = NULL;
> - rq->end_io = NULL;
> - rq->end_io_data = NULL;
> - rq->completion_data = NULL;
> - rq->next_rq = NULL;
> + rq->completion_data = NULL;
> + /* rq->elevator_private */
> + /* rq->elevator_private2 */
> + /* rq->rq_disk */
> + /* rq->start_time */
> + rq->nr_phys_segments = 0;
> + /* rq->nr_hw_segments */
> + rq->ioprio = 0;
> + rq->special = NULL;
> + rq->buffer = NULL;
...
Can we please just stop doing these one-by-one assignments, and just do
something like
memset(rq, 0, sizeof(*rq));
rq->q = q;
rq->ref_count = 1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
instead?
The memset() is likely faster and smaller than one-by-one assignments
anyway, even if the one-by-ones can avoid initializing some field or there
ends up being a double initialization..
Linus
On Thu, 2008-02-07 at 10:49 +0100, Jens Axboe wrote:
> On Thu, Feb 07 2008, Andrew Morton wrote:
> > On Thu, 7 Feb 2008 10:18:59 +0100 Jens Axboe <[email protected]> wrote:
> >
> > > Signed-off-by: Jens Axboe <[email protected]>
> > > ---
> > > include/linux/workqueue.h | 1 +
> > > kernel/workqueue.c | 15 +++++++++++++++
> > > 2 files changed, 16 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> > > index 7f28c32..4c46944 100644
> > > --- a/include/linux/workqueue.h
> > > +++ b/include/linux/workqueue.h
> > > @@ -179,6 +179,7 @@ __create_workqueue_key(const char *name, int singlethread,
> > > extern void destroy_workqueue(struct workqueue_struct *wq);
> > >
> > > extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
> > > +extern int FASTCALL(queue_work_on_cpu(struct workqueue_struct *wq, struct work_struct *work, int cpu));
> >
> > -ETOOWIDE
> >
> > please don't add fastcall - we're trying to remove it.
>
> Oops yes, I'll kill the fastcall and fix the > 80 char line!
>
I'll be making another round-up patch at -rc1 for FASTCALL/fastcall
FYI
Harvey
On Thu, Feb 07 2008, Linus Torvalds wrote:
>
>
> On Thu, 7 Feb 2008, Ingo Molnar wrote:
> > INIT_HLIST_NODE(&rq->hash);
> > RB_CLEAR_NODE(&rq->rb_node);
> > - rq->ioprio = 0;
> > - rq->buffer = NULL;
> > - rq->ref_count = 1;
> > - rq->q = q;
> > - rq->special = NULL;
> > - rq->data_len = 0;
> > - rq->data = NULL;
> > - rq->nr_phys_segments = 0;
> > - rq->sense = NULL;
> > - rq->end_io = NULL;
> > - rq->end_io_data = NULL;
> > - rq->completion_data = NULL;
> > - rq->next_rq = NULL;
> > + rq->completion_data = NULL;
> > + /* rq->elevator_private */
> > + /* rq->elevator_private2 */
> > + /* rq->rq_disk */
> > + /* rq->start_time */
> > + rq->nr_phys_segments = 0;
> > + /* rq->nr_hw_segments */
> > + rq->ioprio = 0;
> > + rq->special = NULL;
> > + rq->buffer = NULL;
> ...
>
> Can we please just stop doing these one-by-one assignments, and just do
> something like
>
> memset(rq, 0, sizeof(*rq));
> rq->q = q;
> rq->ref_count = 1;
> INIT_HLIST_NODE(&rq->hash);
> RB_CLEAR_NODE(&rq->rb_node);
>
> instead?
>
> The memset() is likely faster and smaller than one-by-one assignments
> anyway, even if the one-by-ones can avoid initializing some field or there
> ends up being a double initialization..
I completely agree, the fidgeting with single members quickly loses
appeal. Ingo, I'll merge and integrate your fixes before leaving; I'll
be here all day tomorrow as well.
--
Jens Axboe
Hi,
Here's a variant using kernel threads only, the nasty arch bits are then
not needed. Works for me, no performance testing (that's a hint for Alan
to try and queue up some testing for this variant as well :-)
--
Jens Axboe
* Linus Torvalds <[email protected]> wrote:
> On Thu, 7 Feb 2008, Ingo Molnar wrote:
> > INIT_HLIST_NODE(&rq->hash);
> > RB_CLEAR_NODE(&rq->rb_node);
> > - rq->ioprio = 0;
> > - rq->buffer = NULL;
> > - rq->ref_count = 1;
> > - rq->q = q;
> > - rq->special = NULL;
> > - rq->data_len = 0;
> > - rq->data = NULL;
> > - rq->nr_phys_segments = 0;
> > - rq->sense = NULL;
> > - rq->end_io = NULL;
> > - rq->end_io_data = NULL;
> > - rq->completion_data = NULL;
> > - rq->next_rq = NULL;
> > + rq->completion_data = NULL;
> > + /* rq->elevator_private */
> > + /* rq->elevator_private2 */
> > + /* rq->rq_disk */
> > + /* rq->start_time */
> > + rq->nr_phys_segments = 0;
> > + /* rq->nr_hw_segments */
> > + rq->ioprio = 0;
> > + rq->special = NULL;
> > + rq->buffer = NULL;
> ...
>
> Can we please just stop doing these one-by-one assignments, and just do
> something like
>
> memset(rq, 0, sizeof(*rq));
> rq->q = q;
> rq->ref_count = 1;
> INIT_HLIST_NODE(&rq->hash);
> RB_CLEAR_NODE(&rq->rb_node);
>
> instead?
>
> The memset() is likely faster and smaller than one-by-one assignments
> anyway, even if the one-by-ones can avoid initializing some field or
> there ends up being a double initialization..
i definitely agree and do that for all code i write.
But if someone does item by item initialization for some crazy
performance reason (networking folks tend to have such constructs), it
should be done i think how i've done it in the patch: by systematically
listing _every_ field in the structure, in the same order, and
indicating it clearly when it is not initialized and why.
and there it already shows that we do not initialize a few other members
that could cause problems later on:
+ rq->data_len = 0;
+ /* rq->sense_len */
+ rq->data = NULL;
+ rq->sense = NULL;
why is sense_len not initialized - while data_len is? In any case, these
days the memclear instructions are dirt cheap and we should just always
initialize everything to zero by default, especially if it's almost all
zero-initialized anyway.
Ingo
On Thu, Feb 07 2008, Ingo Molnar wrote:
>
> * Linus Torvalds <[email protected]> wrote:
>
> > On Thu, 7 Feb 2008, Ingo Molnar wrote:
> > > INIT_HLIST_NODE(&rq->hash);
> > > RB_CLEAR_NODE(&rq->rb_node);
> > > - rq->ioprio = 0;
> > > - rq->buffer = NULL;
> > > - rq->ref_count = 1;
> > > - rq->q = q;
> > > - rq->special = NULL;
> > > - rq->data_len = 0;
> > > - rq->data = NULL;
> > > - rq->nr_phys_segments = 0;
> > > - rq->sense = NULL;
> > > - rq->end_io = NULL;
> > > - rq->end_io_data = NULL;
> > > - rq->completion_data = NULL;
> > > - rq->next_rq = NULL;
> > > + rq->completion_data = NULL;
> > > + /* rq->elevator_private */
> > > + /* rq->elevator_private2 */
> > > + /* rq->rq_disk */
> > > + /* rq->start_time */
> > > + rq->nr_phys_segments = 0;
> > > + /* rq->nr_hw_segments */
> > > + rq->ioprio = 0;
> > > + rq->special = NULL;
> > > + rq->buffer = NULL;
> > ...
> >
> > Can we please just stop doing these one-by-one assignments, and just do
> > something like
> >
> > memset(rq, 0, sizeof(*rq));
> > rq->q = q;
> > rq->ref_count = 1;
> > INIT_HLIST_NODE(&rq->hash);
> > RB_CLEAR_NODE(&rq->rb_node);
> >
> > instead?
> >
> > The memset() is likely faster and smaller than one-by-one assignments
> > anyway, even if the one-by-ones can avoid initializing some field or
> > there ends up being a double initialization..
>
> i definitely agree and do that for all code i write.
>
> But if someone does item by item initialization for some crazy
> performance reason (networking folks tend to have such constructs), it
> should be done i think how i've done it in the patch: by systematically
> listing _every_ field in the structure, in the same order, and
> indicating it clearly when it is not initialized and why.
That assumes that people find the references in two places when adding
members to a structure, not very likely (people are lazy!).
> and there it already shows that we do not initialize a few other members
> that could cause problems later on:
>
> + rq->data_len = 0;
> + /* rq->sense_len */
> + rq->data = NULL;
> + rq->sense = NULL;
>
> why is sense_len not initialized - while data_len is? In any case, these
because sense isn't set, when someone sets ->sense they should set
sense_len as well.
> days the memclear instructions are dirt cheap and we should just always
> initialize everything to zero by default, especially if it's almost all
> zero-initialized anyway.
Completely agree, some of these are just dormant bugs waiting to happen.
Clearing everything is the sanest approach.
--
Jens Axboe
Jens Axboe wrote:
> Hi,
>
> Here's a variant using kernel threads only, the nasty arch bits are then
> not needed. Works for me, no performance testing (that's a hint for Alan
> to try and queue up some testing for this variant as well :-)
>
>
I'll get to that, working my way through the first batch of testing on a NUMA platform. [[If anybody has ideas on specific testing to do, that would be helpful.]] I do plan on running some AIM7 tests, as those have shown improvement in other types of affinity changes in the kernel, and some of them have "interesting" IO load characteristics.
Alan
From: Linus Torvalds <[email protected]>
Date: Thu, 7 Feb 2008 09:42:56 -0800 (PST)
> Can we please just stop doing these one-by-one assignments, and just do
> something like
>
> memset(rq, 0, sizeof(*rq));
> rq->q = q;
> rq->ref_count = 1;
> INIT_HLIST_NODE(&rq->hash);
> RB_CLEAR_NODE(&rq->rb_node);
>
> instead?
>
> The memset() is likely faster and smaller than one-by-one assignments
> anyway, even if the one-by-ones can avoid initializing some field or there
> ends up being a double initialization..
The problem is store buffer compression. At least a few years
ago this made a huge difference in sk_buff initialization in the
networking.
Maybe cpus these days have so much store bandwidth that doing
things like the above is OK, but I doubt it :-)
On Thu, 7 Feb 2008, David Miller wrote:
>
> Maybe cpus these days have so much store bandwith that doing
> things like the above is OK, but I doubt it :-)
I seriously doubt the same is true for the IO requests (which are
different anyway, and tend to happen at a much lower frequency than
high-speed networking).
I also suspect that your timings were very hardware-dependent in the first
place (and yes, the long-term effect is likely working against that
"optimization"), and depend a lot on just how sparse the initialization
can be.
Linus
On Thu, Feb 07, 2008 at 07:25:45PM +0100, Jens Axboe wrote:
> Hi,
>
> Here's a variant using kernel threads only, the nasty arch bits are then
> not needed. Works for me, no performance testing (that's a hint for Alan
> to try and queue up some testing for this variant as well :-)
Well this stuff looks pretty nice (although I'm not sure whether the
softirq->thread changes are a good idea for performance, I guess we'll
see).
You still don't have the option that the Intel patch gave, that is,
to submit on the completer. I guess that you could do it somewhat
generically by having a cpuid in the request queue, and update that
with the completing cpu.
At least they reported it to be the most efficient scheme in their
testing, and Dave thought that migrating completions out to submitters
might be a bottleneck in some cases.
On Fri, Feb 08 2008, Nick Piggin wrote:
> On Thu, Feb 07, 2008 at 07:25:45PM +0100, Jens Axboe wrote:
> > Hi,
> >
> > Here's a variant using kernel threads only, the nasty arch bits are then
> > not needed. Works for me, no performance testing (that's a hint for Alan
> > to try and queue up some testing for this variant as well :-)
>
> Well this stuff looks pretty nice (although I'm not sure whether the
> softirq->thread changes are a good idea for performance, I guess we'll
> see).
Yeah, that is indeed an open question and why I have two separate
patches for now (io-cpu-affinity branch and io-cpu-affinity-kthread
branch). As Ingo mentioned, this is how softirqs are handled in the -rt
branch already.
> You still don't have the option that the Intel patch gave, that is,
> to submit on the completer. I guess that you could do it somewhat
> generically by having a cpuid in the request queue, and update that
> with the completing cpu.
Not sure what you mean, if setting queue_affinity doesn't accomplish it.
If you know the completing CPU to begin with, surely you can just set
the queuing affinity appropriately?
> At least they reported it to be the most efficient scheme in their
> testing, and Dave thought that migrating completions out to submitters
> might be a bottleneck in some cases.
More so than migrating submitters to completers? The advantage of only
moving submitters is that you get rid of the completion locking. Apart
from that, the cost should be the same, especially for the thread based
solution.
--
Jens Axboe
On Fri, Feb 08, 2008 at 08:47:47AM +0100, Jens Axboe wrote:
> On Fri, Feb 08 2008, Nick Piggin wrote:
> > On Thu, Feb 07, 2008 at 07:25:45PM +0100, Jens Axboe wrote:
> > > Hi,
> > >
> > > Here's a variant using kernel threads only, the nasty arch bits are then
> > > not needed. Works for me, no performance testing (that's a hint for Alan
> > > to try and queue up some testing for this variant as well :-)
> >
> > Well this stuff looks pretty nice (although I'm not sure whether the
> > softirq->thread changes are a good idea for performance, I guess we'll
> > see).
>
> Yeah, that is indeed an open question and why I have two seperate
> patches for now (io-cpu-affinity branch and io-cpu-affinity-kthread
> branch). As Ingo mentioned, this is how softirqs are handled in the -rt
> branch already.
True, although there are some IO workloads where -rt falls behind
mainline. May not be purely due to irq threads though, of course.
> > You still don't have the option that the Intel patch gave, that is,
> > to submit on the completer. I guess that you could do it somewhat
> > generically by having a cpuid in the request queue, and update that
> > with the completing cpu.
>
> Not sure what you mean, if setting queue_affinity doesn't accomplish it.
> If you know the completing CPU to begin with, surely you can just set
> the queuing affinity appropriately?
And if you don't?
> > At least they reported it to be the most efficient scheme in their
> > testing, and Dave thought that migrating completions out to submitters
> > might be a bottleneck in some cases.
>
> More so than migrating submitters to completers? The advantage of only
> movign submitters is that you get rid of the completion locking. Apart
> from that, the cost should be the same, especially for the thread based
> solution.
Not specifically for the block layer, but higher layers like xfs.
On Fri, Feb 08 2008, Nick Piggin wrote:
> On Fri, Feb 08, 2008 at 08:47:47AM +0100, Jens Axboe wrote:
> > On Fri, Feb 08 2008, Nick Piggin wrote:
> > > On Thu, Feb 07, 2008 at 07:25:45PM +0100, Jens Axboe wrote:
> > > > Hi,
> > > >
> > > > Here's a variant using kernel threads only, the nasty arch bits are then
> > > > not needed. Works for me, no performance testing (that's a hint for Alan
> > > > to try and queue up some testing for this variant as well :-)
> > >
> > > Well this stuff looks pretty nice (although I'm not sure whether the
> > > softirq->thread changes are a good idea for performance, I guess we'll
> > > see).
> >
> > Yeah, that is indeed an open question and why I have two seperate
> > patches for now (io-cpu-affinity branch and io-cpu-affinity-kthread
> > branch). As Ingo mentioned, this is how softirqs are handled in the -rt
> > branch already.
>
> True, although there are some IO workloads where -rt falls behind
> mainline. May not be purely due to irq threads though, of course.
It's certainly an area that needs to be investigated.
> > > You still don't have the option that the Intel patch gave, that is,
> > > to submit on the completer. I guess that you could do it somewhat
> > > generically by having a cpuid in the request queue, and update that
> > > with the completing cpu.
> >
> > Not sure what you mean, if setting queue_affinity doesn't accomplish it.
> > If you know the completing CPU to begin with, surely you can just set
> > the queuing affinity appropriately?
>
> And if you don't?
Well if you don't ask for anything, you won't get anything :-)
As I mentioned, the patch is a playing ground for trying various setups.
Everything defaults to 'do as usual', set options to setup certain test
scenarios.
> > > At least they reported it to be the most efficient scheme in their
> > > testing, and Dave thought that migrating completions out to submitters
> > > might be a bottleneck in some cases.
> >
> > More so than migrating submitters to completers? The advantage of only
> > movign submitters is that you get rid of the completion locking. Apart
> > from that, the cost should be the same, especially for the thread based
> > solution.
>
> Not specifically for the block layer, but higher layers like xfs.
True, but that's parallel to the initial statement - that migrating
completers is more costly than migrating submitters. So I'd like Dave to
expand on why he thinks that migrating completers is more costly than
submitters, APART from the locking associated with adding the request to
a remote CPU list.
--
Jens Axboe
On Fri, Feb 08, 2008 at 08:59:55AM +0100, Jens Axboe wrote:
> On Fri, Feb 08 2008, Nick Piggin wrote:
> > And if you don't?
>
> Well if you don't ask for anything, you wont get anything :-)
> As I mentioned, the patch is a playing ground for trying various setups.
> Everything defaults to 'do as usual', set options to setup certain test
> scenarios.
I mean if you don't know the completing CPU.
On Fri, Feb 08 2008, Nick Piggin wrote:
> On Fri, Feb 08, 2008 at 08:59:55AM +0100, Jens Axboe wrote:
> > On Fri, Feb 08 2008, Nick Piggin wrote:
> > > And if you don't?
> >
> > Well if you don't ask for anything, you wont get anything :-)
> > As I mentioned, the patch is a playing ground for trying various setups.
> > Everything defaults to 'do as usual', set options to setup certain test
> > scenarios.
>
> I mean if you don't know the completing CPU.
I still don't know quite what part of that patch you are referring to
here. If you don't have queue_affinity set, queueing a new request with
the hardware is generally done on the same CPU that just completed a
request. That is true even without any patches.
--
Jens Axboe
On Fri, Feb 08, 2008 at 09:24:22AM +0100, Jens Axboe wrote:
> On Fri, Feb 08 2008, Nick Piggin wrote:
> > On Fri, Feb 08, 2008 at 08:59:55AM +0100, Jens Axboe wrote:
> > > On Fri, Feb 08 2008, Nick Piggin wrote:
> > > > And if you don't?
> > >
> > > Well if you don't ask for anything, you wont get anything :-)
> > > As I mentioned, the patch is a playing ground for trying various setups.
> > > Everything defaults to 'do as usual', set options to setup certain test
> > > scenarios.
> >
> > I mean if you don't know the completing CPU.
>
> I still don't know quite what part of that patch you are referring to
> here. If you don't have queue_affinity set, queueing a new request with
> the hardware is generally done on the same CPU that just completed a
> request. That is true even without any patches.
Generally, but I guess not always. The database workloads in question
(which you might know very well about ;)) apparently has a lot of
queue empty and unplug conditions. Which I guess is the reason for
Intel's initial patch.
On Thu, Feb 07 2008, Linus Torvalds wrote:
>
>
> On Thu, 7 Feb 2008, Ingo Molnar wrote:
> > INIT_HLIST_NODE(&rq->hash);
> > RB_CLEAR_NODE(&rq->rb_node);
> > - rq->ioprio = 0;
> > - rq->buffer = NULL;
> > - rq->ref_count = 1;
> > - rq->q = q;
> > - rq->special = NULL;
> > - rq->data_len = 0;
> > - rq->data = NULL;
> > - rq->nr_phys_segments = 0;
> > - rq->sense = NULL;
> > - rq->end_io = NULL;
> > - rq->end_io_data = NULL;
> > - rq->completion_data = NULL;
> > - rq->next_rq = NULL;
> > + rq->completion_data = NULL;
> > + /* rq->elevator_private */
> > + /* rq->elevator_private2 */
> > + /* rq->rq_disk */
> > + /* rq->start_time */
> > + rq->nr_phys_segments = 0;
> > + /* rq->nr_hw_segments */
> > + rq->ioprio = 0;
> > + rq->special = NULL;
> > + rq->buffer = NULL;
> ...
>
> Can we please just stop doing these one-by-one assignments, and just do
> something like
>
> memset(rq, 0, sizeof(*rq));
> rq->q = q;
> rq->ref_count = 1;
> INIT_HLIST_NODE(&rq->hash);
> RB_CLEAR_NODE(&rq->rb_node);
>
> instead?
>
> The memset() is likely faster and smaller than one-by-one assignments
> anyway, even if the one-by-ones can avoid initializing some field or there
> ends up being a double initialization..
Looked into this approach and we can't currently do that here, since
some members of the request are being set from blk_alloc_request() and
then from the io scheduler attaching private data to it. So we have to
preserve ->cmd_flags and ->elevator_private and ->elevator_private2 at
least. Now rq_init() is also used for stored requests, so we cannot just
rely on clearing at allocation time.
So I'd prefer being a little conservative here. The below reorders
rq_init() a bit and clears some more members to be on the safer side,
adding comments to why we cannot memset and an associated comment in
blkdev.h.
diff --git a/block/blk-core.c b/block/blk-core.c
index 4afb39c..fba4ca7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -102,27 +102,38 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
}
EXPORT_SYMBOL(blk_get_backing_dev_info);
+/*
+ * We can't just memset() the structure, since the allocation path
+ * already stored some information in the request.
+ */
void rq_init(struct request_queue *q, struct request *rq)
{
INIT_LIST_HEAD(&rq->queuelist);
INIT_LIST_HEAD(&rq->donelist);
-
- rq->errors = 0;
+ rq->q = q;
+ rq->sector = rq->hard_sector = (sector_t) -1;
+ rq->nr_sectors = rq->hard_nr_sectors = 0;
+ rq->current_nr_sectors = rq->hard_cur_sectors = 0;
rq->bio = rq->biotail = NULL;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
+ rq->rq_disk = NULL;
+ rq->nr_phys_segments = 0;
+ rq->nr_hw_segments = 0;
rq->ioprio = 0;
+ rq->special = NULL;
rq->buffer = NULL;
+ rq->tag = -1;
+ rq->errors = 0;
rq->ref_count = 1;
- rq->q = q;
- rq->special = NULL;
+ rq->cmd_len = 0;
+ memset(rq->cmd, 0, sizeof(rq->cmd));
rq->data_len = 0;
+ rq->sense_len = 0;
rq->data = NULL;
- rq->nr_phys_segments = 0;
rq->sense = NULL;
rq->end_io = NULL;
rq->end_io_data = NULL;
- rq->completion_data = NULL;
rq->next_rq = NULL;
}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90392a9..e1888cc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -137,7 +137,9 @@ enum rq_flag_bits {
#define BLK_MAX_CDB 16
/*
- * try to put the fields that are referenced together in the same cacheline
+ * try to put the fields that are referenced together in the same cacheline.
+ * if you modify this structure, be sure to check block/blk-core.c:rq_init()
+ * as well!
*/
struct request {
struct list_head queuelist;
--
Jens Axboe
David Miller wrote:
> From: Linus Torvalds <[email protected]>
> Date: Thu, 7 Feb 2008 09:42:56 -0800 (PST)
>
>> Can we please just stop doing these one-by-one assignments, and just do
>> something like
>>
>> memset(rq, 0, sizeof(*rq));
>> rq->q = q;
>> rq->ref_count = 1;
>> INIT_HLIST_NODE(&rq->hash);
>> RB_CLEAR_NODE(&rq->rb_node);
>>
>> instead?
>>
>> The memset() is likely faster and smaller than one-by-one assignments
>> anyway, even if the one-by-ones can avoid initializing some field or there
>> ends up being a double initialization..
>
> The problem is store buffer compression. At least a few years
> ago this made a huge difference in sk_buff initialization in the
> networking.
>
> Maybe cpus these days have so much store bandwith that doing
> things like the above is OK, but I doubt it :-)
on modern x86 cpus the memset may even be faster if the memory isn't in cache;
the "explicit" method ends up doing Write Allocate on the cache lines
(so read them from memory) even though they then end up being written entirely.
With memset the CPU is told that the entire range is set to a new value, and
the WA can be avoided for the whole-cachelines in the range.
On Fri, Feb 08, 2008 at 07:09:07AM -0800, Arjan van de Ven wrote:
> David Miller wrote:
> >From: Linus Torvalds <[email protected]>
> >Date: Thu, 7 Feb 2008 09:42:56 -0800 (PST)
> >
> >>Can we please just stop doing these one-by-one assignments, and just do
> >>something like
> >>
> >> memset(rq, 0, sizeof(*rq));
> >> rq->q = q;
> >> rq->ref_count = 1;
> >> INIT_HLIST_NODE(&rq->hash);
> >> RB_CLEAR_NODE(&rq->rb_node);
> >>
> >>instead?
> >>
> >>The memset() is likely faster and smaller than one-by-one assignments
> >>anyway, even if the one-by-ones can avoid initializing some field or
> >>there ends up being a double initialization..
> >
> >The problem is store buffer compression. At least a few years
> >ago this made a huge difference in sk_buff initialization in the
> >networking.
> >
> >Maybe cpus these days have so much store bandwith that doing
> >things like the above is OK, but I doubt it :-)
>
> on modern x86 cpus the memset may even be faster if the memory isn't in
> cache;
> the "explicit" method ends up doing Write Allocate on the cache lines
> (so read them from memory) even though they then end up being written
> entirely.
> With memset the CPU is told that the entire range is set to a new value, and
> the WA can be avoided for the whole-cachelines in the range.
Don't you have write combining store buffers? Or is it still speculatively
issuing the reads even before the whole cacheline is combined?
Nick Piggin wrote:
>>> Maybe cpus these days have so much store bandwith that doing
>>> things like the above is OK, but I doubt it :-)
>> on modern x86 cpus the memset may even be faster if the memory isn't in
>> cache;
>> the "explicit" method ends up doing Write Allocate on the cache lines
>> (so read them from memory) even though they then end up being written
>> entirely.
>> With memset the CPU is told that the entire range is set to a new value, and
>> the WA can be avoided for the whole-cachelines in the range.
>
> Don't you have write combining store buffers? Or is it still speculatively
> issuing the reads even before the whole cacheline is combined?
x86 memory order model doesn't allow that quite; and you need a "series" of at least 64 bytes
without any other memory accesses in between even if it would....
not happening in practice.
On Fri, Feb 08, 2008 at 02:56:09PM -0800, Arjan van de Ven wrote:
> Nick Piggin wrote:
> >>>Maybe cpus these days have so much store bandwith that doing
> >>>things like the above is OK, but I doubt it :-)
> >>on modern x86 cpus the memset may even be faster if the memory isn't in
> >>cache;
> >>the "explicit" method ends up doing Write Allocate on the cache lines
> >>(so read them from memory) even though they then end up being written
> >>entirely.
> >>With memset the CPU is told that the entire range is set to a new value,
> >>and
> >>the WA can be avoided for the whole-cachelines in the range.
> >
> >Don't you have write combining store buffers? Or is it still speculatively
> >issuing the reads even before the whole cacheline is combined?
>
> x86 memory order model doesn't allow that quite; and you need a "series" of
> at least 64 bytes
> without any other memory accesses in between even if it would....
> not happening in practice.
OK, fair enough... then it will be a very nice test to see if it
helps. I'm sure you could have an arch specific initialisation
function if it makes a significant difference.
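(A purely illustrative sketch of the arch-specific hook Nick mentions above; the
__HAVE_ARCH_CLEAR_RQ and arch_clear_rq() names are invented here, the generic
fallback is just Linus' memset(), and - as Jens noted earlier in the thread -
any fields set up at allocation time would still have to be re-established
after the clear.)

/* generic fallback; an arch that has a cheaper way to clear a request-sized
 * region (non-temporal stores, cacheline clears, ...) could provide its own */
#ifndef __HAVE_ARCH_CLEAR_RQ
static inline void arch_clear_rq(struct request *rq)
{
	memset(rq, 0, sizeof(*rq));
}
#endif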
On Fri, Feb 08, 2008 at 08:59:55AM +0100, Jens Axboe wrote:
> > > > At least they reported it to be the most efficient scheme in their
> > > > testing, and Dave thought that migrating completions out to submitters
> > > > might be a bottleneck in some cases.
> > >
> > > More so than migrating submitters to completers? The advantage of only
> > > movign submitters is that you get rid of the completion locking. Apart
> > > from that, the cost should be the same, especially for the thread based
> > > solution.
> >
> > Not specifically for the block layer, but higher layers like xfs.
>
> True, but that's parallel to the initial statement - that migrating
> completers is most costly than migrating submitters. So I'd like Dave to
> expand on why he thinks that migrating completers it more costly than
> submitters, APART from the locking associated with adding the request to
> a remote CPU list.
What I think Nick is referring to is the comments I made that at a
higher layer (e.g. filesystems) migrating completions to the
submitter CPU may be exactly the wrong thing to do. I don't recall
making any comments on migrating submitters - I think others have
already commented on that so I'll ignore that for the moment and
try to explain why completion on submitter CPU /may/ be bad.
For example, in the case of XFS it is fine for data I/O but it is
wrong for transaction I/O completion. We want to direct all
transaction completions to as few CPUs as possible (one, ideally) so
that all the completion processing happens on the same CPU, rather
than bouncing global cachelines and locks between all the CPUs
taking completion interrupts.
In more detail, the XFS transaction subsystem is asynchronous.
We submit the transaction I/O on the CPU that creates the
transaction so the I/O can come from any CPU in the system. If we
then farm the completion processing out to the submission CPU, that
will push it all over the machine and guarantee that we bounce all
of the XFS transaction log structures and locks all over the machine
on completion as well as submission (right now it's lots of
submission CPUs, few completion CPUs).
An example how bad this can get - this patch:
http://oss.sgi.com/archives/xfs/2007-11/msg00217.html
which prevents simultaneous access to the items tracked in the log
during transaction reservation. Having several hundred CPUs trying
to hit this list at once is really bad for performance - the test
app on the 2048p machine that saw this problem went from ~5500s
runtime down to 9s with the above patch.
I use this example because the transaction I/O completion touches
exactly the same list, locks and structures but is limited in
distribution (and therefore contention) by the number of
simultaneous I/O completion CPUs. Doing completion on the submitter
CPU will cause much wider distribution of completion processing and
introduce exactly the same issues as the transaction reservation
side had.
As it goes, with large, multi-device volumes (e.g. big stripe) we
already see issues with simultaneous completion processing (e.g. the
8p machine mentioned in the above link), so I'd really like to avoid
making these problems worse....
Cheers,
Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group
Sorry for delay,
On 02/07, Andrew Morton wrote:
>
> On Thu, 7 Feb 2008 10:18:59 +0100 Jens Axboe <[email protected]> wrote:
>
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -175,6 +175,21 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
> > }
> > EXPORT_SYMBOL_GPL(queue_work);
> >
> > +int fastcall queue_work_on_cpu(struct workqueue_struct *wq,
> > + struct work_struct *work, int cpu)
> > +{
> > + int ret = 0;
> > +
> > + if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
> > + BUG_ON(!list_empty(&work->entry));
> > + __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
> > + ret = 1;
> > + }
> > +
> > + return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(queue_work_on_cpu);
>
> Might as well change queue_work() to call this?
This is possible, but in that case queue_work_on_cpu() should use wq_per_cpu(),
not per_cpu_ptr(). (otherwise queue_work(single_threaded_wq) won't work).
A bit off-topic, the comment near queue_work() says
* We queue the work to the CPU it was submitted, but there is no
* guarantee that it will be processed by that CPU.
This is wrong. Unless cpu_down() happens, we do guarantee it will be processed
by that CPU. Perhaps it makes sense to fix the comment as well?
Oleg.
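(A minimal sketch, not a patch from this thread, of what Oleg is suggesting:
have queue_work_on_cpu() resolve the cpu_workqueue through wq_per_cpu() so that
single-threaded workqueues keep working, and then let queue_work() become a
wrapper around it, per Andrew's remark. Assumes the kernel/workqueue.c helpers
of that era; untested.)

int queue_work_on_cpu(struct workqueue_struct *wq,
		      struct work_struct *work, int cpu)
{
	int ret = 0;

	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
		BUG_ON(!list_empty(&work->entry));
		/* wq_per_cpu() redirects single-threaded wqs to singlethread_cpu */
		__queue_work(wq_per_cpu(wq, cpu), work);
		ret = 1;
	}
	return ret;
}

int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
	int ret;

	/* queue on the local CPU, with preemption disabled while queueing */
	ret = queue_work_on_cpu(wq, work, get_cpu());
	put_cpu();
	return ret;
}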
On Mon, Feb 11, 2008 at 04:22:11PM +1100, David Chinner wrote:
>
> What I think Nick is referring to is the comments I made that at a
> higher layer (e.g. filesystems) migrating completions to the
> submitter CPU may be exactly the wrong thing to do. I don't recall
> making any comments on migrating submitters - I think others have
> already commented on that so I'll ignore that for the moment and
> try to explain why completion on submitter CPU /may/ be bad.
>
> For example, in the case of XFS it is fine for data I/O but it is
> wrong for transaction I/O completion. We want to direct all
> transaction completions to as few CPUs as possible (one, ideally) so
> that all the completion processing happens on the same CPU, rather
> than bouncing global cachelines and locks between all the CPUs
> taking completion interrupts.
So what you want is all XFS processing (for a given filesystem,
presumably) on a limited set of cores (ideally 1) and all block
and SCSI processing (for a given device) on a similarly limited
set.
On Altix, that was far more important than having the interrupt
and issue CPU be close to the hardware -- at least with typical
LSI or Qlogic controllers where there are only one or two MMIO
reads per command issued, and completions can be stacked up.
There is still an advantage to being close to the hardware, but
a much bigger advantage to not bouncing cachelines.
Maybe what you want is a multistage completion mechanism where
each stage can run on a different CPU, if thread context switches
are cheaper than bouncing data structures around....
jeremy