2012-02-14 18:12:53

by Tejun Heo

[permalink] [raw]
Subject: [PATCH block/for-linus 1/3] block: replace icq->changed with icq->flags

icq->changed was used for ICQ_*_CHANGED bits. Rename it to flags and
access it under ioc->lock instead of using atomic bitops.
icq_get_changed() is added so that the changed part can be fetched and
cleared as before.

icq->flags will be used to carry other flags.

Signed-off-by: Tejun Heo <[email protected]>
---
Jens, these three patches are to fix the swap heavy workload
regression reported by Shaohua.

Shaohua, I ended up changing the second patch, which shouldn't change
anything but it would be great if you can verify this series once
more.

Thank you!

block/blk-ioc.c | 30 ++++++++++++++++++++++++++----
block/cfq-iosched.c | 12 ++++++------
include/linux/iocontext.h | 9 ++++++---
3 files changed, 38 insertions(+), 13 deletions(-)

Index: work/block/blk-ioc.c
===================================================================
--- work.orig/block/blk-ioc.c
+++ work/block/blk-ioc.c
@@ -363,13 +363,13 @@ struct io_cq *ioc_create_icq(struct requ
return icq;
}

-void ioc_set_changed(struct io_context *ioc, int which)
+void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
{
struct io_cq *icq;
struct hlist_node *n;

hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
- set_bit(which, &icq->changed);
+ icq->flags |= flags;
}

/**
@@ -387,7 +387,7 @@ void ioc_ioprio_changed(struct io_contex

spin_lock_irqsave(&ioc->lock, flags);
ioc->ioprio = ioprio;
- ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
+ ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
spin_unlock_irqrestore(&ioc->lock, flags);
}

@@ -404,11 +404,33 @@ void ioc_cgroup_changed(struct io_contex
unsigned long flags;

spin_lock_irqsave(&ioc->lock, flags);
- ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
+ ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
spin_unlock_irqrestore(&ioc->lock, flags);
}
EXPORT_SYMBOL(ioc_cgroup_changed);

+/**
+ * icq_get_changed - fetch and clear icq changed mask
+ * @icq: icq of interest
+ *
+ * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases
+ * @icq->ioc->lock.
+ */
+unsigned icq_get_changed(struct io_cq *icq)
+{
+ unsigned int changed = 0;
+ unsigned long flags;
+
+ if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
+ spin_lock_irqsave(&icq->ioc->lock, flags);
+ changed = icq->flags & ICQ_CHANGED_MASK;
+ icq->flags &= ~ICQ_CHANGED_MASK;
+ spin_unlock_irqrestore(&icq->ioc->lock, flags);
+ }
+ return changed;
+}
+EXPORT_SYMBOL(icq_get_changed);
+
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
Index: work/block/cfq-iosched.c
===================================================================
--- work.orig/block/cfq-iosched.c
+++ work/block/cfq-iosched.c
@@ -3470,20 +3470,20 @@ cfq_set_request(struct request_queue *q,
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
+ unsigned int changed;

might_sleep_if(gfp_mask & __GFP_WAIT);

spin_lock_irq(q->queue_lock);

/* handle changed notifications */
- if (unlikely(cic->icq.changed)) {
- if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
- changed_ioprio(cic);
+ changed = icq_get_changed(&cic->icq);
+ if (unlikely(changed & ICQ_IOPRIO_CHANGED))
+ changed_ioprio(cic);
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
- changed_cgroup(cic);
+ if (unlikely(changed & ICQ_CGROUP_CHANGED))
+ changed_cgroup(cic);
#endif
- }

new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
Index: work/include/linux/iocontext.h
===================================================================
--- work.orig/include/linux/iocontext.h
+++ work/include/linux/iocontext.h
@@ -6,8 +6,10 @@
#include <linux/workqueue.h>

enum {
- ICQ_IOPRIO_CHANGED,
- ICQ_CGROUP_CHANGED,
+ ICQ_IOPRIO_CHANGED = 1 << 0,
+ ICQ_CGROUP_CHANGED = 1 << 1,
+
+ ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED,
};

/*
@@ -88,7 +90,7 @@ struct io_cq {
struct rcu_head __rcu_head;
};

- unsigned long changed;
+ unsigned int flags;
};

/*
@@ -139,6 +141,7 @@ struct io_context *get_task_io_context(s
gfp_t gfp_flags, int node);
void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
void ioc_cgroup_changed(struct io_context *ioc);
+unsigned int icq_get_changed(struct io_cq *icq);
#else
struct io_context;
static inline void put_io_context(struct io_context *ioc) { }


2012-02-14 18:16:29

by Tejun Heo

[permalink] [raw]
Subject: [PATCH block/for-linus 2/3] block: simplify ioc_release_fn()

Reverse double lock dancing in ioc_release_fn() can be simplified by
just using trylock on the queue_lock and back out from ioc lock on
trylock failure. Simplify it.

Signed-off-by: Tejun Heo <[email protected]>
---
block/blk-ioc.c | 44 +++++++++-----------------------------------
1 file changed, 9 insertions(+), 35 deletions(-)

Index: work/block/blk-ioc.c
===================================================================
--- work.orig/block/blk-ioc.c
+++ work/block/blk-ioc.c
@@ -79,7 +79,6 @@ static void ioc_release_fn(struct work_s
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
- struct request_queue *last_q = NULL;
unsigned long flags;

/*
@@ -93,44 +92,19 @@ static void ioc_release_fn(struct work_s
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
struct io_cq, ioc_node);
- struct request_queue *this_q = icq->q;
+ struct request_queue *q = icq->q;

- if (this_q != last_q) {
- /*
- * Need to switch to @this_q. Once we release
- * @ioc->lock, it can go away along with @cic.
- * Hold on to it.
- */
- __blk_get_queue(this_q);
-
- /*
- * blk_put_queue() might sleep thanks to kobject
- * idiocy. Always release both locks, put and
- * restart.
- */
- if (last_q) {
- spin_unlock(last_q->queue_lock);
- spin_unlock_irqrestore(&ioc->lock, flags);
- blk_put_queue(last_q);
- } else {
- spin_unlock_irqrestore(&ioc->lock, flags);
- }
-
- last_q = this_q;
- spin_lock_irqsave(this_q->queue_lock, flags);
- spin_lock_nested(&ioc->lock, 1);
- continue;
+ if (spin_trylock(q->queue_lock)) {
+ ioc_exit_icq(icq);
+ spin_unlock(q->queue_lock);
+ } else {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ cpu_relax();
+ spin_lock_irqsave_nested(&ioc->lock, flags, 1);
}
- ioc_exit_icq(icq);
}

- if (last_q) {
- spin_unlock(last_q->queue_lock);
- spin_unlock_irqrestore(&ioc->lock, flags);
- blk_put_queue(last_q);
- } else {
- spin_unlock_irqrestore(&ioc->lock, flags);
- }
+ spin_unlock_irqrestore(&ioc->lock, flags);

kmem_cache_free(iocontext_cachep, ioc);
}

2012-02-14 18:18:07

by Tejun Heo

[permalink] [raw]
Subject: [PATCH block/for-linus 3/3] block: exit_io_context() should call elevator_exit_icq_fn()

While updating locking, b2efa05265 "block, cfq: unlink
cfq_io_context's immediately" moved elevator_exit_icq_fn() invocation
from exit_io_context() to the final ioc put. While this doesn't cause
catastrophic failure, it effectively removes task exit notification to
the elevator and causes noticeable IO performance degradation with CFQ.

On task exit, CFQ used to immediately expire the slice if it was being
used by the exiting task as no more IO would be issued by the task;
however, after b2efa05265, the notification is lost and disk could sit
idle needlessly, leading to noticeable IO performance degradation for
certain workloads.

This patch renames ioc_exit_icq() to ioc_destroy_icq(), separates
elevator_exit_icq_fn() invocation into ioc_exit_icq() and invokes it
from exit_io_context(). ICQ_EXITED flag is added to avoid invoking
the callback more than once for the same icq.

Walking icq_list from ioc side and invoking elevator callback requires
reverse double locking. This may be better implemented using RCU;
unfortunately, using RCU isn't trivial. e.g. RCU protection would
need to cover request_queue and queue_lock switch on cleanup makes
grabbing queue_lock from RCU unsafe. Reverse double locking should
do, at least for now.

Signed-off-by: Tejun Heo <[email protected]>
Reported-and-bisected-by: Shaohua Li <[email protected]>
LKML-Reference: <CANejiEVzs=pUhQSTvUppkDcc2TNZyfohBRLygW5zFmXyk5A-xQ@mail.gmail.com>
---
block/blk-ioc.c | 55 +++++++++++++++++++++++++++++++++++++++-------
include/linux/iocontext.h | 1
2 files changed, 48 insertions(+), 8 deletions(-)

Index: work/block/blk-ioc.c
===================================================================
--- work.orig/block/blk-ioc.c
+++ work/block/blk-ioc.c
@@ -36,11 +36,23 @@ static void icq_free_icq_rcu(struct rcu_
kmem_cache_free(icq->__rcu_icq_cache, icq);
}

-/*
- * Exit and free an icq. Called with both ioc and q locked.
- */
+/* Exit an icq. Called with both ioc and q locked. */
static void ioc_exit_icq(struct io_cq *icq)
{
+ struct elevator_type *et = icq->q->elevator->type;
+
+ if (icq->flags & ICQ_EXITED)
+ return;
+
+ if (et->ops.elevator_exit_icq_fn)
+ et->ops.elevator_exit_icq_fn(icq);
+
+ icq->flags |= ICQ_EXITED;
+}
+
+/* Release an icq. Called with both ioc and q locked. */
+static void ioc_destroy_icq(struct io_cq *icq)
+{
struct io_context *ioc = icq->ioc;
struct request_queue *q = icq->q;
struct elevator_type *et = q->elevator->type;
@@ -60,8 +72,7 @@ static void ioc_exit_icq(struct io_cq *i
if (rcu_dereference_raw(ioc->icq_hint) == icq)
rcu_assign_pointer(ioc->icq_hint, NULL);

- if (et->ops.elevator_exit_icq_fn)
- et->ops.elevator_exit_icq_fn(icq);
+ ioc_exit_icq(icq);

/*
* @icq->q might have gone away by the time RCU callback runs
@@ -95,7 +106,7 @@ static void ioc_release_fn(struct work_s
struct request_queue *q = icq->q;

if (spin_trylock(q->queue_lock)) {
- ioc_exit_icq(icq);
+ ioc_destroy_icq(icq);
spin_unlock(q->queue_lock);
} else {
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -142,13 +153,41 @@ EXPORT_SYMBOL(put_io_context);
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
+ struct io_cq *icq;
+ struct hlist_node *n;
+ unsigned long flags;

task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);

- atomic_dec(&ioc->nr_tasks);
+ if (!atomic_dec_and_test(&ioc->nr_tasks)) {
+ put_io_context(ioc);
+ return;
+ }
+
+ /*
+ * Need ioc lock to walk icq_list and q lock to exit icq. Perform
+ * reverse double locking. Read comment in ioc_release_fn() for
+ * explanation on the nested locking annotation.
+ */
+retry:
+ spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) {
+ if (icq->flags & ICQ_EXITED)
+ continue;
+ if (spin_trylock(icq->q->queue_lock)) {
+ ioc_exit_icq(icq);
+ spin_unlock(icq->q->queue_lock);
+ } else {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ cpu_relax();
+ goto retry;
+ }
+ }
+ spin_unlock_irqrestore(&ioc->lock, flags);
+
put_io_context(ioc);
}

@@ -168,7 +207,7 @@ void ioc_clear_queue(struct request_queu
struct io_context *ioc = icq->ioc;

spin_lock(&ioc->lock);
- ioc_exit_icq(icq);
+ ioc_destroy_icq(icq);
spin_unlock(&ioc->lock);
}
}
Index: work/include/linux/iocontext.h
===================================================================
--- work.orig/include/linux/iocontext.h
+++ work/include/linux/iocontext.h
@@ -8,6 +8,7 @@
enum {
ICQ_IOPRIO_CHANGED = 1 << 0,
ICQ_CGROUP_CHANGED = 1 << 1,
+ ICQ_EXITED = 1 << 2,

ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED,
};

2012-02-15 01:09:41

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH block/for-linus 1/3] block: replace icq->changed with icq->flags

2012/2/15 Tejun Heo <[email protected]>:
> icq->changed was used for ICQ_*_CHANGED bits. Rename it to flags and
> access it under ioc->lock instead of using atomic bitops.
> ioc_get_changed() is added so that the changed part can be fetched and
> cleared as before.
>
> icq->flags will be used to carry other flags.
>
> Signed-off-by: Tejun Heo <[email protected]>
> ---
> Jens, these three patches are to fix the swap heavy workload
> regression reported by Shaohua.
>
> Shaohua, I ended up changing the second patch, which shouldn't change
> anything but it would be great if you can verify this series once
> more.
yes, confirmed the patches fix the regression. Thanks for fixing it.
Tested-by: Shaohua Li <[email protected]>

2012-02-15 08:43:51

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH block/for-linus 1/3] block: replace icq->changed with icq->flags

On 2012-02-15 02:09, Shaohua Li wrote:
> 2012/2/15 Tejun Heo <[email protected]>:
>> icq->changed was used for ICQ_*_CHANGED bits. Rename it to flags and
>> access it under ioc->lock instead of using atomic bitops.
>> ioc_get_changed() is added so that the changed part can be fetched and
>> cleared as before.
>>
>> icq->flags will be used to carry other flags.
>>
>> Signed-off-by: Tejun Heo <[email protected]>
>> ---
>> Jens, these three patches are to fix the swap heavy workload
>> regression reported by Shaohua.
>>
>> Shaohua, I ended up changing the second patch, which shouldn't change
>> anything but it would be great if you can verify this series once
>> more.
> yes, confirmed the patches fix the regression. Thanks for fixing it.
> Tested-by: Shaohua Li <[email protected]>

Great, thanks Tejun - and Shaohua for finding/reporting/testing. I'll
get this applied.

--
Jens Axboe