2014-06-08 08:35:46

by Mike Galbraith

Subject: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat


[ 172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
[ 172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
[ 172.743117] 2 locks held by rcuos/2/26:
[ 172.743128] #0: (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
[ 172.743135] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
[ 172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[ 172.743138]
[ 172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
[ 172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[ 172.743148] ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
[ 172.743151] ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
[ 172.743154] ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
[ 172.743155] Call Trace:
[ 172.743160] [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
[ 172.743163] [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
[ 172.743167] [<ffffffff81589304>] rt_spin_lock+0x24/0x70
[ 172.743171] [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
[ 172.743174] [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
[ 172.743177] [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[ 172.743180] [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
[ 172.743183] [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[ 172.743185] [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
[ 172.743189] [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
[ 172.743192] [<ffffffff8106e046>] kthread+0xd6/0xf0
[ 172.743194] [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
[ 172.743197] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
[ 172.743200] [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
[ 172.743203] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70

crash> gdb list *percpu_ref_kill_rcu+0x1b4
0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
164 pcpu_count = ACCESS_ONCE(ref->pcpu_count);
165
166 if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
167 __this_cpu_dec(*pcpu_count);
168 else if (unlikely(atomic_dec_and_test(&ref->count)))
169 ref->release(ref);
170
171 rcu_read_unlock_sched();
172 }
173

Ok, so ->release() can't do anything that might take a sleeping lock,
but in an -rt kernel, that's exactly what it does.
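
(For readers not steeped in -rt: on PREEMPT_RT, spinlock_t is backed by
an rtmutex and may sleep, raw_spinlock_t keeps the mainline spinning
behaviour, and rcu_read_lock_sched() disables preemption. A minimal
illustrative sketch of the constraint -- mine, not part of the patch:)

#include <linux/spinlock.h>
#include <linux/rcupdate.h>

static DEFINE_SPINLOCK(sleepy_lock);	/* -rt: rtmutex underneath, may sleep */
static DEFINE_RAW_SPINLOCK(raw_lock);	/* -rt: still a true spinning lock */

static void release_path_sketch(void)
{
	rcu_read_lock_sched();			/* preemption disabled from here */

	spin_lock(&sleepy_lock);		/* -rt: rt_spin_lock() -> the
						 * might_sleep() splat above */
	spin_unlock(&sleepy_lock);

	raw_spin_lock(&raw_lock);		/* fine: never sleeps, even on -rt */
	raw_spin_unlock(&raw_lock);

	rcu_read_unlock_sched();
}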

Convert struct kioctx ctx_lock/completion_lock to raw_spinlock_t, and
defer final free to a time when we're not under rcu_read_lock_sched().

runltp -f ltp-aio-stress.part1 runs kernel/ltp gripe-free.

INFO: ltp-pan reported all tests PASS
LTP Version: 20140422

###############################################################

Done executing testcases.
LTP Version: 20140422
###############################################################


Signed-off-by: Mike Galbraith <[email protected]>
---
fs/aio.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 44 insertions(+), 17 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -125,7 +125,7 @@ struct kioctx {
} ____cacheline_aligned_in_smp;

struct {
- spinlock_t ctx_lock;
+ raw_spinlock_t ctx_lock;
struct list_head active_reqs; /* used for cancellation */
} ____cacheline_aligned_in_smp;

@@ -136,13 +136,16 @@ struct kioctx {

struct {
unsigned tail;
- spinlock_t completion_lock;
+ raw_spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;

struct page *internal_pages[AIO_RING_PAGES];
struct file *aio_ring_file;

unsigned id;
+#ifdef CONFIG_PREEMPT_RT_BASE
+ struct rcu_head rcu;
+#endif
};

/*------ sysctl variables----*/
@@ -334,11 +337,11 @@ static int aio_migratepage(struct addres
* while the old page is copied to the new. This prevents new
* events from being lost.
*/
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ raw_spin_lock_irqsave(&ctx->completion_lock, flags);
migrate_page_copy(new, old);
BUG_ON(ctx->ring_pages[idx] != old);
ctx->ring_pages[idx] = new;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);

/* The old page is no longer accessible. */
put_page(old);
@@ -461,14 +464,14 @@ void kiocb_set_cancel_fn(struct kiocb *r
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;

- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ raw_spin_lock_irqsave(&ctx->ctx_lock, flags);

if (!req->ki_list.next)
list_add(&req->ki_list, &ctx->active_reqs);

req->ki_cancel = cancel;

- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);

@@ -493,6 +496,7 @@ static int kiocb_cancel(struct kioctx *c
return cancel(kiocb);
}

+#ifndef CONFIG_PREEMPT_RT_BASE
static void free_ioctx(struct work_struct *work)
{
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -503,13 +507,36 @@ static void free_ioctx(struct work_struc
free_percpu(ctx->cpu);
kmem_cache_free(kioctx_cachep, ctx);
}
+#else
+static void free_ioctx_rcu(struct rcu_head *rcu)
+{
+ struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
+
+ pr_debug("freeing %p\n", ctx);
+
+ aio_free_ring(ctx);
+ free_percpu(ctx->cpu);
+ kmem_cache_free(kioctx_cachep, ctx);
+}
+#endif

static void free_ioctx_reqs(struct percpu_ref *ref)
{
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

+#ifndef CONFIG_PREEMPT_RT_BASE
INIT_WORK(&ctx->free_work, free_ioctx);
schedule_work(&ctx->free_work);
+#else
+ /*
+ * We're in ->release() under rcu_read_lock_sched(), and can't do
+ * anything that requires taking a sleeping lock, so ->release()
+ * becomes a two stage rcu process for -rt. We've now done the
+ * rcu work that we can under locks made raw to get us this far.
+ * Defer the freeing bit until we're again allowed to schedule().
+ */
+ call_rcu(&ctx->rcu, free_ioctx_rcu);
+#endif
}

/*
@@ -522,7 +549,7 @@ static void free_ioctx_users(struct perc
struct kioctx *ctx = container_of(ref, struct kioctx, users);
struct kiocb *req;

- spin_lock_irq(&ctx->ctx_lock);
+ raw_spin_lock_irq(&ctx->ctx_lock);

while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
@@ -532,7 +559,7 @@ static void free_ioctx_users(struct perc
kiocb_cancel(ctx, req);
}

- spin_unlock_irq(&ctx->ctx_lock);
+ raw_spin_unlock_irq(&ctx->ctx_lock);

percpu_ref_kill(&ctx->reqs);
percpu_ref_put(&ctx->reqs);
@@ -645,8 +672,8 @@ static struct kioctx *ioctx_alloc(unsign

ctx->max_reqs = nr_events;

- spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->completion_lock);
+ raw_spin_lock_init(&ctx->ctx_lock);
+ raw_spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock);
/* Protect against page migration throughout kiotx setup by keeping
* the ring_lock mutex held until setup is complete. */
@@ -948,9 +975,9 @@ void aio_complete(struct kiocb *iocb, lo
if (iocb->ki_list.next) {
unsigned long flags;

- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
list_del(&iocb->ki_list);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}

/*
@@ -958,7 +985,7 @@ void aio_complete(struct kiocb *iocb, lo
* ctx->completion_lock to prevent other code from messing with the tail
* pointer since we might be called from irq context.
*/
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ raw_spin_lock_irqsave(&ctx->completion_lock, flags);

tail = ctx->tail;
pos = tail + AIO_EVENTS_OFFSET;
@@ -993,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, lo
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);

- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);

pr_debug("added to ring %p at [%u]\n", iocb, tail);

@@ -1515,7 +1542,7 @@ static struct kiocb *lookup_kiocb(struct
{
struct list_head *pos;

- assert_spin_locked(&ctx->ctx_lock);
+ assert_raw_spin_locked(&ctx->ctx_lock);

if (key != KIOCB_KEY)
return NULL;
@@ -1555,7 +1582,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
if (unlikely(!ctx))
return -EINVAL;

- spin_lock_irq(&ctx->ctx_lock);
+ raw_spin_lock_irq(&ctx->ctx_lock);

kiocb = lookup_kiocb(ctx, iocb, key);
if (kiocb)
@@ -1563,7 +1590,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
else
ret = -EINVAL;

- spin_unlock_irq(&ctx->ctx_lock);
+ raw_spin_unlock_irq(&ctx->ctx_lock);

if (!ret) {
/*


2014-06-09 02:04:16

by Lai Jiangshan

Subject: Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat

Hi, rt-people

I don't think it is the correct direction.
Softirq (including local_bh_disable()) in RT kernel should be preemptible.

Fixing these problems by converting spinlock_t to raw_spinlock_t will
result in every spinlock_t used in RCU callbacks being converted, which
means almost all the spinlock_t in the kernel get converted.

Just a sudden and superficial thought.
Thanks
Lai

On 06/08/2014 04:35 PM, Mike Galbraith wrote:
>
> [ 172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
> [ 172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
> [ 172.743117] 2 locks held by rcuos/2/26:
> [ 172.743128] #0: (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
> [ 172.743135] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
> [ 172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [ 172.743138]
> [ 172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
> [ 172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [ 172.743148] ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
> [ 172.743151] ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
> [ 172.743154] ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
> [ 172.743155] Call Trace:
> [ 172.743160] [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
> [ 172.743163] [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
> [ 172.743167] [<ffffffff81589304>] rt_spin_lock+0x24/0x70
> [ 172.743171] [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
> [ 172.743174] [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [ 172.743177] [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [ 172.743180] [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [ 172.743183] [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [ 172.743185] [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
> [ 172.743189] [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
> [ 172.743192] [<ffffffff8106e046>] kthread+0xd6/0xf0
> [ 172.743194] [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
> [ 172.743197] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> [ 172.743200] [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
> [ 172.743203] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
>
> crash> gdb list *percpu_ref_kill_rcu+0x1b4
> 0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
> 164 pcpu_count = ACCESS_ONCE(ref->pcpu_count);
> 165
> 166 if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
> 167 __this_cpu_dec(*pcpu_count);
> 168 else if (unlikely(atomic_dec_and_test(&ref->count)))
> 169 ref->release(ref);
> 170
> 171 rcu_read_unlock_sched();
> 172 }
> 173
>
> Ok, so ->release() can't do anything that might take a sleeping lock,
> but in an -rt kernel, that's exactly what it does.
>
> Convert struct kioctx ctx_lock/completion_lock to raw_spinlock_t, and
> defer final free to a time when we're not under rcu_read_lock_sched().
>
> runltp -f ltp-aio-stress.part1 runs kernel/ltp gripe-free.
>
> INFO: ltp-pan reported all tests PASS
> LTP Version: 20140422
>
> ###############################################################
>
> Done executing testcases.
> LTP Version: 20140422
> ###############################################################
>
>
> Signed-off-by: Mike Galbraith <[email protected]>
> ---
> fs/aio.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 44 insertions(+), 17 deletions(-)
>
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -125,7 +125,7 @@ struct kioctx {
> } ____cacheline_aligned_in_smp;
>
> struct {
> - spinlock_t ctx_lock;
> + raw_spinlock_t ctx_lock;
> struct list_head active_reqs; /* used for cancellation */
> } ____cacheline_aligned_in_smp;
>
> @@ -136,13 +136,16 @@ struct kioctx {
>
> struct {
> unsigned tail;
> - spinlock_t completion_lock;
> + raw_spinlock_t completion_lock;
> } ____cacheline_aligned_in_smp;
>
> struct page *internal_pages[AIO_RING_PAGES];
> struct file *aio_ring_file;
>
> unsigned id;
> +#ifdef CONFIG_PREEMPT_RT_BASE
> + struct rcu_head rcu;
> +#endif
> };
>
> /*------ sysctl variables----*/
> @@ -334,11 +337,11 @@ static int aio_migratepage(struct addres
> * while the old page is copied to the new. This prevents new
> * events from being lost.
> */
> - spin_lock_irqsave(&ctx->completion_lock, flags);
> + raw_spin_lock_irqsave(&ctx->completion_lock, flags);
> migrate_page_copy(new, old);
> BUG_ON(ctx->ring_pages[idx] != old);
> ctx->ring_pages[idx] = new;
> - spin_unlock_irqrestore(&ctx->completion_lock, flags);
> + raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> /* The old page is no longer accessible. */
> put_page(old);
> @@ -461,14 +464,14 @@ void kiocb_set_cancel_fn(struct kiocb *r
> struct kioctx *ctx = req->ki_ctx;
> unsigned long flags;
>
> - spin_lock_irqsave(&ctx->ctx_lock, flags);
> + raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
>
> if (!req->ki_list.next)
> list_add(&req->ki_list, &ctx->active_reqs);
>
> req->ki_cancel = cancel;
>
> - spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> + raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> }
> EXPORT_SYMBOL(kiocb_set_cancel_fn);
>
> @@ -493,6 +496,7 @@ static int kiocb_cancel(struct kioctx *c
> return cancel(kiocb);
> }
>
> +#ifndef CONFIG_PREEMPT_RT_BASE
> static void free_ioctx(struct work_struct *work)
> {
> struct kioctx *ctx = container_of(work, struct kioctx, free_work);
> @@ -503,13 +507,36 @@ static void free_ioctx(struct work_struc
> free_percpu(ctx->cpu);
> kmem_cache_free(kioctx_cachep, ctx);
> }
> +#else
> +static void free_ioctx_rcu(struct rcu_head *rcu)
> +{
> + struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
> +
> + pr_debug("freeing %p\n", ctx);
> +
> + aio_free_ring(ctx);
> + free_percpu(ctx->cpu);
> + kmem_cache_free(kioctx_cachep, ctx);
> +}
> +#endif
>
> static void free_ioctx_reqs(struct percpu_ref *ref)
> {
> struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
>
> +#ifndef CONFIG_PREEMPT_RT_BASE
> INIT_WORK(&ctx->free_work, free_ioctx);
> schedule_work(&ctx->free_work);
> +#else
> + /*
> + * We're in ->release() under rcu_read_lock_sched(), and can't do
> + * anything that requires taking a sleeping lock, so ->release()
> + * becomes a two stage rcu process for -rt. We've now done the
> + * rcu work that we can under locks made raw to get us this far.
> + * Defer the freeing bit until we're again allowed to schedule().
> + */
> + call_rcu(&ctx->rcu, free_ioctx_rcu);
> +#endif
> }
>
> /*
> @@ -522,7 +549,7 @@ static void free_ioctx_users(struct perc
> struct kioctx *ctx = container_of(ref, struct kioctx, users);
> struct kiocb *req;
>
> - spin_lock_irq(&ctx->ctx_lock);
> + raw_spin_lock_irq(&ctx->ctx_lock);
>
> while (!list_empty(&ctx->active_reqs)) {
> req = list_first_entry(&ctx->active_reqs,
> @@ -532,7 +559,7 @@ static void free_ioctx_users(struct perc
> kiocb_cancel(ctx, req);
> }
>
> - spin_unlock_irq(&ctx->ctx_lock);
> + raw_spin_unlock_irq(&ctx->ctx_lock);
>
> percpu_ref_kill(&ctx->reqs);
> percpu_ref_put(&ctx->reqs);
> @@ -645,8 +672,8 @@ static struct kioctx *ioctx_alloc(unsign
>
> ctx->max_reqs = nr_events;
>
> - spin_lock_init(&ctx->ctx_lock);
> - spin_lock_init(&ctx->completion_lock);
> + raw_spin_lock_init(&ctx->ctx_lock);
> + raw_spin_lock_init(&ctx->completion_lock);
> mutex_init(&ctx->ring_lock);
> /* Protect against page migration throughout kiotx setup by keeping
> * the ring_lock mutex held until setup is complete. */
> @@ -948,9 +975,9 @@ void aio_complete(struct kiocb *iocb, lo
> if (iocb->ki_list.next) {
> unsigned long flags;
>
> - spin_lock_irqsave(&ctx->ctx_lock, flags);
> + raw_spin_lock_irqsave(&ctx->ctx_lock, flags);
> list_del(&iocb->ki_list);
> - spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> + raw_spin_unlock_irqrestore(&ctx->ctx_lock, flags);
> }
>
> /*
> @@ -958,7 +985,7 @@ void aio_complete(struct kiocb *iocb, lo
> * ctx->completion_lock to prevent other code from messing with the tail
> * pointer since we might be called from irq context.
> */
> - spin_lock_irqsave(&ctx->completion_lock, flags);
> + raw_spin_lock_irqsave(&ctx->completion_lock, flags);
>
> tail = ctx->tail;
> pos = tail + AIO_EVENTS_OFFSET;
> @@ -993,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, lo
> kunmap_atomic(ring);
> flush_dcache_page(ctx->ring_pages[0]);
>
> - spin_unlock_irqrestore(&ctx->completion_lock, flags);
> + raw_spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> pr_debug("added to ring %p at [%u]\n", iocb, tail);
>
> @@ -1515,7 +1542,7 @@ static struct kiocb *lookup_kiocb(struct
> {
> struct list_head *pos;
>
> - assert_spin_locked(&ctx->ctx_lock);
> + assert_raw_spin_locked(&ctx->ctx_lock);
>
> if (key != KIOCB_KEY)
> return NULL;
> @@ -1555,7 +1582,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
> if (unlikely(!ctx))
> return -EINVAL;
>
> - spin_lock_irq(&ctx->ctx_lock);
> + raw_spin_lock_irq(&ctx->ctx_lock);
>
> kiocb = lookup_kiocb(ctx, iocb, key);
> if (kiocb)
> @@ -1563,7 +1590,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t
> else
> ret = -EINVAL;
>
> - spin_unlock_irq(&ctx->ctx_lock);
> + raw_spin_unlock_irq(&ctx->ctx_lock);
>
> if (!ret) {
> /*
>
>

2014-06-09 03:17:17

by Mike Galbraith

Subject: Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat

On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> Hi, rt-people
>
> I don't think it is the correct direction.

Yup, it's a band-aid, ergo RFC.

-Mike

2014-06-09 06:22:14

by Mike Galbraith

Subject: Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat

On Mon, 2014-06-09 at 05:17 +0200, Mike Galbraith wrote:
> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> > Hi, rt-people
> >
> > I don't think it is the correct direction.
>
> Yup, it's a band-aid, ergo RFC.

Making aio play by the rules is safe. Another option is to bend the
rules a little. With the below, the ltp aio-stress testcases haven't yet
griped or exploded either... which somehow isn't quite as comforting as
"is safe" :)

---
fs/aio.c | 6 ++++++
1 file changed, 6 insertions(+)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -509,7 +509,9 @@ static void free_ioctx_reqs(struct percp
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

INIT_WORK(&ctx->free_work, free_ioctx);
+ preempt_enable_rt();
schedule_work(&ctx->free_work);
+ preempt_disable_rt();
}

/*
@@ -522,7 +524,9 @@ static void free_ioctx_users(struct perc
struct kioctx *ctx = container_of(ref, struct kioctx, users);
struct kiocb *req;

+ preempt_enable_rt();
spin_lock_irq(&ctx->ctx_lock);
+ local_irq_disable_rt();

while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
@@ -536,6 +540,8 @@ static void free_ioctx_users(struct perc

percpu_ref_kill(&ctx->reqs);
percpu_ref_put(&ctx->reqs);
+ preempt_disable_rt();
+ local_irq_enable_rt();
}

static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)

2014-06-09 09:05:48

by Pavel Vasilyev

Subject: Re: [RFC PATCH] rt/aio: fix rcu garbage collection might_sleep() splat

09.06.2014 10:22, Mike Galbraith wrote:
> On Mon, 2014-06-09 at 05:17 +0200, Mike Galbraith wrote:
>> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
>>> Hi, rt-people

> @@ -522,7 +524,9 @@ static void free_ioctx_users(struct perc
> struct kioctx *ctx = container_of(ref, struct kioctx, users);
> struct kiocb *req;
>
> + preempt_enable_rt();
> spin_lock_irq(&ctx->ctx_lock);
> + local_irq_disable_rt();
>
> while (!list_empty(&ctx->active_reqs)) {
> req = list_first_entry(&ctx->active_reqs,
> @@ -536,6 +540,8 @@ static void free_ioctx_users(struct perc
>
> percpu_ref_kill(&ctx->reqs);
> percpu_ref_put(&ctx->reqs);
> + preempt_disable_rt();
> + local_irq_enable_rt();


I think the enable_/disable_ calls must mirror each other:


preempt_enable_rt();
local_irq_disable_rt();

do_something();

local_irq_enable_rt();
preempt_disable_rt();
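
(Applied to the free_ioctx_users() hunk above, the mirrored teardown
would read -- just a sketch of the suggestion:)

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
	local_irq_enable_rt();
	preempt_disable_rt();
}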



--

Pavel.

2014-06-10 03:47:35

by Mike Galbraith

Subject: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> Hi, rt-people
>
> I don't think it is the correct direction.
> Softirq (including local_bh_disable()) in RT kernel should be preemptible.

How about the below then?

I was sorely tempted to post a tiny variant that dropped taking ctx_lock
in free_ioctx_users() entirely, as someone diddling with no reference
didn't make sense. Cc Ben, he would know.

[ 172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
[ 172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
[ 172.743117] 2 locks held by rcuos/2/26:
[ 172.743128] #0: (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
[ 172.743135] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
[ 172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[ 172.743138]
[ 172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
[ 172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[ 172.743148] ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
[ 172.743151] ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
[ 172.743154] ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
[ 172.743155] Call Trace:
[ 172.743160] [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
[ 172.743163] [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
[ 172.743167] [<ffffffff81589304>] rt_spin_lock+0x24/0x70
[ 172.743171] [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
[ 172.743174] [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
[ 172.743177] [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[ 172.743180] [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
[ 172.743183] [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
[ 172.743185] [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
[ 172.743189] [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
[ 172.743192] [<ffffffff8106e046>] kthread+0xd6/0xf0
[ 172.743194] [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
[ 172.743197] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
[ 172.743200] [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
[ 172.743203] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70

crash> gdb list *percpu_ref_kill_rcu+0x1b4
0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
164 pcpu_count = ACCESS_ONCE(ref->pcpu_count);
165
166 if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
167 __this_cpu_dec(*pcpu_count);
168 else if (unlikely(atomic_dec_and_test(&ref->count)))
169 ref->release(ref);
170
171 rcu_read_unlock_sched();
172 }
173

ref->release() path can't take sleeping locks, but in an rt kernel it does.
Defer ->release() work via call_rcu() to move sleeping locks out from under
rcu_read_lock_sched().

Signed-off-by: Mike Galbraith <[email protected]>
---
fs/aio.c | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -110,8 +110,6 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;

- struct work_struct free_work;
-
struct {
/*
* This counts the number of available slots in the ringbuffer,
@@ -143,6 +141,7 @@ struct kioctx {
struct file *aio_ring_file;

unsigned id;
+ struct rcu_head rcu;
};

/*------ sysctl variables----*/
@@ -493,9 +492,9 @@ static int kiocb_cancel(struct kioctx *c
return cancel(kiocb);
}

-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_rcu(struct rcu_head *rcu)
{
- struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+ struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);

pr_debug("freeing %p\n", ctx);

@@ -508,18 +507,12 @@ static void free_ioctx_reqs(struct percp
{
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);

- INIT_WORK(&ctx->free_work, free_ioctx);
- schedule_work(&ctx->free_work);
+ call_rcu(&ctx->rcu, free_ioctx_rcu);
}

-/*
- * When this function runs, the kioctx has been removed from the "hash table"
- * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
- * now it's safe to cancel any that need to be.
- */
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_rcu(struct rcu_head *rcu)
{
- struct kioctx *ctx = container_of(ref, struct kioctx, users);
+ struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
struct kiocb *req;

spin_lock_irq(&ctx->ctx_lock);
@@ -538,6 +531,25 @@ static void free_ioctx_users(struct perc
percpu_ref_put(&ctx->reqs);
}

+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
+ *
+ * NOTE: -rt must defer this to avoid taking sleeping ctx_lock while under
+ * rcu_read_lock_sched() during ->release().
+ */
+static void free_ioctx_users(struct percpu_ref *ref)
+{
+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+ call_rcu(&ctx->rcu, free_ioctx_users_rcu);
+#else
+ free_ioctx_users_rcu(&ctx->rcu);
+#endif
+}
+
static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
unsigned i, new_nr;


2014-06-10 17:50:05

by Benjamin LaHaise

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> > Hi, rt-people
> >
> > I don't think it is the correct direction.
> > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
>
> How about the below then?
>
> I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> in free_ioctx_users() entirely, as someone diddling with no reference
> didn't make sense. Cc Ben, he would know.

That should be okay... Let's ask Kent to chime in on whether this looks
safe to him on the percpu ref front as well, since he's the one who wrote
this code.

-ben


> [ 172.743098] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:768
> [ 172.743116] in_atomic(): 1, irqs_disabled(): 0, pid: 26, name: rcuos/2
> [ 172.743117] 2 locks held by rcuos/2/26:
> [ 172.743128] #0: (rcu_callback){.+.+..}, at: [<ffffffff810b1a12>] rcu_nocb_kthread+0x1e2/0x380
> [ 172.743135] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812acd26>] percpu_ref_kill_rcu+0xa6/0x1c0
> [ 172.743138] Preemption disabled at:[<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [ 172.743138]
> [ 172.743142] CPU: 0 PID: 26 Comm: rcuos/2 Not tainted 3.14.4-rt5 #31
> [ 172.743143] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [ 172.743148] ffff8802231aa190 ffff8802231a5d08 ffffffff81582e9e 0000000000000000
> [ 172.743151] ffff8802231a5d28 ffffffff81077aeb ffff880209f68140 ffff880209f681c0
> [ 172.743154] ffff8802231a5d48 ffffffff81589304 ffff880209f68000 ffff880209f68000
> [ 172.743155] Call Trace:
> [ 172.743160] [<ffffffff81582e9e>] dump_stack+0x4e/0x9c
> [ 172.743163] [<ffffffff81077aeb>] __might_sleep+0xfb/0x170
> [ 172.743167] [<ffffffff81589304>] rt_spin_lock+0x24/0x70
> [ 172.743171] [<ffffffff811c5790>] free_ioctx_users+0x30/0x130
> [ 172.743174] [<ffffffff812ace34>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [ 172.743177] [<ffffffff812acd26>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [ 172.743180] [<ffffffff812acc80>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [ 172.743183] [<ffffffff810b1a93>] rcu_nocb_kthread+0x263/0x380
> [ 172.743185] [<ffffffff810b1a12>] ? rcu_nocb_kthread+0x1e2/0x380
> [ 172.743189] [<ffffffff810b1830>] ? rcu_report_exp_rnp.isra.52+0xc0/0xc0
> [ 172.743192] [<ffffffff8106e046>] kthread+0xd6/0xf0
> [ 172.743194] [<ffffffff8158900c>] ? _raw_spin_unlock_irq+0x2c/0x70
> [ 172.743197] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
> [ 172.743200] [<ffffffff81591eec>] ret_from_fork+0x7c/0xb0
> [ 172.743203] [<ffffffff8106df70>] ? __kthread_parkme+0x70/0x70
>
> crash> gdb list *percpu_ref_kill_rcu+0x1b4
> 0xffffffff812ace34 is in percpu_ref_kill_rcu (include/linux/percpu-refcount.h:169).
> 164 pcpu_count = ACCESS_ONCE(ref->pcpu_count);
> 165
> 166 if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
> 167 __this_cpu_dec(*pcpu_count);
> 168 else if (unlikely(atomic_dec_and_test(&ref->count)))
> 169 ref->release(ref);
> 170
> 171 rcu_read_unlock_sched();
> 172 }
> 173
>
> ref->release() path can't take sleeping locks, but in an rt kernel it does.
> Defer ->release() work via call_rcu() to move sleeping locks out from under
> rcu_read_lock_sched().
>
> Signed-off-by: Mike Galbraith <[email protected]>
> ---
> fs/aio.c | 38 +++++++++++++++++++++++++-------------
> 1 file changed, 25 insertions(+), 13 deletions(-)
>
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -110,8 +110,6 @@ struct kioctx {
> struct page **ring_pages;
> long nr_pages;
>
> - struct work_struct free_work;
> -
> struct {
> /*
> * This counts the number of available slots in the ringbuffer,
> @@ -143,6 +141,7 @@ struct kioctx {
> struct file *aio_ring_file;
>
> unsigned id;
> + struct rcu_head rcu;
> };
>
> /*------ sysctl variables----*/
> @@ -493,9 +492,9 @@ static int kiocb_cancel(struct kioctx *c
> return cancel(kiocb);
> }
>
> -static void free_ioctx(struct work_struct *work)
> +static void free_ioctx_rcu(struct rcu_head *rcu)
> {
> - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
> + struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
>
> pr_debug("freeing %p\n", ctx);
>
> @@ -508,18 +507,12 @@ static void free_ioctx_reqs(struct percp
> {
> struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
>
> - INIT_WORK(&ctx->free_work, free_ioctx);
> - schedule_work(&ctx->free_work);
> + call_rcu(&ctx->rcu, free_ioctx_rcu);
> }
>
> -/*
> - * When this function runs, the kioctx has been removed from the "hash table"
> - * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
> - * now it's safe to cancel any that need to be.
> - */
> -static void free_ioctx_users(struct percpu_ref *ref)
> +static void free_ioctx_users_rcu(struct rcu_head *rcu)
> {
> - struct kioctx *ctx = container_of(ref, struct kioctx, users);
> + struct kioctx *ctx = container_of(rcu, struct kioctx, rcu);
> struct kiocb *req;
>
> spin_lock_irq(&ctx->ctx_lock);
> @@ -538,6 +531,25 @@ static void free_ioctx_users(struct perc
> percpu_ref_put(&ctx->reqs);
> }
>
> +/*
> + * When this function runs, the kioctx has been removed from the "hash table"
> + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
> + * now it's safe to cancel any that need to be.
> + *
> + * NOTE: -rt must defer this to avoid taking sleeping ctx_lock while under
> + * rcu_read_lock_sched() during ->release().
> + */
> +static void free_ioctx_users(struct percpu_ref *ref)
> +{
> + struct kioctx *ctx = container_of(ref, struct kioctx, users);
> +
> +#ifdef CONFIG_PREEMPT_RT_BASE
> + call_rcu(&ctx->rcu, free_ioctx_users_rcu);
> +#else
> + free_ioctx_users_rcu(&ctx->rcu);
> +#endif
> +}
> +
> static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
> {
> unsigned i, new_nr;
>
>

--
"Thought is the essence of where you are now."

2014-06-11 04:10:33

by Mike Galbraith

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Tue, 2014-06-10 at 13:50 -0400, Benjamin LaHaise wrote:
> On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> > > Hi, rt-people
> > >
> > > I don't think it is the correct direction.
> > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> >
> > How about the below then?
> >
> > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > in free_ioctx_users() entirely, as someone diddling with no reference
> > didn't make sense. Cc Ben, he would know.
>
> That should be okay... Let's ask Kent to chime in on whether this looks
> safe to him on the percpu ref front as well, since he's the one who wrote
> this code.

Looking at the gizzard of our in-tree user of kiocb_set_cancel_fn()
(gadget), cancel() leads to dequeue() methods, which take other sleeping
locks, so the tiniest variant is not an option; the patchlet stands.

-Mike

2014-06-12 20:25:37

by Kent Overstreet

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Tue, Jun 10, 2014 at 01:50:01PM -0400, Benjamin LaHaise wrote:
> On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> > > Hi, rt-people
> > >
> > > I don't think it is the correct direction.
> > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> >
> > How about the below then?
> >
> > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > in free_ioctx_users() entirely, as someone diddling with no reference
> > didn't make sense. Cc Ben, he would know.
>
> That should be okay... Let's ask Kent to chime in on whether this looks
> safe to him on the percpu ref front as well, since he's the one who wrote
> this code.

Ok, finally got around to reading the whole thread - honestly, I would just punt
to workqueue to do the free_ioctx_users(). AFAICT that should be perfectly safe
and even aside from rt it would be good change so we're not cancelling an
arbitrary number of kiocbs from rcu callback context.

Kind of a related change, it should be possible to just grab the entire list of
kiocbs with ctx_lock held (remove them all at once from ctx->active_reqs), then
cancel them without ctx_lock held.
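
(Concretely, a sketch of that second idea, combined with the workqueue
punt -- hypothetical code against fs/aio.c as shown earlier in the
thread, and note Ben's objection below that cancellation must in fact
stay under ctx_lock:)

static void free_ioctx_users_work(struct work_struct *work)
{
	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
	struct kiocb *req, *tmp;
	LIST_HEAD(reqs);

	spin_lock_irq(&ctx->ctx_lock);
	/* detach all active requests in one go */
	list_splice_init(&ctx->active_reqs, &reqs);
	spin_unlock_irq(&ctx->ctx_lock);

	/* cancel without ctx_lock held -- the contested part */
	list_for_each_entry_safe(req, tmp, &reqs, ki_list)
		kiocb_cancel(ctx, req);

	percpu_ref_kill(&ctx->reqs);
	percpu_ref_put(&ctx->reqs);
}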

2014-06-25 15:24:47

by Benjamin LaHaise

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Thu, Jun 12, 2014 at 01:26:02PM -0700, Kent Overstreet wrote:
> On Tue, Jun 10, 2014 at 01:50:01PM -0400, Benjamin LaHaise wrote:
> > On Tue, Jun 10, 2014 at 05:47:28AM +0200, Mike Galbraith wrote:
> > > On Mon, 2014-06-09 at 10:08 +0800, Lai Jiangshan wrote:
> > > > Hi, rt-people
> > > >
> > > > I don't think it is the correct direction.
> > > > Softirq (including local_bh_disable()) in RT kernel should be preemptible.
> > >
> > > How about the below then?
> > >
> > > I was sorely tempted to post a tiny variant that dropped taking ctx_lock
> > > in free_ioctx_users() entirely, as someone diddling with no reference
> > > didn't make sense. Cc Ben, he would know.
> >
> > That should be okay... Let's ask Kent to chime in on whether this looks
> > safe to him on the percpu ref front as well, since he's the one who wrote
> > this code.
>
> Ok, finally got around to reading the whole thread - honestly, I would just punt
> to workqueue to do the free_ioctx_users(). AFAICT that should be perfectly safe
> and even aside from rt it would be good change so we're not cancelling an
> arbitrary number of kiocbs from rcu callback context.

I finally have some time to look at this patch in detail. I'd rather do the
below variant that does what Kent suggested. Mike, can you confirm that
this fixes the issue you reported? It's on top of my current aio-next tree
at git://git.kvack.org/~bcrl/aio-next.git . If that's okay, I'll queue it
up. Does this bug fix need to end up in -stable kernels as well or would it
end up in the -rt tree?

> Kind of a related change, it should be possible to just grab the entire list of
> kiocbs with ctx_lock held (remove them all at once from ctx->active_reqs), then
> cancel them without ctx_lock held.

No, that is not safe. Cancellation has to be done under ctx_lock.

-ben
--
"Thought is the essence of where you are now."


diff --git a/fs/aio.c b/fs/aio.c
index c1d8c48..0b038f2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -526,9 +526,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
* and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
* now it's safe to cancel any that need to be.
*/
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_work(struct work_struct *work)
{
- struct kioctx *ctx = container_of(ref, struct kioctx, users);
+ struct kioctx *ctx = container_of(work, struct kioctx, free_work);
struct kiocb *req;

spin_lock_irq(&ctx->ctx_lock);
@@ -547,6 +547,18 @@ static void free_ioctx_users(struct percpu_ref *ref)
percpu_ref_put(&ctx->reqs);
}

+static void free_ioctx_users(struct percpu_ref *ref)
+{
+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
+#ifdef CONFIG_PREEMPT_RT_BAS
+ INIT_WORK(&ctx->free_work, free_ioctx_users_work);
+ schedule_work(&ctx->free_work);
+#else
+ free_ioctx_users_work(&ctx->free_work);
+#endif
+}
+
+
static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
unsigned i, new_nr;

2014-06-26 07:37:21

by Mike Galbraith

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

Hi Ben,

On Wed, 2014-06-25 at 11:24 -0400, Benjamin LaHaise wrote:

> I finally have some time to look at this patch in detail. I'd rather do the
> below variant that does what Kent suggested. Mike, can you confirm that
> this fixes the issue you reported? It's on top of my current aio-next tree
> at git://git.kvack.org/~bcrl/aio-next.git . If that's okay, I'll queue it
> up. Does this bug fix need to end up in -stable kernels as well or would it
> end up in the -rt tree?

It's an -rt specific problem, so presumably any fix would only go into
-rt trees until it manages to get merged.

I knew the intervening changes weren't likely to fix the might_sleep()
splat, but ran the test anyway, with the CONFIG_PREEMPT_RT_BASE typo
fixed up.
schedule_work() leads to an rtmutex, so -rt still has to ship that out
from under rcu_read_lock_sched().
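
(Rough picture of why schedule_work() itself splats on -rt, matching the
trace below -- a summary, not literal source:)

/*
 * free_ioctx_users()			percpu_ref ->release(), invoked in
 *   schedule_work()			RCU-callback context, preemption off
 *     queue_work_on()
 *       spin_lock(&pool->lock)		workqueue pool lock is a spinlock_t,
 *         rt_spin_lock()		i.e. an rtmutex on -rt
 *           might_sleep()		-> "BUG: sleeping function called ..."
 */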

marge:/usr/local/src/kernel/linux-3.14-rt # quilt applied|tail
patches/mm-memcg-make-refill_stock-use-get_cpu_light.patch
patches/printk-fix-lockdep-instrumentation-of-console_sem.patch
patches/aio-block-io_destroy-until-all-context-requests-are-completed.patch
patches/fs-aio-Remove-ctx-parameter-in-kiocb_cancel.patch
patches/aio-report-error-from-io_destroy-when-threads-race-in-io_destroy.patch
patches/aio-cleanup-flatten-kill_ioctx.patch
patches/aio-fix-aio-request-leak-when-events-are-reaped-by-userspace.patch
patches/aio-fix-kernel-memory-disclosure-in-io_getevents-introduced-in-v3.10.patch
patches/aio-change-exit_aio-to-load-mm-ioctx_table-once-and-avoid-rcu_read_lock.patch
patches/rt-aio-fix-rcu-garbage-collection-might_sleep-splat-ben.patch

[ 191.057656] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:792
[ 191.057672] in_atomic(): 1, irqs_disabled(): 0, pid: 22, name: rcuc/0
[ 191.057674] 2 locks held by rcuc/0/22:
[ 191.057684] #0: (rcu_callback){.+.+..}, at: [<ffffffff810ceb87>] rcu_cpu_kthread+0x2d7/0x840
[ 191.057691] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812e52f6>] percpu_ref_kill_rcu+0xa6/0x1c0
[ 191.057694] Preemption disabled at:[<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
[ 191.057695]
[ 191.057698] CPU: 0 PID: 22 Comm: rcuc/0 Tainted: GF W 3.14.8-rt5 #47
[ 191.057699] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
[ 191.057704] ffff88007c5d8000 ffff88007c5d7c98 ffffffff815696ed 0000000000000000
[ 191.057708] ffff88007c5d7cb8 ffffffff8108c3e5 ffff88007dc0e120 000000000000e120
[ 191.057711] ffff88007c5d7cd8 ffffffff8156f404 ffff88007dc0e120 ffff88007dc0e120
[ 191.057712] Call Trace:
[ 191.057716] [<ffffffff815696ed>] dump_stack+0x4e/0x9c
[ 191.057720] [<ffffffff8108c3e5>] __might_sleep+0x105/0x180
[ 191.057723] [<ffffffff8156f404>] rt_spin_lock+0x24/0x70
[ 191.057727] [<ffffffff81078897>] queue_work_on+0x67/0x1a0
[ 191.057731] [<ffffffff81216fc2>] free_ioctx_users+0x72/0x80
[ 191.057734] [<ffffffff812e5404>] percpu_ref_kill_rcu+0x1b4/0x1c0
[ 191.057737] [<ffffffff812e52f6>] ? percpu_ref_kill_rcu+0xa6/0x1c0
[ 191.057740] [<ffffffff812e5250>] ? percpu_ref_kill_and_confirm+0x70/0x70
[ 191.057742] [<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
[ 191.057745] [<ffffffff810ceb87>] ? rcu_cpu_kthread+0x2d7/0x840
[ 191.057749] [<ffffffff8108a76d>] smpboot_thread_fn+0x1dd/0x340
[ 191.057752] [<ffffffff8156c45a>] ? schedule+0x2a/0xa0
[ 191.057755] [<ffffffff8108a590>] ? smpboot_register_percpu_thread+0x100/0x100
[ 191.057758] [<ffffffff81081ca6>] kthread+0xd6/0xf0
[ 191.057761] [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
[ 191.057764] [<ffffffff815780bc>] ret_from_fork+0x7c/0xb0
[ 191.057767] [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70

2014-06-26 16:42:08

by Benjamin LaHaise

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

On Thu, Jun 26, 2014 at 09:37:14AM +0200, Mike Galbraith wrote:
> Hi Ben,
>
> On Wed, 2014-06-25 at 11:24 -0400, Benjamin LaHaise wrote:
>
> > I finally have some time to look at this patch in detail. I'd rather do the
> > below variant that does what Kent suggested. Mike, can you confirm that
> > this fixes the issue you reported? It's on top of my current aio-next tree
> > at git://git.kvack.org/~bcrl/aio-next.git . If that's okay, I'll queue it
> > up. Does this bug fix need to end up in -stable kernels as well or would it
> > end up in the -rt tree?
>
> It's an -rt specific problem, so presumably any fix would only go into
> -rt trees until it manages to get merged.
>
> I knew the intervening changes weren't likely to fix the might_sleep()
> splat, but ran the test anyway, with the CONFIG_PREEMPT_RT_BASE typo
> fixed up.
> schedule_work() leads to an rtmutex, so -rt still has to ship that out
> from under rcu_read_lock_sched().

So that doesn't fix it. I think you should fix schedule_work(), because
that should be callable from any context. Abusing RCU instead of using
schedule_work() is not the right way to fix this.

-ben

> marge:/usr/local/src/kernel/linux-3.14-rt # quilt applied|tail
> patches/mm-memcg-make-refill_stock-use-get_cpu_light.patch
> patches/printk-fix-lockdep-instrumentation-of-console_sem.patch
> patches/aio-block-io_destroy-until-all-context-requests-are-completed.patch
> patches/fs-aio-Remove-ctx-parameter-in-kiocb_cancel.patch
> patches/aio-report-error-from-io_destroy-when-threads-race-in-io_destroy.patch
> patches/aio-cleanup-flatten-kill_ioctx.patch
> patches/aio-fix-aio-request-leak-when-events-are-reaped-by-userspace.patch
> patches/aio-fix-kernel-memory-disclosure-in-io_getevents-introduced-in-v3.10.patch
> patches/aio-change-exit_aio-to-load-mm-ioctx_table-once-and-avoid-rcu_read_lock.patch
> patches/rt-aio-fix-rcu-garbage-collection-might_sleep-splat-ben.patch
>
> [ 191.057656] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:792
> [ 191.057672] in_atomic(): 1, irqs_disabled(): 0, pid: 22, name: rcuc/0
> [ 191.057674] 2 locks held by rcuc/0/22:
> [ 191.057684] #0: (rcu_callback){.+.+..}, at: [<ffffffff810ceb87>] rcu_cpu_kthread+0x2d7/0x840
> [ 191.057691] #1: (rcu_read_lock_sched){.+.+..}, at: [<ffffffff812e52f6>] percpu_ref_kill_rcu+0xa6/0x1c0
> [ 191.057694] Preemption disabled at:[<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
> [ 191.057695]
> [ 191.057698] CPU: 0 PID: 22 Comm: rcuc/0 Tainted: GF W 3.14.8-rt5 #47
> [ 191.057699] Hardware name: MEDIONPC MS-7502/MS-7502, BIOS 6.00 PG 12/26/2007
> [ 191.057704] ffff88007c5d8000 ffff88007c5d7c98 ffffffff815696ed 0000000000000000
> [ 191.057708] ffff88007c5d7cb8 ffffffff8108c3e5 ffff88007dc0e120 000000000000e120
> [ 191.057711] ffff88007c5d7cd8 ffffffff8156f404 ffff88007dc0e120 ffff88007dc0e120
> [ 191.057712] Call Trace:
> [ 191.057716] [<ffffffff815696ed>] dump_stack+0x4e/0x9c
> [ 191.057720] [<ffffffff8108c3e5>] __might_sleep+0x105/0x180
> [ 191.057723] [<ffffffff8156f404>] rt_spin_lock+0x24/0x70
> [ 191.057727] [<ffffffff81078897>] queue_work_on+0x67/0x1a0
> [ 191.057731] [<ffffffff81216fc2>] free_ioctx_users+0x72/0x80
> [ 191.057734] [<ffffffff812e5404>] percpu_ref_kill_rcu+0x1b4/0x1c0
> [ 191.057737] [<ffffffff812e52f6>] ? percpu_ref_kill_rcu+0xa6/0x1c0
> [ 191.057740] [<ffffffff812e5250>] ? percpu_ref_kill_and_confirm+0x70/0x70
> [ 191.057742] [<ffffffff810cebca>] rcu_cpu_kthread+0x31a/0x840
> [ 191.057745] [<ffffffff810ceb87>] ? rcu_cpu_kthread+0x2d7/0x840
> [ 191.057749] [<ffffffff8108a76d>] smpboot_thread_fn+0x1dd/0x340
> [ 191.057752] [<ffffffff8156c45a>] ? schedule+0x2a/0xa0
> [ 191.057755] [<ffffffff8108a590>] ? smpboot_register_percpu_thread+0x100/0x100
> [ 191.057758] [<ffffffff81081ca6>] kthread+0xd6/0xf0
> [ 191.057761] [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
> [ 191.057764] [<ffffffff815780bc>] ret_from_fork+0x7c/0xb0
> [ 191.057767] [<ffffffff81081bd0>] ? __kthread_parkme+0x70/0x70
>

--
"Thought is the essence of where you are now."

Subject: Re: [RFC PATCH V2] rt/aio: fix rcu garbage collection might_sleep() splat

* Benjamin LaHaise | 2014-06-25 11:24:45 [-0400]:

>I finally have some time to look at this patch in detail. I'd rather do the
>below variant that does what Kent suggested. Mike, can you confirm that
>this fixes the issue you reported? It's on top of my current aio-next tree
>at git://git.kvack.org/~bcrl/aio-next.git . If that's okay, I'll queue it
>up. Does this bug fix need to end up in -stable kernels as well or would it
>end up in the -rt tree?

This looks smaller compared to the RCU version. -RT recently gained a
simple workqueue (work-simple), so I could replace schedule_work() with
it. Looking at the code, I would have to make this change to both
free_ioctx_users() and free_ioctx_reqs(). So here is what I am about to
add to the next -RT.

Ach. It compiles, and I've never seen that splat, so any feedback is
welcome :)

diff --git a/fs/aio.c b/fs/aio.c
index 14b93159ef83..551fcfe3fd58 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -40,6 +40,7 @@
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
#include <linux/mount.h>
+#include <linux/work-simple.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -110,7 +111,7 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;

- struct work_struct free_work;
+ struct swork_event free_work;

/*
* signals when all in-flight requests are done
@@ -226,6 +227,7 @@ static int __init aio_setup(void)
.mount = aio_mount,
.kill_sb = kill_anon_super,
};
+ BUG_ON(swork_get());
aio_mnt = kern_mount(&aio_fs);
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
@@ -505,9 +507,9 @@ static int kiocb_cancel(struct kiocb *kiocb)
return cancel(kiocb);
}

-static void free_ioctx(struct work_struct *work)
+static void free_ioctx(struct swork_event *sev)
{
- struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+ struct kioctx *ctx = container_of(sev, struct kioctx, free_work);

pr_debug("freeing %p\n", ctx);

@@ -526,8 +528,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
if (ctx->requests_done)
complete(ctx->requests_done);

- INIT_WORK(&ctx->free_work, free_ioctx);
- schedule_work(&ctx->free_work);
+ INIT_SWORK(&ctx->free_work, free_ioctx);
+ swork_queue(&ctx->free_work);
}

/*
@@ -535,9 +537,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
* and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
* now it's safe to cancel any that need to be.
*/
-static void free_ioctx_users(struct percpu_ref *ref)
+static void free_ioctx_users_work(struct swork_event *sev)
{
- struct kioctx *ctx = container_of(ref, struct kioctx, users);
+ struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
struct kiocb *req;

spin_lock_irq(&ctx->ctx_lock);
@@ -556,6 +558,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
percpu_ref_put(&ctx->reqs);
}

+static void free_ioctx_users(struct percpu_ref *ref)
+{
+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+ INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
+ swork_queue(&ctx->free_work);
+}
+
static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
{
unsigned i, new_nr;
--
2.1.4