Subject: [ANNOUNCE] v5.14-rc4-rt4

Dear RT folks!

I'm pleased to announce the v5.14-rc4-rt4 patch set.

Changes since v5.14-rc4-rt3:

  - Updating the locking bits:

    - Use proper task state in ww_mutex_lock_interruptible(), reported
      by Mike Galbraith.

    - Fix the wake_q handling for a task which blocks simultaneously as
      a regular task and additionally as a sleeper on a sleeping lock.
      A regression introduced in the v5.14 cycle, reported by Mike
      Galbraith, patched by Thomas Gleixner. (A short usage sketch of
      the reworked wakeup path follows this list.)

    - Address the futex related review comments by Peter Zijlstra.
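
A minimal usage sketch of the reworked wakeup path mentioned above (the
names match the rtmutex bits in the diff below; the call site itself is
illustrative, not taken from the tree):

	DEFINE_RT_WAKE_Q(wqh);

	raw_spin_lock_irqsave(&lock->wait_lock, flags);
	/* queues a regular wakeup, or records the single rtlock waiter */
	rt_mutex_wake_q_add(&wqh, waiter);
	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
	/* wakes the rtlock waiter with TASK_RTLOCK_WAIT, then the others */
	rt_mutex_wake_up_q(&wqh);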

Known issues
- netconsole triggers WARN.

- The "Memory controller" (CONFIG_MEMCG) has been disabled.

- An RCU and ARM64 warning has been fixed by Valentin Schneider. It is
  still not clear if the RCU related change is correct.

The delta patch against v5.14-rc4-rt3 is appended below and can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/incr/patch-5.14-rc4-rt3-rt4.patch.xz

You can get this release via the git tree at:

git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.14-rc4-rt4

The RT patch against v5.14-rc4 can be found here:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patch-5.14-rc4-rt4.patch.xz

The split quilt queue is available at:

https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patches-5.14-rc4-rt4.tar.xz

Sebastian

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index cdd98d57c0e6a..8e57a6660aad1 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -61,11 +61,5 @@ static inline bool wake_q_empty(struct wake_q_head *head)

extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
-extern void __wake_up_q(struct wake_q_head *head, unsigned int state);
-
-static inline void wake_up_q(struct wake_q_head *head)
-{
- __wake_up_q(head, TASK_NORMAL);
-}
-
+extern void wake_up_q(struct wake_q_head *head);
#endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/kernel/futex.c b/kernel/futex.c
index 7fc061ee5f2d4..c4e28bd37abcb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1853,15 +1853,16 @@ enum {
static inline bool futex_requeue_pi_prepare(struct futex_q *q,
struct futex_pi_state *pi_state)
{
- int cur, res, new;
+ int old, new;

/*
* Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
* already set Q_REQUEUE_PI_IGNORE to signal that requeue should
* ignore the waiter.
*/
- for (cur = atomic_read(&q->requeue_state);; cur = res) {
- if (cur == Q_REQUEUE_PI_IGNORE)
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
return false;

/*
@@ -1872,74 +1873,68 @@ static inline bool futex_requeue_pi_prepare(struct futex_q *q,
* trylock, but that would just add more conditionals
* all over the place for a dubious value.
*/
- if (cur != Q_REQUEUE_PI_NONE)
+ if (old != Q_REQUEUE_PI_NONE)
break;

new = Q_REQUEUE_PI_IN_PROGRESS;
- res = atomic_cmpxchg(&q->requeue_state, cur, new);
- if (likely(cur == res))
- break;
- }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
q->pi_state = pi_state;
return true;
}

static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
{
- int cur, res, new;
+ int old, new;

- for (cur = atomic_read(&q->requeue_state);; cur = res) {
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
if (locked >= 0) {
/* Requeue succeeded. Set DONE or LOCKED */
new = Q_REQUEUE_PI_DONE + locked;
- } else if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
/* Deadlock, no early wakeup interleave */
new = Q_REQUEUE_PI_NONE;
} else {
/* Deadlock, early wakeup interleave. */
new = Q_REQUEUE_PI_IGNORE;
}
-
- res = atomic_cmpxchg(&q->requeue_state, cur, new);
- if (likely(cur == res))
- break;
- }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));

#ifdef CONFIG_PREEMPT_RT
/* If the waiter interleaved with the requeue let it know */
- if (unlikely(cur == Q_REQUEUE_PI_WAIT))
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
rcuwait_wake_up(&q->requeue_wait);
#endif
}

static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
{
- int cur, new, res;
+ int old, new;

- for (cur = atomic_read(&q->requeue_state);; cur = res) {
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
/* Is requeue done already? */
- if (cur >= Q_REQUEUE_PI_DONE)
- break;
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;

/*
* If not done, then tell the requeue code to either ignore
* the waiter or to wake it up once the requeue is done.
*/
- new = !cur ? Q_REQUEUE_PI_IGNORE : Q_REQUEUE_PI_WAIT;
- res = atomic_cmpxchg(&q->requeue_state, cur, new);
- if (likely(cur == res))
- break;
- }
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));

/* If the requeue was in progress, wait for it to complete */
- if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
#ifdef CONFIG_PREEMPT_RT
rcuwait_wait_event(&q->requeue_wait,
atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
TASK_UNINTERRUPTIBLE);
#else
- while (atomic_read(&q->requeue_state) == Q_REQUEUE_PI_WAIT)
- cpu_relax();
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
#endif
}

@@ -2039,7 +2034,10 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
if (!top_waiter)
return 0;

- /* Ensure that this is a waiter sitting in futex_wait_requeue_pi() */
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
if (!top_waiter->rt_waiter || top_waiter->pi_state)
ret = -EINVAL;

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e347bbc12641d..eadaface1fd29 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -429,7 +429,10 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
struct rt_mutex_waiter *w)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
- wake_q_add(&wqh->rt_head, w->task);
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(w->task);
+ wqh->rtlock_task = w->task;
} else {
wake_q_add(&wqh->head, w->task);
}
@@ -437,8 +440,11 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,

static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
{
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !wake_q_empty(&wqh->rt_head))
- __wake_up_q(&wqh->rt_head, TASK_RTLOCK_WAIT);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }

if (!wake_q_empty(&wqh->head))
wake_up_q(&wqh->head);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 9697b04c40e68..61256de5bd66a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -44,18 +44,18 @@ struct rt_mutex_waiter {
/**
* rt_wake_q_head - Wrapper around regular wake_q_head to support
* "sleeping" spinlocks on RT
- * @head: The regular wake_q_head for sleeping lock variants
- * @rt_head: The wake_q_head for RT lock (spin/rwlock) variants
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
*/
struct rt_wake_q_head {
struct wake_q_head head;
- struct wake_q_head rt_head;
+ struct task_struct *rtlock_task;
};

#define DEFINE_RT_WAKE_Q(name) \
struct rt_wake_q_head name = { \
.head = WAKE_Q_HEAD_INITIALIZER(name.head), \
- .rt_head = WAKE_Q_HEAD_INITIALIZER(name.rt_head),\
+ .rtlock_task = NULL, \
}

/*
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index 9b7e3f413a828..519092ee5e88e 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(ww_mutex_lock);
int __sched
ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
}
EXPORT_SYMBOL(ww_mutex_lock_interruptible);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 87e0ad6665260..a742e9dc9a7cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -920,7 +920,7 @@ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
put_task_struct(task);
}

-void __wake_up_q(struct wake_q_head *head, unsigned int state)
+void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;

@@ -936,7 +936,7 @@ void __wake_up_q(struct wake_q_head *head, unsigned int state)
* wake_up_process() executes a full barrier, which pairs with
* the queueing in wake_q_add() so as not to miss wakeups.
*/
- wake_up_state(task, state);
+ wake_up_process(task);
put_task_struct(task);
}
}
diff --git a/localversion-rt b/localversion-rt
index 1445cd65885cd..ad3da1bcab7e8 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt3
+-rt4


2021-08-04 09:12:20, by Daniel Wagner

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

Hi Sebastian,

On Mon, Aug 02, 2021 at 06:27:50PM +0200, Sebastian Andrzej Siewior wrote:
> Dear RT folks!
>
> I'm pleased to announce the v5.14-rc4-rt4 patch set.

stress-ng is able to trigger a BUG on my old x86_64 box very
easily. Unfortunately, there is not much in the logs though.

+ ./cyclictest.sh -D 15m -p 98 -i 1000 -t 2 -a 0-1 -h 100 -w 'stress-ng --class io --all 1'
# /dev/cpu_dma_latency set to 0us
[ 21.059000] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[ 21.059008] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 321, name: iou-wrk-314
[ 21.059010] Preemption disabled at:
[ 21.059011] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[ 23.076059] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[ 23.076064] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 324, name: iou-wrk-314
[ 23.076066] Preemption disabled at:
[ 23.076067] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[ 37.416490] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[ 37.416495] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 369, name: iou-wrk-314
[ 37.416498] Preemption disabled at:
[ 37.416498] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[ 38.416858] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[ 38.416863] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 411, name: iou-wrk-314
[ 38.416864] Preemption disabled at:
[ 38.416865] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[ 43.486360] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[ 43.486366] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 414, name: iou-wrk-314
[ 43.486368] Preemption disabled at:
[ 43.486369] [<ffffffffa5223ce7>] schedule+0xa7/0xf0

Thanks,
Daniel

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> Odd. Do you have a config for that, please?

No need.
| [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
| [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [ 90.202561] Call Trace:
| [ 90.202577] dump_stack_lvl+0x34/0x44
| [ 90.202584] ___might_sleep.cold+0x87/0x94
| [ 90.202588] rt_spin_lock+0x19/0x70
| [ 90.202593] ___slab_alloc+0xcb/0x7d0
| [ 90.202598] ? newidle_balance.constprop.0+0xf5/0x3b0
| [ 90.202603] ? dequeue_entity+0xc3/0x290
| [ 90.202605] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202610] ? pick_next_task_fair+0xb9/0x330
| [ 90.202612] ? __schedule+0x670/0x1410
| [ 90.202615] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
| [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202625] io_wq_worker_sleeping+0x37/0x50
| [ 90.202628] schedule+0x30/0xd0
| [ 90.202630] schedule_timeout+0x8f/0x1a0
| [ 90.202634] ? __bpf_trace_tick_stop+0x10/0x10
| [ 90.202637] io_wqe_worker+0xfd/0x320
| [ 90.202641] ? finish_task_switch.isra.0+0xd3/0x290
| [ 90.202644] ? io_worker_handle_work+0x670/0x670
| [ 90.202646] ? io_worker_handle_work+0x670/0x670
| [ 90.202649] ret_from_fork+0x22/0x30

le look.

> > Thanks,
> > Daniel

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> > Odd. Do you have a config for that, please?
>
> No need.
> | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
> | [ 90.202561] Call Trace:

> | [ 90.202588] rt_spin_lock+0x19/0x70
> | [ 90.202593] ___slab_alloc+0xcb/0x7d0

> | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
> | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
> | [ 90.202625] io_wq_worker_sleeping+0x37/0x50
> | [ 90.202628] schedule+0x30/0xd0
>
> le look.

So this is due to commit
685fe7feedb96 ("io-wq: eliminate the need for a manager thread")

introduced in the v5.13-rc1 merge window. The call chain is
  schedule()
    sched_submit_work()
      preempt_disable();
      io_wq_worker_sleeping()
        raw_spin_lock_irq(&worker->wqe->lock);
        io_wqe_dec_running(worker);
          io_queue_worker_create()
            kmalloc(sizeof(*cwd), GFP_ATOMIC);

The lock wqe::lock has been turned into a raw_spinlock_t in commit
95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")

after a careful analysis of the code at that time. Commit 685fe7feedb96
breaks that analysis. Is this really needed?
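
Condensed (a sketch, not code from the tree): even GFP_ATOMIC does not
help on RT, because the slab allocator internally takes a spinlock_t,
which is a sleeping lock there:

	raw_spin_lock_irq(&worker->wqe->lock);	/* truly atomic, also on RT */
	kmalloc(sizeof(*cwd), GFP_ATOMIC);	/* -> rt_spin_lock() -> might sleep: BUG */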

> > > Thanks,
> > > Daniel

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 10:24:18 [+0200], Daniel Wagner wrote:
> Hi Sebastian,
>
> On Mon, Aug 02, 2021 at 06:27:50PM +0200, Sebastian Andrzej Siewior wrote:
> > Dear RT folks!
> >
> > I'm pleased to announce the v5.14-rc4-rt4 patch set.
>
> stress-ng is able to trigger a BUG on my old x86_64 box very
> easily. Unfortunately, there is not much in the logs though.

Odd. Do you have a config for that, please?

> Thanks,
> Daniel

Sebastian

2021-08-04 13:45:36, by Peter Zijlstra

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
> > On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> > > Odd. Do you have a config for that, please?
> >
> > No need.
> > | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> > | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> > | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
> > | [ 90.202561] Call Trace:
> …
> > | [ 90.202588] rt_spin_lock+0x19/0x70
> > | [ 90.202593] ___slab_alloc+0xcb/0x7d0
> …
> > | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
> > | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
> > | [ 90.202625] io_wq_worker_sleeping+0x37/0x50
> > | [ 90.202628] schedule+0x30/0xd0
> >
> > le look.
>
> So this is due to commit
> 685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>
> introduced in the v5.13-rc1 merge window. The call chain is
> schedule()
> sched_submit_work()
> preempt_disable();
> io_wq_worker_sleeping()
> raw_spin_lock_irq(&worker->wqe->lock);
> io_wqe_dec_running(worker);
> io_queue_worker_create()
> kmalloc(sizeof(*cwd), GFP_ATOMIC);
>
> The lock wqe::lock has been turned into a raw_spinlock_t in commit
> 95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>
> after a careful analysis of the code at that time. Commit 685fe7feedb96
> breaks that analysis. Is this really needed?

Urgh, doing allocs from schedule seems really yuck. Can we please not do
this?

2021-08-04 13:48:36, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 7:17 AM, Peter Zijlstra wrote:
> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>> Odd. Do you have a config for that, please?
>>>
>>> No need.
>>> | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>> | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>> | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
>>> | [ 90.202561] Call Trace:
>> …
>>> | [ 90.202588] rt_spin_lock+0x19/0x70
>>> | [ 90.202593] ___slab_alloc+0xcb/0x7d0
>> …
>>> | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
>>> | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
>>> | [ 90.202625] io_wq_worker_sleeping+0x37/0x50
>>> | [ 90.202628] schedule+0x30/0xd0
>>>
>>> le look.
>>
>> So this is due to commit
>> 685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>
>> introduced in the v5.13-rc1 merge window. The call chain is
>> schedule()
>> sched_submit_work()
>> preempt_disable();
>> io_wq_worker_sleeping()
>> raw_spin_lock_irq(&worker->wqe->lock);
>> io_wqe_dec_running(worker);
>> io_queue_worker_create()
>> kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>
>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>> 95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>
>> after a careful analysis of the code at that time. Commit 685fe7feedb96
>> breaks that analysis. Is this really needed?
>
> Urgh, doing allocs from schedule seems really yuck. Can we please not do
> this?

Agree, I have an idea of how to get rid of it. Let me experiment a bit...

--
Jens Axboe

2021-08-04 14:48:23, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 7:32 AM, Jens Axboe wrote:
> On 8/4/21 7:17 AM, Peter Zijlstra wrote:
>> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>>> Odd. Do you have a config for that, please?
>>>>
>>>> No need.
>>>> | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>>> | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>>> | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
>>>> | [ 90.202561] Call Trace:
>>> …
>>>> | [ 90.202588] rt_spin_lock+0x19/0x70
>>>> | [ 90.202593] ___slab_alloc+0xcb/0x7d0
>>> …
>>>> | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
>>>> | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
>>>> | [ 90.202625] io_wq_worker_sleeping+0x37/0x50
>>>> | [ 90.202628] schedule+0x30/0xd0
>>>>
>>>> le look.
>>>
>>> So this is due to commit
>>> 685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>>
>>> introduced in the v5.13-rc1 merge window. The call chain is
>>> schedule()
>>> sched_submit_work()
>>> preempt_disable();
>>> io_wq_worker_sleeping()
>>> raw_spin_lock_irq(&worker->wqe->lock);
>>> io_wqe_dec_running(worker);
>>> io_queue_worker_create()
>>> kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>>
>>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>>> 95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>>
>>> after a careful analysis of the code at that time. Commit 685fe7feedb96
>>> breaks that analysis. Is this really needed?
>>
>> Urgh, doing allocs from schedule seems really yuck. Can we please not do
>> this?
>
> Agree, I have an idea of how to get rid of it. Let me experiment a bit...

Something like this should do it - the only thing we need the allocation for
is short lived, queueing a task_work item to create a new worker. We can
manage this on a per-existing worker basis, and just have the tw/index
stored in the worker itself. That avoids an allocation off schedule ->
going to sleep.

Totally untested, but I think the principle is sound. I'll run it through
some testing.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..97eaaf25a429 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {

struct completion ref_done;

+ unsigned long create_state;
+ struct callback_head create_work;
+ int create_index;
+
struct rcu_head rcu;
};

@@ -261,42 +265,44 @@ static void io_wqe_inc_running(struct io_worker *worker)
atomic_inc(&acct->nr_running);
}

-struct create_worker_data {
- struct callback_head work;
- struct io_wqe *wqe;
- int index;
-};
-
static void create_worker_cb(struct callback_head *cb)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
struct io_wq *wq;

- cwd = container_of(cb, struct create_worker_data, work);
- wq = cwd->wqe->wq;
- create_io_worker(wq, cwd->wqe, cwd->index);
- kfree(cwd);
+ worker = container_of(cb, struct io_worker, create_work);
+ wq = worker->wqe->wq;
+ create_io_worker(wq, worker->wqe, worker->create_index);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wqe_acct *acct)
{
- struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq;

/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
+ /*
+ * create_state manages ownership of create_work/index. We should
+ * only need one entry per worker, as the worker going to sleep
+ * will trigger the condition, and waking will clear it once it
+ * runs the task_work.
+ */
+ if (test_bit(0, &worker->create_state) ||
+ test_and_set_bit_lock(0, &worker->create_state))
+ goto fail;

- cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
- if (cwd) {
- init_task_work(&cwd->work, create_worker_cb);
- cwd->wqe = wqe;
- cwd->index = acct->index;
- if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
- return;
+ io_worker_get(worker);
+ init_task_work(&worker->create_work, create_worker_cb);
+ worker->create_index = acct->index;
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ return;

- kfree(cwd);
- }
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
@@ -314,7 +320,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, acct);
+ io_queue_worker_create(wqe, worker, acct);
}
}

@@ -973,12 +979,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

static bool io_task_work_match(struct callback_head *cb, void *data)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;

if (cb->func != create_worker_cb)
return false;
- cwd = container_of(cb, struct create_worker_data, work);
- return cwd->wqe->wq == data;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker->wqe->wq == data;
}

void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1001,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
return;

while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct create_worker_data *cwd;
+ struct io_worker *worker;

- cwd = container_of(cb, struct create_worker_data, work);
- atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ worker = container_of(cb, struct io_worker, create_work);
+ atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq);
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

rcu_read_lock();

--
Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 08:23:55 [-0600], Jens Axboe wrote:
> Totally untested, but I think the principle is sound. I'll run it through
> some testing.

This is needed:

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 1192ee2abd982..77ec6896edaa5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -428,9 +428,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
}

if (stall_hash != -1U) {
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
}

return NULL;


otherwise the spinlock_t lock in io_wait_on_hash() is acquired with
interrupts disabled, which is a no-no on -RT.
With that it all looks good as far as I can tell.
Thank you.

Sebastian

2021-08-04 15:43:30, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 9:33 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 08:23:55 [-0600], Jens Axboe wrote:
>> Totally untested, but I think the principle is sound. I'll run it through
>> some testing.
>
> This is needed:
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 1192ee2abd982..77ec6896edaa5 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -428,9 +428,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> }
>
> if (stall_hash != -1U) {
> - raw_spin_unlock(&wqe->lock);
> + raw_spin_unlock_irq(&wqe->lock);
> io_wait_on_hash(wqe, stall_hash);
> - raw_spin_lock(&wqe->lock);
> + raw_spin_lock_irq(&wqe->lock);
> }
>
> return NULL;
>
>
> otherwise the spinlock_t lock in io_wait_on_hash() is acquired with
> disabled interrupts which is a no-no on -RT.
> With that it all looks good as far as I can tell.
> Thank you.

I'm confused, the waitqueue locks are always IRQ disabling.

--
Jens Axboe

2021-08-04 15:55:19, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 9:47 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 09:39:30 [-0600], Jens Axboe wrote:
>> I'm confused, the waitqueue locks are always IRQ disabling.
>
> spin_lock_irq() does not disable interrupts on -RT. The patch above
> produces:
>
> | BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2020, name: iou-wrk-2018
> | 1 lock held by iou-wrk-2018/2020:
> | #0: ffff888111a47de8 (&hash->wait){+.+.}-{0:0}, at: io_worker_handle_work+0x443/0x630
> | irq event stamp: 10
> | hardirqs last enabled at (9): [<ffffffff81c47818>] _raw_spin_unlock_irqrestore+0x28/0x70
> | hardirqs last disabled at (10): [<ffffffff81c4769e>] _raw_spin_lock_irq+0x3e/0x40
> | softirqs last enabled at (0): [<ffffffff81077238>] copy_process+0x8f8/0x2020
> | softirqs last disabled at (0): [<0000000000000000>] 0x0
> | CPU: 5 PID: 2020 Comm: iou-wrk-2018 Tainted: G W 5.14.0-rc4-rt4+ #97
> | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
> | Call Trace:
> | dump_stack_lvl+0x45/0x59
> | ___might_sleep.cold+0xa6/0xb6
> | rt_spin_lock+0x35/0xc0
> | ? io_worker_handle_work+0x443/0x630
> | io_worker_handle_work+0x443/0x630
> | io_wqe_worker+0xb4/0x340
> | ? lockdep_hardirqs_on_prepare+0xd4/0x170
> | ? _raw_spin_unlock_irqrestore+0x28/0x70
> | ? _raw_spin_unlock_irqrestore+0x28/0x70
> | ? io_worker_handle_work+0x630/0x630
> | ? rt_mutex_slowunlock+0x2ba/0x310
> | ? io_worker_handle_work+0x630/0x630
> | ret_from_fork+0x22/0x30
>
>
> But indeed, you are right, my snippet breaks non-RT. So maybe this instead:
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 57d3cdddcdb3e..0b931ac3c83e6 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -384,7 +384,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
> {
> struct io_wq *wq = wqe->wq;
>
> - spin_lock(&wq->hash->wait.lock);
> + spin_lock_irq(&wq->hash->wait.lock);
> if (list_empty(&wqe->wait.entry)) {
> __add_wait_queue(&wq->hash->wait, &wqe->wait);
> if (!test_bit(hash, &wq->hash->map)) {
> @@ -392,7 +392,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
> list_del_init(&wqe->wait.entry);
> }
> }
> - spin_unlock(&wq->hash->wait.lock);
> + spin_unlock_irq(&wq->hash->wait.lock);
> }
>
> static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> @@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> }
>
> if (stall_hash != -1U) {
> - raw_spin_unlock(&wqe->lock);
> + raw_spin_unlock_irq(&wqe->lock);
> io_wait_on_hash(wqe, stall_hash);
> - raw_spin_lock(&wqe->lock);
> + raw_spin_lock_irq(&wqe->lock);
> }
>
> return NULL;
>
> (this is on top of the patch you sent earlier and which Daniel Cc'd me on,
> after I checked that the problem/warning still exists).

That'd work on non-RT as well, but it makes things worse there with the
irq enable/disable dance. While that's not the end of the world, it would
be nice to have a solution that doesn't sacrifice anything, yet doesn't
make RT unhappy.

--
Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 09:49:48 [-0600], Jens Axboe wrote:
>
> That'd work on non-RT as well, but it makes things worse there with the
> irq enable/disable dance. While that's not the end of the world, it would
> be nice to have a solution that doesn't sacrifice anything, yet doesn't
> make RT unhappy.

There were plans to make local_irq_disable() mostly a nop (similar to
what ppc64 does). But I have no idea what happened. I hope that work
gets picked up.

Sebastian

2021-08-04 16:08:57, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 9:57 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 09:49:48 [-0600], Jens Axboe wrote:
>>
>> That'd work on non-RT as well, but it makes it worse on non-RT as well with
>> the irq enable/disable dance. While that's not the end of the world, would
>> be nice to have a solution that doesn't sacrifice anything, yet doesn't
>> make RT unhappy.
>
> There were plans to make local_irq_disable() mostly a nop (similar to
> what ppc64 does). But I have no idea what happened. I hope that work
> gets picked up.

So what do you propose in the interim? As far as io_uring is concerned,
it's not a _huge_ deal to do the IRQ dance, but it does bother me that
we're making things slightly worse for the mainline kernel just to make
the out-of-tree patches happy.

--
Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 12:17:04 [-0400], Steven Rostedt wrote:
> Perhaps in this situation, we could open code it to:
>
> 	if (stall_hash != -1U) {
> 		raw_spin_unlock(&wqe->lock);
>
> 		/* On RT the spin_lock_irq() does not disable interrupts */
> 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> 			local_irq_enable();

no preemption happens here with NEED_RESCHED set.

> 		io_wait_on_hash(wqe, stall_hash);
>
> 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> 			local_irq_disable();
> 		raw_spin_lock(&wqe->lock);
> 	}
>
> Note, I haven't looked at the rest of the code to know the ripple
> effect of this, but I'm just suggesting the idea.
>

Sebastian

2021-08-04 16:58:19, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 10:47 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 10:22:59 [-0600], Jens Axboe wrote:
>>
>> In that regard, I do still consider those patches out-of-tree, which
>> they are. And while I'm more sympathetic to them compared to other
>> out-of-tree code as there's a long term plan to get it all in, it's
>> still out-of-tree. Best solution here is probably to just carry that
>> particular change in the RT patchset for now.
>
> So this morning I learned that there is a memory allocation in
> an IRQ-off section and now, a patch later, it is almost gone. So that
> makes me actually happy :)

1 out of 2 is better than 0 ;-)

> The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
> in Documentation/locking/locktypes.rst.
> That said I have no problem by carrying that patch in the RT-patchset
> and revisit it later.

Right, I suspect that was added as a pre RT patch dump at some point.
It's a newer thing. Is it actually possible to set PREEMPT_RT in the
mainline kernel? Looks like it depends on ARCH_SUPPORTS_RT and nobody
sets that.

So I agree that just carrying your solution in the RT patchset is fine
for now, we can revisit later.

--
Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 12:47:55 [-0400], Steven Rostedt wrote:
> On Wed, 4 Aug 2021 18:31:19 +0200
> Sebastian Andrzej Siewior <[email protected]> wrote:
>
> > CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
> > to do the magic. Just because the code does not perform wake_up() now
> > does not mean it will not do so in the future. Also it is here as an example
> > which might be copied somewhere else.
>
> Does this mean all local_irq_disable/enable() is audited? What do you do for:
>
> local_irq_disable();
> [..]
> wakeup_process(x); /* on local CPU */
> [..]
> local_irq_enable();

I hunted and fixed a few of those. I still have a few
preempt_check_resched_rt() users which I want to fix differently from
what is in RT.

> And if local_irq_disable() is not used anymore, or seldom, what harm
> would it be to add a preemption check to that caller? And change
> local_irq_enable() that is used internally by other atomic functions to be
> called __local_irq_enable()?
>
> Not to mention that we could just open code that too:
>
> if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> local_irq_enable();
> preempt_check_resched();
> }
>
> And make it ugly enough that nobody will want to copy it :-)

I remember that the amount of enthusiasm was quite low when it was
suggested that local_irq_enable() additionally gets the preempt-check.
Maybe that was due to the people involved :)
But we managed to work around it for most callers. Therefore I
wouldn't suggest local_irq_disable_rt(). We had it in -RT, we had a
bunch of users and all of them were fixed in a different way.

Same goes btw. for preempt_disable_rt() which has been reduced to vmstat
and previously had more users :)

> -- Steve

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 10:57:11 [-0600], Jens Axboe wrote:
> > The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
> > in Documentation/locking/locktypes.rst.
> > That said I have no problem by carrying that patch in the RT-patchset
> > and revisit it later.
>
> Right, I suspect that was added as a pre RT patch dump at some point.
> It's a newer thing. Is it actually possible to set PREEMPT_RT in the
> mainline kernel? Looks like it depends on ARCH_SUPPORTS_RT and nobody
> sets that.

Yes. It is upstream now. The documentation and CONFIG_PREEMPT_RT symbol.
The Kconfig symbol already has users outside of the -RT patch.

You can't enable it yet since we require a few things which are on their
way.

Having the symbol and the documentation helped to convince some people
:)

> So I agree that just carrying your solution in the RT patchset is fine
> for now, we can revisit later.

Okay.

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 09:39:30 [-0600], Jens Axboe wrote:
> I'm confused, the waitqueue locks are always IRQ disabling.

spin_lock_irq() does not disable interrupts on -RT. The patch above
produces:

| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2020, name: iou-wrk-2018
| 1 lock held by iou-wrk-2018/2020:
| #0: ffff888111a47de8 (&hash->wait){+.+.}-{0:0}, at: io_worker_handle_work+0x443/0x630
| irq event stamp: 10
| hardirqs last enabled at (9): [<ffffffff81c47818>] _raw_spin_unlock_irqrestore+0x28/0x70
| hardirqs last disabled at (10): [<ffffffff81c4769e>] _raw_spin_lock_irq+0x3e/0x40
| softirqs last enabled at (0): [<ffffffff81077238>] copy_process+0x8f8/0x2020
| softirqs last disabled at (0): [<0000000000000000>] 0x0
| CPU: 5 PID: 2020 Comm: iou-wrk-2018 Tainted: G W 5.14.0-rc4-rt4+ #97
| Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| Call Trace:
| dump_stack_lvl+0x45/0x59
| ___might_sleep.cold+0xa6/0xb6
| rt_spin_lock+0x35/0xc0
| ? io_worker_handle_work+0x443/0x630
| io_worker_handle_work+0x443/0x630
| io_wqe_worker+0xb4/0x340
| ? lockdep_hardirqs_on_prepare+0xd4/0x170
| ? _raw_spin_unlock_irqrestore+0x28/0x70
| ? _raw_spin_unlock_irqrestore+0x28/0x70
| ? io_worker_handle_work+0x630/0x630
| ? rt_mutex_slowunlock+0x2ba/0x310
| ? io_worker_handle_work+0x630/0x630
| ret_from_fork+0x22/0x30


But indeed, you are right, my snippet breaks non-RT. So maybe this instead:

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 57d3cdddcdb3e..0b931ac3c83e6 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -384,7 +384,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
{
struct io_wq *wq = wqe->wq;

- spin_lock(&wq->hash->wait.lock);
+ spin_lock_irq(&wq->hash->wait.lock);
if (list_empty(&wqe->wait.entry)) {
__add_wait_queue(&wq->hash->wait, &wqe->wait);
if (!test_bit(hash, &wq->hash->map)) {
@@ -392,7 +392,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
list_del_init(&wqe->wait.entry);
}
}
- spin_unlock(&wq->hash->wait.lock);
+ spin_unlock_irq(&wq->hash->wait.lock);
}

static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
@@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
}

if (stall_hash != -1U) {
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock_irq(&wqe->lock);
io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock_irq(&wqe->lock);
}

return NULL;

(this is on top of the patch you sent earlier and which Daniel Cc'd me on,
after I checked that the problem/warning still exists).

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 10:05:39 [-0600], Jens Axboe wrote:
> So what do you propose in the interim? As far as io_uring is concerned,
> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
> we're making things slightly worse for the mainline kernel just to make
> the out-of-tree patches happy.

I'm sorry but I propose nothing other than the dance. I don't think
PeterZ/tglx will appreciate the ifdefery to avoid the IRQ-on/off here.
I have no other ideas here.

Sebastian

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 12:25:41 [-0400], Steven Rostedt wrote:
> On Wed, 4 Aug 2021 18:22:31 +0200
> Sebastian Andrzej Siewior <[email protected]> wrote:
>
> > no preemption happens here with NEED_RESCHED set.
>
> But if interrupts were disabled, how would NEED_RESCHED be set? As soon
> as you enable interrupts, the interrupt that sets NEED_RESCHED would
> trigger the preemption.

CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
to do the magic. Just because the code does not perform wake_up() now
does not mean it will not do so in the future. Also it is here as an example
which might be copied somewhere else.
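
A minimal sketch of that case (illustrative only, not from any patch in
this thread):

	preempt_disable();
	local_irq_disable();
	wake_up_process(task);	/* task is on this CPU: only NEED_RESCHED is set */
	local_irq_enable();	/* no preemption point here, wakeup stays pending */
	preempt_enable();	/* the deferred reschedule happens here */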

> -- Steve

Sebastian

2021-08-04 19:29:11, by Steven Rostedt

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On Wed, 4 Aug 2021 09:49:48 -0600
Jens Axboe <[email protected]> wrote:

> > @@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> > }
> >
> > if (stall_hash != -1U) {
> > - raw_spin_unlock(&wqe->lock);
> > + raw_spin_unlock_irq(&wqe->lock);
> > io_wait_on_hash(wqe, stall_hash);
> > - raw_spin_lock(&wqe->lock);
> > + raw_spin_lock_irq(&wqe->lock);
> > }
> >
> > return NULL;
> >
> > (this is on-top of the patch you sent earlier and Daniel Cc: me on after
> > I checked that the problem/warning still exists).
>
> That'd work on non-RT as well, but it makes things worse there with the
> irq enable/disable dance. While that's not the end of the world, it would
> be nice to have a solution that doesn't sacrifice anything, yet doesn't
> make RT unhappy.

We used to have something like:

	local_irq_disable_rt()

that would only disable irqs when PREEMPT_RT was configured, but this
was considered "ugly" and removed to try to only use spin_lock_irq()
and raw_spin_lock_irq(). But for this situation, it looks like it would
do exactly what you wanted. Not sacrifice anything yet keep RT happy.

Not sure that's a solution still. :-/

Perhaps in this situation, we could open code it to:

	if (stall_hash != -1U) {
		raw_spin_unlock(&wqe->lock);

		/* On RT the spin_lock_irq() does not disable interrupts */
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_enable();

		io_wait_on_hash(wqe, stall_hash);

		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_disable();
		raw_spin_lock(&wqe->lock);
	}

Note, I haven't looked at the rest of the code to know the ripple
effect of this, but I'm just suggesting the idea.

-- Steve

2021-08-04 19:38:13, by Steven Rostedt

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On Wed, 4 Aug 2021 10:05:39 -0600
Jens Axboe <[email protected]> wrote:

> So what do you propose in the interim? As far as io_uring is concerned,
> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
> we're making things slightly worse for the mainline kernel just to make
> the out-of-tree patches happy.

Note that the purpose of these patches is to be able to bring those
out-of-tree patches into the kernel such that they are no longer
out-of-tree.

-- Steve

2021-08-04 19:38:34, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 10:20 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 10:05:39 [-0600], Jens Axboe wrote:
>> So what do you propose in the interim? As far as io_uring is concerned,
>> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
>> we're making things slightly worse for the mainline kernel just to make
>> the out-of-tree patches happy.
>
> I'm sorry but I propose nothing other than the dance. I don't think
> PeterZ/ tglx will appreciate the ifdefery to avoid the IRQ-on/off here.
> I have no other ideas here.

Alright, I'll see if I can think of something. Don't like doing ifdefs
either, or enable checks - but at least it'd only hurt readability
slightly at that point.

--
Jens Axboe

2021-08-04 19:49:32, by Jens Axboe

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 8/4/21 10:20 AM, Steven Rostedt wrote:
> On Wed, 4 Aug 2021 10:05:39 -0600
> Jens Axboe <[email protected]> wrote:
>
>> So what do you propose in the interim? As far as io_uring is concerned,
>> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
>> we're making things slightly worse for the mainline kernel just to make
>> the out-of-tree patches happy.
>
> Note that the purpose of these patches are to be able to bring those
> out-of-tree patches into the kernel such that they are no longer
> out-of-tree.

Sure, I realize that. And I've always been accommodating about making
pieces of code more RT friendly; I just don't like doing it for cases
where we are making mainline worse.

In that regard, I do still consider those patches out-of-tree, which
they are. And while I'm more sympathetic to them compared to other
out-of-tree code as there's a long term plan to get it all in, it's
still out-of-tree. Best solution here is probably to just carry that
particular change in the RT patchset for now.

--
Jens Axboe

2021-08-04 19:49:47, by Steven Rostedt

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On Wed, 4 Aug 2021 18:22:31 +0200
Sebastian Andrzej Siewior <[email protected]> wrote:

> no preemption happens here with NEED_RESCHED set.

But if interrupts were disabled, how would NEED_RESCHED be set? As soon
as you enable interrupts, the interrupt that sets NEED_RESCHED would
trigger the preemption.

-- Steve

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 10:22:59 [-0600], Jens Axboe wrote:
>
> In that regard, I do still consider those patches out-of-tree, which
> they are. And while I'm more sympathetic to them compared to other
> out-of-tree code as there's a long term plan to get it all in, it's
> still out-of-tree. Best solution here is probably to just carry that
> particular change in the RT patchset for now.

So this morning I learned that there is a memory allocation in
an IRQ-off section and now, a patch later, it is almost gone. So that
makes me actually happy :)

The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
in Documentation/locking/locktypes.rst.
That said I have no problem by carrying that patch in the RT-patchset
and revisit it later.
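
For reference, a minimal sketch of that distinction (assuming some
spinlock_t 'lock'; on RT spinlock_t is a sleeping lock and must not be
acquired with hard interrupts disabled):

	/* broken on RT: hard IRQs are really off around a sleeping lock */
	local_irq_disable();
	spin_lock(&lock);

	/* fine on both: on RT this does not disable hard IRQs, the lock
	 * itself provides the protection */
	spin_lock_irq(&lock);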

Sebastian

2021-08-04 20:06:15, by Steven Rostedt

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On Wed, 4 Aug 2021 18:31:19 +0200
Sebastian Andrzej Siewior <[email protected]> wrote:

> CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
> to do the magic. Just because the code does not perform wake_up() now
> does not mean it will not do so in the future. Also it is here as an example
> which might be copied somewhere else.

Does this mean all local_irq_disable/enable() is audited? What do you do for:

local_irq_disable();
[..]
wakeup_process(x); /* on local CPU */
[..]
local_irq_enable();

And if local_irq_disable() is not used anymore, or seldom, what harm
would it be to add a preemption check to that caller? And change
local_irq_enable() that is used internally by other atomic functions to be
called __local_irq_enable()?

Not to mention that we could just open code that too:

if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
local_irq_enable();
preempt_check_resched();
}

And make it ugly enough that nobody will want to copy it :-)

-- Steve

Subject: Re: [ANNOUNCE] v5.14-rc4-rt4

On 2021-08-04 10:57:11 [-0600], Jens Axboe wrote:
>
> 1 out of 2 is better than 0 ;-)

The patch at the end of the email is what I ended up with after rebasing to
-rc5. Also after testing I figured out that tools/io_uring/io_uring-cp.c
is no longer working. The resulting file has a size of either 0 or 4KiB.
Then, tglx pointed out that the example in liburing shares the same file
name and is actually working.

From: Jens Axboe <[email protected]>
Date: Wed, 4 Aug 2021 08:43:43 -0600
Subject: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89
| [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [ 90.202561] Call Trace:
| [ 90.202577] dump_stack_lvl+0x34/0x44
| [ 90.202584] ___might_sleep.cold+0x87/0x94
| [ 90.202588] rt_spin_lock+0x19/0x70
| [ 90.202593] ___slab_alloc+0xcb/0x7d0
| [ 90.202598] ? newidle_balance.constprop.0+0xf5/0x3b0
| [ 90.202603] ? dequeue_entity+0xc3/0x290
| [ 90.202605] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202610] ? pick_next_task_fair+0xb9/0x330
| [ 90.202612] ? __schedule+0x670/0x1410
| [ 90.202615] ? io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0
| [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0
| [ 90.202625] io_wq_worker_sleeping+0x37/0x50
| [ 90.202628] schedule+0x30/0xd0
| [ 90.202630] schedule_timeout+0x8f/0x1a0
| [ 90.202634] ? __bpf_trace_tick_stop+0x10/0x10
| [ 90.202637] io_wqe_worker+0xfd/0x320
| [ 90.202641] ? finish_task_switch.isra.0+0xd3/0x290
| [ 90.202644] ? io_worker_handle_work+0x670/0x670
| [ 90.202646] ? io_worker_handle_work+0x670/0x670
| [ 90.202649] ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between two sleep cycles.

https://lore.kernel.org/lkml/[email protected]/

Reported-by: Daniel Wagner <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
Tested-by: Daniel Wagner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
fs/io-wq.c | 75 ++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 42 insertions(+), 33 deletions(-)

--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {

struct completion ref_done;

+ unsigned long create_state;
+ struct callback_head create_work;
+ int create_index;
+
struct rcu_head rcu;
};

@@ -270,50 +274,54 @@ static void io_wqe_inc_running(struct io
atomic_inc(&acct->nr_running);
}

-struct create_worker_data {
- struct callback_head work;
- struct io_wqe *wqe;
- int index;
-};
-
static void create_worker_cb(struct callback_head *cb)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;
struct io_wq *wq;
struct io_wqe *wqe;
struct io_wqe_acct *acct;

- cwd = container_of(cb, struct create_worker_data, work);
- wqe = cwd->wqe;
- wq = wqe->wq;
- acct = &wqe->acct[cwd->index];
+ worker = container_of(cb, struct io_worker, create_work);
+ wqe = worker->wqe;
+ wq = worker->wq;
+ acct = &wqe->acct[worker->create_index];
raw_spin_lock_irq(&wqe->lock);
if (acct->nr_workers < acct->max_workers)
acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
- create_io_worker(wq, cwd->wqe, cwd->index);
- kfree(cwd);
+ create_io_worker(wq, wqe, worker->create_index);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker
+ *worker, struct io_wqe_acct *acct)
{
- struct create_worker_data *cwd;
struct io_wq *wq = wqe->wq;

/* raced with exit, just ignore create call */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
goto fail;
+ if (!io_worker_get(worker))
+ goto fail;
+ /*
+ * create_state manages ownership of create_work/index. We should
+ * only need one entry per worker, as the worker going to sleep
+ * will trigger the condition, and waking will clear it once it
+ * runs the task_work.
+ */
+ if (test_bit(0, &worker->create_state) ||
+ test_and_set_bit_lock(0, &worker->create_state))
+ goto fail_release;
+
+ init_task_work(&worker->create_work, create_worker_cb);
+ worker->create_index = acct->index;
+ if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+ return;

- cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
- if (cwd) {
- init_task_work(&cwd->work, create_worker_cb);
- cwd->wqe = wqe;
- cwd->index = acct->index;
- if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
- return;
-
- kfree(cwd);
- }
+ clear_bit_unlock(0, &worker->create_state);
+fail_release:
+ io_worker_release(worker);
fail:
atomic_dec(&acct->nr_running);
io_worker_ref_put(wq);
@@ -331,7 +339,7 @@ static void io_wqe_dec_running(struct io
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
atomic_inc(&acct->nr_running);
atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, acct);
+ io_queue_worker_create(wqe, worker, acct);
}
}

@@ -992,12 +1000,12 @@ struct io_wq *io_wq_create(unsigned boun

static bool io_task_work_match(struct callback_head *cb, void *data)
{
- struct create_worker_data *cwd;
+ struct io_worker *worker;

if (cb->func != create_worker_cb)
return false;
- cwd = container_of(cb, struct create_worker_data, work);
- return cwd->wqe->wq == data;
+ worker = container_of(cb, struct io_worker, create_work);
+ return worker->wqe->wq == data;
}

void io_wq_exit_start(struct io_wq *wq)
@@ -1014,12 +1022,13 @@ static void io_wq_exit_workers(struct io
return;

while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct create_worker_data *cwd;
+ struct io_worker *worker;

- cwd = container_of(cb, struct create_worker_data, work);
- atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+ worker = container_of(cb, struct io_worker, create_work);
+ atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
io_worker_ref_put(wq);
- kfree(cwd);
+ clear_bit_unlock(0, &worker->create_state);
+ io_worker_release(worker);
}

rcu_read_lock();

Sebastian

2021-08-10 16:25:03, by kernel test robot

Subject: Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path

Hi Sebastian,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.14-rc5]
[cannot apply to next-20210810]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9a73fa375d58fee5262dd16473c8e7522bdf44de
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/7fd11421b3672d0230a9b529445014d99185b387
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
git checkout 7fd11421b3672d0230a9b529445014d99185b387
# save the attached .config to linux build tree
mkdir build_dir
make W=1 O=build_dir ARCH=um SUBARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

fs/io-wq.c: In function 'create_worker_cb':
>> fs/io-wq.c:286:15: error: 'struct io_worker' has no member named 'wq'; did you mean 'wqe'?
     286 |         wq = worker->wq;
         |                      ^~
         |                      wqe


vim +286 fs/io-wq.c

   276	
   277	static void create_worker_cb(struct callback_head *cb)
   278	{
   279		struct io_worker *worker;
   280		struct io_wq *wq;
   281		struct io_wqe *wqe;
   282		struct io_wqe_acct *acct;
   283	
   284		worker = container_of(cb, struct io_worker, create_work);
   285		wqe = worker->wqe;
 > 286		wq = worker->wq;
   287		acct = &wqe->acct[worker->create_index];
   288		raw_spin_lock_irq(&wqe->lock);
   289		if (acct->nr_workers < acct->max_workers)
   290			acct->nr_workers++;
   291		raw_spin_unlock_irq(&wqe->lock);
   292		create_io_worker(wq, wqe, worker->create_index);
   293		clear_bit_unlock(0, &worker->create_state);
   294		io_worker_release(worker);
   295	}
296

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-08-10 16:44:25, by kernel test robot

Subject: Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path

Hi Sebastian,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.14-rc5]
[cannot apply to next-20210810]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9a73fa375d58fee5262dd16473c8e7522bdf44de
config: x86_64-randconfig-a012-20210809 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 614c7d03877fd99c2de47429b15be3f00306a3bd)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/7fd11421b3672d0230a9b529445014d99185b387
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
git checkout 7fd11421b3672d0230a9b529445014d99185b387
# save the attached .config to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> fs/io-wq.c:286:15: error: no member named 'wq' in 'struct io_worker'
           wq = worker->wq;
                ~~~~~~ ^
1 error generated.


vim +286 fs/io-wq.c

   276	
   277	static void create_worker_cb(struct callback_head *cb)
   278	{
   279		struct io_worker *worker;
   280		struct io_wq *wq;
   281		struct io_wqe *wqe;
   282		struct io_wqe_acct *acct;
   283	
   284		worker = container_of(cb, struct io_worker, create_work);
   285		wqe = worker->wqe;
 > 286		wq = worker->wq;
   287		acct = &wqe->acct[worker->create_index];
   288		raw_spin_lock_irq(&wqe->lock);
   289		if (acct->nr_workers < acct->max_workers)
   290			acct->nr_workers++;
   291		raw_spin_unlock_irq(&wqe->lock);
   292		create_io_worker(wq, wqe, worker->create_index);
   293		clear_bit_unlock(0, &worker->create_state);
   294		io_worker_release(worker);
   295	}
   296	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]

