Hi all,
Brandon and I have been debugging a nasty race that leads to
BUG_ON(!newowner) in fixup_pi_state_owner() in futex.c. So far
we've only been able to reproduce the issue on 4.9.y-rt kernels.
We are still testing if this is a problem for later RT branches.
The original reproducing app was complex and took about a month
to trigger it. However based on event traces I was able to put
together the .c reproducer below. This triggers the BUG_ON on a
2-core Intel Atom E3825 system when using 4.9.y-rt within
hours (up to a day).
Here's an attempt to describe in a pseudo trace the sequence of
events that leads to the BUG_ON. All threads start on the same
CPU (CPU1 in the linked traces). The waiter, waker and boosted
threads start as SCHED_OTHER prio:120, rt_thread is configured as
SCHED_FIFO prio:92.
waiter_thread waker_thread boosted_thread rt_thread
clock_nanosleep()
futex(FUTEX_WAIT_REQUEUE_PI,
cond, lock1, timeout)
futex_wait_requeue_pi()
futex_wait_queue_me()
<scheduled out>
futex(FUTEX_LOCK_PI, lock2)
clock_nanosleep()
futex(FUTEX_LOCK_PI, lock1)
futex(FUTEX_CMP_REQUEUE_PI,
cond, lock1)
futex(FUTEX_UNLOCK_PI, lock1)
<local_timer_entry>
<sched_wakeup: rt_thread>
<sched_wakeup: boosted_thread>
<local_timer_exit>
<sched_wakeup: waiter_thread>
<sched_switch: rt_thread>
<migrated to cpu 0> futex(FUTEX_LOCK_PI, lock2)
futex(FUTEX_LOCK_PI, lock1)
<scheduled out>
<sched_pi_setprio: boosted_thread,
oldprio=120 newprio=92>
<sched_wakeup: boosted_thread>
<sched_switch: waiter_thread>
rt_mutex_wait_proxy_lock() <local_timer_entry>
-ETIMEDOUT // boosted thread gets
fixup_owner() // delayed expiring
fixup_pi_state_owner() // unrelated timers
// acquires wait_lock
__rt_mutex_futex_trylock()
__rt_mutex_slowtrylock() <local_timer_exit>
try_to_take_rt_mutex() // spins on wait_lock
// fails to take the lock
// the lock has no owner
// but it has higher priority
// waiters enqueued (i.e. boosted_thread)
BUG_ON(!newowner)
Full ftrace event dumps on OOPS are available at:
https://github.com/gratian/traces
In the same repo you'll also find info on the boot messages for
this hardware and a capture of a kgdb session with prints for the
data structures involved.
Some important data points - at the time of the crash:
* the q->pi_state->pi_mutex->owner is 0x1
* the q->pi_state->pi_mutex has waiters enqueued and the
current 'waiter_thread' task is not the highest priority
waiter
* the q->pi_state->owner is the 'waiter_thread'
You can find the kernel branch used in these tests here:
https://github.com/gratian/linux/commits/stable-rt/v4.9-rt-ni-serial
The other thing worth mentioning is the fact that the comment in
fixup_pi_state_owner() related to this:
/*
* Since we just failed the trylock; there must be an owner.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
BUG_ON(!newowner);
seems incomplete since to my eye there is a code path in
__try_to_take_rt_mutex() through which the trylock can fail
because the current task is not the highest priority waiter but
the rt mutex doesn't have the owner set yet (i.e. the highest
priority waiter didn't get a chance to fix-up the owner field
yet).
I'll be happy to provide any additional info.
Thoughts and suggestions appreciated.
Thanks,
Gratian
--8<---------------cut here---------------start------------->8---
/*
* fbomb.c
* Reproducer for BUG_ON(!newowner) in fixup_pi_state_owner()
* <linux>/kernel/futex.c
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdarg.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <err.h>
#include <errno.h>
#include <sched.h>
#include <time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <linux/futex.h>
#include <limits.h>
/* CPU the waiter, waker and rt threads pin themselves to; boosted_thread
 * also starts every round pinned there before dropping the pinning. */
#define TEST_CPU 1
/* Nanoseconds per microsecond (not referenced in this listing). */
#define NSEC_PER_USEC 1000ULL
/* Nanoseconds per second; unsigned long long, so arithmetic mixing it with
 * signed values is carried out unsigned. */
#define NSEC_PER_SEC 1000000000ULL
/* Mask applied to a futex word before comparing it with a thread id. */
#define FUTEX_TID_MASK 0x3fffffff
/* A futex word. */
typedef uint32_t futex_t;
/* Condition futex: waiter_thread waits on it, waker_thread requeues from it
 * onto lock1. Zero-initialized as a static object. */
static futex_t cond;
/* PI futex paired with cond; taken by the waiter, waker and boosted threads. */
static futex_t lock1 = 0;
/* PI futex held by boosted_thread while it blocks on lock1; rt_thread
 * contends on it. */
static futex_t lock2 = 0;
/* Rendezvous point every test thread waits on at the top of each round;
 * main() initializes it for 4 participants. */
static pthread_barrier_t start_barrier;
/*
 * Acquire the PI futex at @futex on behalf of the calling thread.
 *
 * Returns 0 once the lock is held, EINVAL for a NULL @futex, EDEADLOCK when
 * the masked futex word already equals the caller's TID, or the errno left
 * by a failed FUTEX_LOCK_PI system call.
 */
static int futex_lock_pi(futex_t *futex)
{
	futex_t unlocked = 0;
	pid_t self;
	int rc;

	if (futex == NULL)
		return EINVAL;

	self = syscall(SYS_gettid);
	if ((*futex & FUTEX_TID_MASK) == self)
		return EDEADLOCK;

	/* Fast path: the word is 0, so swap in our TID without a syscall. */
	rc = __atomic_compare_exchange_n(futex, &unlocked, self, false,
					 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	if (rc)
		return 0;

	/* Slow path: the word was not 0, let the kernel do the locking. */
	rc = syscall(SYS_futex, futex,
		     FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG,
		     0, NULL, NULL, 0);
	if (rc)
		return errno;

	return 0;
}
/*
 * Release the PI futex at @futex, which the calling thread must own.
 *
 * Returns 0 on success, EINVAL for a NULL @futex, EPERM when the masked
 * futex word does not equal the caller's TID, or the errno left by a failed
 * FUTEX_UNLOCK_PI system call.
 */
static int futex_unlock_pi(futex_t *futex)
{
	futex_t expected;
	pid_t tid;
	int ret;

	if (!futex)
		return EINVAL;

	tid = syscall(SYS_gettid);
	if (tid != (*futex & FUTEX_TID_MASK))
		return EPERM;

	/*
	 * The compare-exchange needs an "expected" object of the same type as
	 * the futex word; handing it the signed pid_t directly mismatches the
	 * pointer types. It only succeeds when the word is exactly our TID,
	 * i.e. no additional bits are set in it.
	 */
	expected = (futex_t)tid;
	ret = __atomic_compare_exchange_n(futex, &expected, 0, false,
					  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	if (ret)
		return 0;

	/* The word holds more than our bare TID: let the kernel unlock it. */
	ret = syscall(SYS_futex, futex,
		      FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG,
		      0, NULL, NULL, 0);
	return (ret) ? errno : 0;
}
/*
 * Wait on the condition futex @cond, asking to be requeued onto the PI futex
 * @futex, with @to as the timeout handed to the kernel.
 *
 * The caller is expected to hold @futex: it is dropped before waiting. When
 * the wait fails, @futex is taken again here, so the caller can treat the
 * lock as held on every return path.
 *
 * Returns 0 on success, otherwise the errno of the failed wait.
 */
static int futex_wait_requeue_pi(futex_t *cond, futex_t *futex,
				 const struct timespec *to)
{
	int status;

	(*cond)++;
	futex_unlock_pi(futex);

	status = syscall(SYS_futex, cond,
			 FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
			 *cond, to, futex, 0);
	if (!status)
		return 0;

	/* Save errno first: re-taking the lock below may overwrite it. */
	status = errno;
	/* on error, the kernel didn't acquire the lock for us */
	futex_lock_pi(futex);

	return status;
}
/*
 * Signal the condition futex @cond: bump the condition word, then issue
 * FUTEX_CMP_REQUEUE_PI with a wake count of 1, a requeue limit of INT_MAX,
 * @futex as the requeue target and the new value of @cond as the value the
 * kernel compares against.
 *
 * Returns 0 on success or the errno of the failed system call.
 */
static int futex_cmp_requeue_pi(futex_t *cond, futex_t *futex)
{
	long ret;

	(*cond)++;
	ret = syscall(SYS_futex, cond,
		      FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		      1, INT_MAX, futex, *cond);

	/*
	 * On success the call returns the number of waiters woken or
	 * requeued, which may be positive; only a negative return is a
	 * failure. Testing for "non-zero" would report a stale errno
	 * whenever a waiter was actually handled.
	 */
	return (ret < 0) ? errno : 0;
}
/*
 * Normalize @ts in place so that 0 <= tv_nsec < 1000000000, carrying whole
 * seconds into tv_sec or borrowing them from it.
 *
 * The bound is a signed local constant on purpose: comparing tv_nsec with an
 * unsigned long long constant converts a negative tv_nsec to a huge unsigned
 * value, which makes the carry loop run away instead of normalizing.
 */
static void tsnorm(struct timespec *ts)
{
	const long nsec_per_sec = 1000000000L;

	/* Too many nanoseconds: carry into the seconds field. */
	while (ts->tv_nsec >= nsec_per_sec) {
		ts->tv_nsec -= nsec_per_sec;
		ts->tv_sec++;
	}

	/* Negative nanoseconds: borrow from the seconds field. */
	while (ts->tv_nsec < 0) {
		ts->tv_nsec += nsec_per_sec;
		ts->tv_sec--;
	}
}
/*
 * Nanoseconds elapsed from @start to @end.
 *
 * Each timestamp is flattened to an unsigned 64-bit nanosecond count and the
 * two are subtracted; the result wraps around if @end precedes @start.
 */
static unsigned long long tsdiff(struct timespec *start, struct timespec *end)
{
	unsigned long long begin_ns;
	unsigned long long finish_ns;

	begin_ns = (unsigned long long)(start->tv_sec) * NSEC_PER_SEC;
	begin_ns += start->tv_nsec;

	finish_ns = (unsigned long long)(end->tv_sec) * NSEC_PER_SEC;
	finish_ns += end->tv_nsec;

	return finish_ns - begin_ns;
}
/*
 * Restrict the calling thread to the single CPU @cpu.
 * Terminates the program through err() if the affinity cannot be set.
 */
static void set_cpu_affinity(int cpu)
{
	cpu_set_t only_cpu;

	CPU_ZERO(&only_cpu);
	CPU_SET(cpu, &only_cpu);

	if (sched_setaffinity(0, sizeof(only_cpu), &only_cpu) >= 0)
		return;

	err(1, "Failed to set the CPU affinity");
}
/*
 * Undo CPU pinning: allow the calling thread to run on CPUs
 * 0 .. get_nprocs() - 1.
 * Terminates the program through err() if the affinity cannot be set.
 */
static void clr_cpu_affinity(void)
{
	cpu_set_t mask;
	/* Query the CPU count once instead of on every loop iteration. */
	const int ncpus = get_nprocs();
	int cpu;

	CPU_ZERO(&mask);
	for (cpu = 0; cpu < ncpus; cpu++)
		CPU_SET(cpu, &mask);

	if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		err(1, "Failed to clear the CPU affinity");
}
/*
 * Switch the caller to SCHED_FIFO with real-time priority @prio.
 * Terminates the program through err() if the scheduler call fails.
 */
static void set_fifo_priority(int prio)
{
	struct sched_param schedp = { .sched_priority = prio };

	if (sched_setscheduler(0, SCHED_FIFO, &schedp) >= 0)
		return;

	err(1, "Failed to set the test thread priority");
}
/*
 * Spin on the CPU, polling CLOCK_MONOTONIC, until at least @nsec
 * nanoseconds have passed since entry. The clock is read at least once
 * after the starting timestamp, even for @nsec == 0.
 */
static void do_busy_work(unsigned long long nsec)
{
	struct timespec begin;
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &begin);

	for (;;) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if (tsdiff(&begin, &now) >= nsec)
			break;
	}
}
/*
 * Thread body: holds lock2 while blocking on lock1.
 *
 * Every round starts pinned to TEST_CPU; once the barrier releases, the
 * pinning is lifted. The thread then takes lock2, sleeps 600us, takes and
 * drops lock1, and finally drops lock2. Never returns.
 */
static void *boosted_thread(void *param)
{
	struct timespec hold_time = { .tv_sec = 0, .tv_nsec = 600000ULL };

	for (;;) {
		set_cpu_affinity(TEST_CPU);
		pthread_barrier_wait(&start_barrier);
		clr_cpu_affinity();

		futex_lock_pi(&lock2);
		clock_nanosleep(CLOCK_MONOTONIC, 0, &hold_time, NULL);

		futex_lock_pi(&lock1);
		futex_unlock_pi(&lock1);

		futex_unlock_pi(&lock2);
	}
}
/* Bounds and increment (in nanoseconds) of the rt_thread sleep sweep. */
#define SWEEP_START 500000ULL
#define SWEEP_END 900000ULL
#define SWEEP_STEP 100

/*
 * Thread body: SCHED_FIFO (priority 7) thread pinned to TEST_CPU.
 *
 * Each round it sleeps for a duration that advances by SWEEP_STEP and wraps
 * back to SWEEP_START once it exceeds SWEEP_END, burns 2ms of CPU time, and
 * then takes and releases lock2. Never returns.
 */
static void *rt_thread(void *param)
{
	unsigned long long sleep_ns = SWEEP_START;
	struct timespec delay;

	set_cpu_affinity(TEST_CPU);
	set_fifo_priority(7);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		/* Advance the sweep, wrapping around at the upper bound. */
		sleep_ns += SWEEP_STEP;
		if (sleep_ns > SWEEP_END)
			sleep_ns = SWEEP_START;

		delay.tv_sec = 0;
		delay.tv_nsec = sleep_ns;
		clock_nanosleep(CLOCK_MONOTONIC, 0, &delay, NULL);

		/*
		 * Preempt the waiter thread right after it got signaled and
		 * allow the boosted_thread to block on lock1 after taking
		 * lock2.
		 */
		do_busy_work(2000000ULL);

		/* This should boost the boosted_thread and migrate it. */
		futex_lock_pi(&lock2);
		futex_unlock_pi(&lock2);
	}
}
/*
 * Thread body: the condition waiter, pinned to TEST_CPU.
 *
 * Each round it takes lock1, computes a CLOCK_MONOTONIC timestamp 800us in
 * the future, waits on cond with that timestamp as the timeout, and then
 * releases lock1. Never returns.
 */
static void *waiter_thread(void *param)
{
	struct timespec deadline;

	set_cpu_affinity(TEST_CPU);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		futex_lock_pi(&lock1);

		/* Timeout = now + 800us, normalized. */
		clock_gettime(CLOCK_MONOTONIC, &deadline);
		deadline.tv_nsec += 800000ULL;
		tsnorm(&deadline);

		futex_wait_requeue_pi(&cond, &lock1, &deadline);
		futex_unlock_pi(&lock1);
	}
}
/*
 * Thread body: the condition signaler, pinned to TEST_CPU.
 *
 * Each round it sleeps 500us, takes lock1, requeues the waiters of cond
 * onto lock1 and releases lock1 again. Never returns.
 */
static void *waker_thread(void *param)
{
	struct timespec head_start = { .tv_sec = 0, .tv_nsec = 500000ULL };

	set_cpu_affinity(TEST_CPU);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		clock_nanosleep(CLOCK_MONOTONIC, 0, &head_start, NULL);

		futex_lock_pi(&lock1);
		futex_cmp_requeue_pi(&cond, &lock1);
		futex_unlock_pi(&lock1);
	}
}
int main(int argc, char *argv[])
{
pthread_t waiter;
pthread_t waker;
pthread_t rt;
pthread_t boosted;
if (pthread_barrier_init(&start_barrier, NULL, 4))
err(1, "Failed to create start_barrier");
if (pthread_create(&waker, NULL, waker_thread, NULL) != 0)
err(1, "Failed to create waker thread");
pthread_setname_np(waker, "f_waker");
if (pthread_create(&waiter, NULL, waiter_thread, NULL) != 0)
err(1, "Failed to create waiter thread");
pthread_setname_np(waiter, "f_waiter");
if (pthread_create(&rt, NULL, rt_thread, NULL) != 0)
err(1, "Failed to create rt thread");
pthread_setname_np(rt, "f_rt");
if (pthread_create(&boosted, NULL, boosted_thread, NULL) != 0)
err(1, "Failed to create boosted thread");
pthread_setname_np(boosted, "f_boosted");
/* block here */
pthread_join(waker, NULL);
pthread_join(waiter, NULL);
pthread_join(rt, NULL);
pthread_join(boosted, NULL);
}
--8<---------------cut here---------------end--------------->8---
Hi all,
I apologize for waking up the futex demons (and replying to my own
email), but ...
Gratian Crisan writes:
>
> Brandon and I have been debugging a nasty race that leads to
> BUG_ON(!newowner) in fixup_pi_state_owner() in futex.c. So far
> we've only been able to reproduce the issue on 4.9.y-rt kernels.
> We are still testing if this is a problem for later RT branches.
I was able to reproduce the BUG_ON(!newowner) in fixup_pi_state_owner()
with a 5.10.0-rc1-rt1 kernel (currently testing 5.10.0-rc2-rt4).
The kernel branch I had set up for this test is available here:
https://github.com/gratian/linux/commits/devel-rt/v5.10-rc1-rt1-ni-serial
It has a minimal set of commits on top of devel-rt/5.10.0-rc1-rt1 to
enable the UART on this hardware and add a kgdb breakpoint (so I can
extract more information before the BUG_ON).
I've captured the reproducer, boot messages, OOPS, full event ftrace
dump and some kgdb info here: https://github.com/gratian/traces
The abbreviated OOPS (w/o the ftrace dump since it's too large to
include inline; see link[1]):
[15812.238958] ------------[ cut here ]------------
[15812.238963] kernel BUG at kernel/futex.c:2399!
[15812.238974] invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI
[15812.261428] CPU: 1 PID: 1972 Comm: f_waiter Tainted: G U W 5.10.0-rc1-rt1-00015-g10c3af983f2e #1
[15812.271341] Hardware name: National Instruments NI cRIO-9035/NI cRIO-9035, BIOS 1.3.0f1 07/18/2016
[15812.280298] RIP: 0010:fixup_pi_state_owner.isra.0+0x2d7/0x380
[15812.286047] Code: 0f 85 a5 fe ff ff 48 8b 7d b8 e8 e4 fe 6b 00 85 c0 0f 85 94 fe ff ff 4d 8b 75 28 49 83 e6 fe 0f 85 bd fd ff ff e8 f9 f2 02 00 <0f> 0b 48 8b 45 a8 48 8b 38 e8 ab 1a 6c 00 48 8b 7d b8 e8 52 19 6c
[15812.304817] RSP: 0018:ffffc90001877b88 EFLAGS: 00010046
[15812.310043] RAX: 0000000000000000 RBX: ffff888006c52640 RCX: 0000000000000001
[15812.317177] RDX: 0000000000000078 RSI: 000000000000005c RDI: ffff888003e7f028
[15812.324310] RBP: ffffc90001877c08 R08: 0000000000000001 R09: ffff888003e7f010
[15812.331443] R10: 00000000000001d8 R11: 0000000000000000 R12: 000056469b8770e4
[15812.338576] R13: ffff888003e7f000 R14: 0000000000000000 R15: ffff88801f281708
[15812.345713] FS: 00007f0d0f4e1700(0000) GS:ffff888037b00000(0000) knlGS:0000000000000000
[15812.353802] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[15812.359547] CR2: 00007f2f6e5830e1 CR3: 0000000008a36000 CR4: 00000000001006e0
[15812.366682] Call Trace:
[15812.369131] fixup_owner+0x6a/0x70
[15812.372534] futex_wait_requeue_pi.constprop.0+0x5a1/0x6b0
[15812.378024] ? __hrtimer_init_sleeper+0x60/0x60
[15812.382562] do_futex+0x515/0xc90
[15812.385879] ? trace_buffer_unlock_commit_regs+0x40/0x1f0
[15812.391278] ? trace_event_buffer_commit+0x7b/0x260
[15812.396158] ? trace_event_raw_event_sys_enter+0x9a/0x100
[15812.401558] ? _copy_from_user+0x2b/0x60
[15812.405484] __x64_sys_futex+0x144/0x180
[15812.409410] do_syscall_64+0x38/0x50
[15812.412986] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[15812.418037] RIP: 0033:0x7f0d0ec6b4c9
[15812.421613] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 9f 39 2b 00 f7 d8 64 89 01 48
[15812.440383] RSP: 002b:00007f0d0f4e0ec8 EFLAGS: 00000216 ORIG_RAX: 00000000000000ca
[15812.447952] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0d0ec6b4c9
[15812.455085] RDX: 00000000001a560b RSI: 000000000000008b RDI: 000056469b8770e0
[15812.462219] RBP: 00007f0d0f4e0f10 R08: 000056469b8770e4 R09: 0000000000000000
[15812.469352] R10: 00007f0d0f4e0f30 R11: 0000000000000216 R12: 0000000000000000
[15812.476485] R13: 00007ffdd55cb86f R14: 0000000000000000 R15: 00007f0d0f56e040
[15812.483622] Modules linked in: g_ether u_ether libcomposite udc_core ipv6 i915 mousedev hid_multitouch intel_gtt coretemp drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops aesni_intel glue_helper crypto_simd igb drm i2c_i801 lpc_ich mfd_core i2c_algo_bit i2c_smbus agpgart dwc3_pci video backlight button [last unloaded: rng_core]
[15812.513719] Dumping ftrace buffer:
[15812.517121] ---------------------------------
<snip> see:
[1] https://github.com/gratian/traces/blob/main/oops_ftrace-5.10.0-rc1-rt1-00015-g10c3af983f2e-dump1.txt
[16492.155124] ---------------------------------
[16492.159486] ---[ end trace 0000000000000003 ]---
[16492.159489] printk: enabled sync mode
[16492.167762] RIP: 0010:fixup_pi_state_owner.isra.0+0x2d7/0x380
[16492.173513] Code: 0f 85 a5 fe ff ff 48 8b 7d b8 e8 e4 fe 6b 00 85 c0 0f 85 94 fe ff ff 4d 8b 75 28 49 83 e6 fe 0f 85 bd fd ff ff e8 f9 f2 02 00 <0f> 0b 48 8b 45 a8 48 8b 38 e8 ab 1a 6c 00 48 8b 7d b8 e8 52 19 6c
[16492.192282] RSP: 0018:ffffc90001877b88 EFLAGS: 00010046
[16492.197508] RAX: 0000000000000000 RBX: ffff888006c52640 RCX: 0000000000000001
[16492.204642] RDX: 0000000000000078 RSI: 000000000000005c RDI: ffff888003e7f028
[16492.211776] RBP: ffffc90001877c08 R08: 0000000000000001 R09: ffff888003e7f010
[16492.218910] R10: 00000000000001d8 R11: 0000000000000000 R12: 000056469b8770e4
[16492.226043] R13: ffff888003e7f000 R14: 0000000000000000 R15: ffff88801f281708
[16492.233180] FS: 00007f0d0f4e1700(0000) GS:ffff888037b00000(0000) knlGS:0000000000000000
[16492.241270] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[16492.247016] CR2: 00007f2f6e5830e1 CR3: 0000000008a36000 CR4: 00000000001006e0
[16492.254150] Kernel panic - not syncing: Fatal exception
[16493.395619] Shutting down cpus with NMI
[16493.399460] Dumping ftrace buffer:
[16493.402861] (ftrace buffer empty)
[16493.406436] Kernel Offset: disabled
[16493.409923] Rebooting in 30 seconds..
[16523.417644] ACPI MEMORY or I/O RESET_REG.
I've also included the modified reproducer that triggers the issue
below.
Ideas?
Thanks,
Gratian
--8<---------------cut here---------------start------------->8---
/*
* fbomb_v2.c
* Reproducer for BUG_ON(!newowner) in fixup_pi_state_owner()
* <linux>/kernel/futex.c
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdarg.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <err.h>
#include <errno.h>
#include <sched.h>
#include <time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <linux/futex.h>
#include <limits.h>
/* CPU the waiter, waker and rt threads pin themselves to; boosted_thread
 * also starts every round pinned there before dropping the pinning. */
#define TEST_CPU 1
/* Nanoseconds per microsecond (not referenced in this listing). */
#define NSEC_PER_USEC 1000ULL
/* Nanoseconds per second; unsigned long long, so arithmetic mixing it with
 * signed values is carried out unsigned. */
#define NSEC_PER_SEC 1000000000ULL
/* Number of background sleeper threads main() starts. */
#define NOISE_THREADS 64
/* Mask applied to a futex word before comparing it with a thread id. */
#define FUTEX_TID_MASK 0x3fffffff
/* A futex word. */
typedef uint32_t futex_t;
/* Condition futex: waiter_thread waits on it, waker_thread requeues from it
 * onto lock1. Zero-initialized as a static object. */
static futex_t cond;
/* PI futex paired with cond; taken by the waiter, waker and boosted threads. */
static futex_t lock1 = 0;
/* PI futex held by boosted_thread while it blocks on lock1; rt_thread
 * contends on it. */
static futex_t lock2 = 0;
/* Rendezvous point every test thread waits on at the top of each round;
 * main() initializes it for 4 participants. */
static pthread_barrier_t start_barrier;
/*
 * Acquire the PI futex at @futex on behalf of the calling thread.
 *
 * Returns 0 once the lock is held, EINVAL for a NULL @futex, EDEADLOCK when
 * the masked futex word already equals the caller's TID, or the errno left
 * by a failed FUTEX_LOCK_PI system call.
 */
static int futex_lock_pi(futex_t *futex)
{
	futex_t unlocked = 0;
	pid_t self;
	int rc;

	if (futex == NULL)
		return EINVAL;

	self = syscall(SYS_gettid);
	if ((*futex & FUTEX_TID_MASK) == self)
		return EDEADLOCK;

	/* Fast path: the word is 0, so swap in our TID without a syscall. */
	rc = __atomic_compare_exchange_n(futex, &unlocked, self, false,
					 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	if (rc)
		return 0;

	/* Slow path: the word was not 0, let the kernel do the locking. */
	rc = syscall(SYS_futex, futex,
		     FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG,
		     0, NULL, NULL, 0);
	if (rc)
		return errno;

	return 0;
}
/*
 * Release the PI futex at @futex, which the calling thread must own.
 *
 * Returns 0 on success, EINVAL for a NULL @futex, EPERM when the masked
 * futex word does not equal the caller's TID, or the errno left by a failed
 * FUTEX_UNLOCK_PI system call.
 */
static int futex_unlock_pi(futex_t *futex)
{
	futex_t expected;
	pid_t tid;
	int ret;

	if (!futex)
		return EINVAL;

	tid = syscall(SYS_gettid);
	if (tid != (*futex & FUTEX_TID_MASK))
		return EPERM;

	/*
	 * The compare-exchange needs an "expected" object of the same type as
	 * the futex word; handing it the signed pid_t directly mismatches the
	 * pointer types. It only succeeds when the word is exactly our TID,
	 * i.e. no additional bits are set in it.
	 */
	expected = (futex_t)tid;
	ret = __atomic_compare_exchange_n(futex, &expected, 0, false,
					  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	if (ret)
		return 0;

	/* The word holds more than our bare TID: let the kernel unlock it. */
	ret = syscall(SYS_futex, futex,
		      FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG,
		      0, NULL, NULL, 0);
	return (ret) ? errno : 0;
}
/*
 * Wait on the condition futex @cond, asking to be requeued onto the PI futex
 * @futex, with @to as the timeout handed to the kernel.
 *
 * The caller is expected to hold @futex: it is dropped before waiting. When
 * the wait fails, @futex is taken again here, so the caller can treat the
 * lock as held on every return path.
 *
 * Returns 0 on success, otherwise the errno of the failed wait.
 */
static int futex_wait_requeue_pi(futex_t *cond, futex_t *futex,
				 const struct timespec *to)
{
	int status;

	(*cond)++;
	futex_unlock_pi(futex);

	status = syscall(SYS_futex, cond,
			 FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
			 *cond, to, futex, 0);
	if (!status)
		return 0;

	/* Save errno first: re-taking the lock below may overwrite it. */
	status = errno;
	/* on error, the kernel didn't acquire the lock for us */
	futex_lock_pi(futex);

	return status;
}
/*
 * Signal the condition futex @cond: bump the condition word, then issue
 * FUTEX_CMP_REQUEUE_PI with a wake count of 1, a requeue limit of INT_MAX,
 * @futex as the requeue target and the new value of @cond as the value the
 * kernel compares against.
 *
 * Returns 0 on success or the errno of the failed system call.
 */
static int futex_cmp_requeue_pi(futex_t *cond, futex_t *futex)
{
	long ret;

	(*cond)++;
	ret = syscall(SYS_futex, cond,
		      FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		      1, INT_MAX, futex, *cond);

	/*
	 * On success the call returns the number of waiters woken or
	 * requeued, which may be positive; only a negative return is a
	 * failure. Testing for "non-zero" would report a stale errno
	 * whenever a waiter was actually handled.
	 */
	return (ret < 0) ? errno : 0;
}
/*
 * Normalize @ts in place so that 0 <= tv_nsec < 1000000000, carrying whole
 * seconds into tv_sec or borrowing them from it.
 *
 * The bound is a signed local constant on purpose: comparing tv_nsec with an
 * unsigned long long constant converts a negative tv_nsec to a huge unsigned
 * value, which makes the carry loop run away instead of normalizing.
 */
static void tsnorm(struct timespec *ts)
{
	const long nsec_per_sec = 1000000000L;

	/* Too many nanoseconds: carry into the seconds field. */
	while (ts->tv_nsec >= nsec_per_sec) {
		ts->tv_nsec -= nsec_per_sec;
		ts->tv_sec++;
	}

	/* Negative nanoseconds: borrow from the seconds field. */
	while (ts->tv_nsec < 0) {
		ts->tv_nsec += nsec_per_sec;
		ts->tv_sec--;
	}
}
/*
 * Nanoseconds elapsed from @start to @end.
 *
 * Each timestamp is flattened to an unsigned 64-bit nanosecond count and the
 * two are subtracted; the result wraps around if @end precedes @start.
 */
static unsigned long long tsdiff(struct timespec *start, struct timespec *end)
{
	unsigned long long begin_ns;
	unsigned long long finish_ns;

	begin_ns = (unsigned long long)(start->tv_sec) * NSEC_PER_SEC;
	begin_ns += start->tv_nsec;

	finish_ns = (unsigned long long)(end->tv_sec) * NSEC_PER_SEC;
	finish_ns += end->tv_nsec;

	return finish_ns - begin_ns;
}
/*
 * Restrict the calling thread to the single CPU @cpu.
 * Terminates the program through err() if the affinity cannot be set.
 */
static void set_cpu_affinity(int cpu)
{
	cpu_set_t only_cpu;

	CPU_ZERO(&only_cpu);
	CPU_SET(cpu, &only_cpu);

	if (sched_setaffinity(0, sizeof(only_cpu), &only_cpu) >= 0)
		return;

	err(1, "Failed to set the CPU affinity");
}
/*
 * Undo CPU pinning: allow the calling thread to run on CPUs
 * 0 .. get_nprocs() - 1.
 * Terminates the program through err() if the affinity cannot be set.
 */
static void clr_cpu_affinity(void)
{
	cpu_set_t mask;
	/* Query the CPU count once instead of on every loop iteration. */
	const int ncpus = get_nprocs();
	int cpu;

	CPU_ZERO(&mask);
	for (cpu = 0; cpu < ncpus; cpu++)
		CPU_SET(cpu, &mask);

	if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
		err(1, "Failed to clear the CPU affinity");
}
/*
 * Switch the caller to SCHED_FIFO with real-time priority @prio.
 * Terminates the program through err() if the scheduler call fails.
 */
static void set_fifo_priority(int prio)
{
	struct sched_param schedp = { .sched_priority = prio };

	if (sched_setscheduler(0, SCHED_FIFO, &schedp) >= 0)
		return;

	err(1, "Failed to set the test thread priority");
}
/*
 * Spin on the CPU, polling CLOCK_MONOTONIC, until at least @nsec
 * nanoseconds have passed since entry. The clock is read at least once
 * after the starting timestamp, even for @nsec == 0.
 */
static void do_busy_work(unsigned long long nsec)
{
	struct timespec begin;
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &begin);

	for (;;) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if (tsdiff(&begin, &now) >= nsec)
			break;
	}
}
/*
 * Thread body: holds lock2 while blocking on lock1.
 *
 * Every round starts pinned to TEST_CPU; once the barrier releases, the
 * pinning is lifted. The thread then takes lock2, sleeps 600us, takes and
 * drops lock1, and finally drops lock2. Never returns.
 */
static void *boosted_thread(void *param)
{
	struct timespec hold_time = { .tv_sec = 0, .tv_nsec = 600000ULL };

	for (;;) {
		set_cpu_affinity(TEST_CPU);
		pthread_barrier_wait(&start_barrier);
		clr_cpu_affinity();

		futex_lock_pi(&lock2);
		clock_nanosleep(CLOCK_MONOTONIC, 0, &hold_time, NULL);

		futex_lock_pi(&lock1);
		futex_unlock_pi(&lock1);

		futex_unlock_pi(&lock2);
	}
}
/* Bounds and increment (in nanoseconds) of the rt_thread sleep sweep. */
#define SWEEP_START 500000ULL
#define SWEEP_END 900000ULL
#define SWEEP_STEP 100

/*
 * Thread body: SCHED_FIFO (priority 7) thread pinned to TEST_CPU.
 *
 * Each round it sleeps for a duration that advances by SWEEP_STEP and wraps
 * back to SWEEP_START once it exceeds SWEEP_END, burns 2ms of CPU time, and
 * then takes and releases lock2. Never returns.
 */
static void *rt_thread(void *param)
{
	unsigned long long sleep_ns = SWEEP_START;
	struct timespec delay;

	set_cpu_affinity(TEST_CPU);
	set_fifo_priority(7);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		/* Advance the sweep, wrapping around at the upper bound. */
		sleep_ns += SWEEP_STEP;
		if (sleep_ns > SWEEP_END)
			sleep_ns = SWEEP_START;

		delay.tv_sec = 0;
		delay.tv_nsec = sleep_ns;
		clock_nanosleep(CLOCK_MONOTONIC, 0, &delay, NULL);

		/*
		 * Preempt the waiter thread right after it got signaled and
		 * allow the boosted_thread to block on lock1 after taking
		 * lock2.
		 */
		do_busy_work(2000000ULL);

		/* This should boost the boosted_thread and migrate it. */
		futex_lock_pi(&lock2);
		futex_unlock_pi(&lock2);
	}
}
/*
 * Thread body: the condition waiter, pinned to TEST_CPU.
 *
 * Each round it takes lock1, computes a CLOCK_MONOTONIC timestamp 800us in
 * the future, waits on cond with that timestamp as the timeout, and then
 * releases lock1. Never returns.
 */
static void *waiter_thread(void *param)
{
	struct timespec deadline;

	set_cpu_affinity(TEST_CPU);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		futex_lock_pi(&lock1);

		/* Timeout = now + 800us, normalized. */
		clock_gettime(CLOCK_MONOTONIC, &deadline);
		deadline.tv_nsec += 800000ULL;
		tsnorm(&deadline);

		futex_wait_requeue_pi(&cond, &lock1, &deadline);
		futex_unlock_pi(&lock1);
	}
}
/*
 * Thread body: the condition signaler, pinned to TEST_CPU.
 *
 * Each round it sleeps 500us, takes lock1, requeues the waiters of cond
 * onto lock1 and releases lock1 again. Never returns.
 */
static void *waker_thread(void *param)
{
	struct timespec head_start = { .tv_sec = 0, .tv_nsec = 500000ULL };

	set_cpu_affinity(TEST_CPU);

	for (;;) {
		pthread_barrier_wait(&start_barrier);

		clock_nanosleep(CLOCK_MONOTONIC, 0, &head_start, NULL);

		futex_lock_pi(&lock1);
		futex_cmp_requeue_pi(&cond, &lock1);
		futex_unlock_pi(&lock1);
	}
}
/*
 * Thread body: background sleeper pinned to CPU 0.
 *
 * Repeatedly sleeps for a pseudo-random duration below one millisecond
 * (random() % 1000000 nanoseconds). Never returns.
 */
static void *noise_thread(void *param)
{
	struct timespec nap;

	set_cpu_affinity(0);

	for (;;) {
		nap.tv_sec = 0;
		nap.tv_nsec = random() % 1000000;
		clock_nanosleep(CLOCK_MONOTONIC, 0, &nap, NULL);
	}
}
int main(int argc, char *argv[])
{
pthread_t waiter;
pthread_t waker;
pthread_t rt;
pthread_t boosted;
pthread_t noise[NOISE_THREADS];
int i;
if (pthread_barrier_init(&start_barrier, NULL, 4))
err(1, "Failed to create start_barrier");
if (pthread_create(&waker, NULL, waker_thread, NULL) != 0)
err(1, "Failed to create waker thread");
pthread_setname_np(waker, "f_waker");
if (pthread_create(&waiter, NULL, waiter_thread, NULL) != 0)
err(1, "Failed to create waiter thread");
pthread_setname_np(waiter, "f_waiter");
if (pthread_create(&rt, NULL, rt_thread, NULL) != 0)
err(1, "Failed to create rt thread");
pthread_setname_np(rt, "f_rt");
if (pthread_create(&boosted, NULL, boosted_thread, NULL) != 0)
err(1, "Failed to create boosted thread");
pthread_setname_np(boosted, "f_boosted");
for (i = 0; i < NOISE_THREADS; i++) {
if (pthread_create(&noise[i], NULL, noise_thread, NULL) != 0)
err(1, "Failed to create noise thread");
pthread_setname_np(noise[i], "f_noise");
}
/* block here */
pthread_join(waker, NULL);
pthread_join(waiter, NULL);
pthread_join(rt, NULL);
pthread_join(boosted, NULL);
}
--8<---------------cut here---------------end--------------->8---
On Tue, 2020-11-03 at 17:31 -0600, Gratian Crisan wrote:
> Hi all,
>
> I apologize for waking up the futex demons (and replying to my own
> email), but ...
>
> Gratian Crisan writes:
> >
> > Brandon and I have been debugging a nasty race that leads to
> > BUG_ON(!newowner) in fixup_pi_state_owner() in futex.c. So far
> > we've only been able to reproduce the issue on 4.9.y-rt kernels.
> > We are still testing if this is a problem for later RT branches.
>
> I was able to reproduce the BUG_ON(!newowner) in fixup_pi_state_owner()
> with a 5.10.0-rc1-rt1 kernel (currently testing 5.10.0-rc2-rt4).
My box says it's generic.
KERNEL: vmlinux-5.10.0.gb7cbaf5-master.gz
DUMPFILE: vmcore
CPUS: 8
DATE: Wed Nov 4 01:46:56 2020
UPTIME: 00:02:06
LOAD AVERAGE: 0.25, 0.15, 0.06
TASKS: 726
NODENAME: homer
RELEASE: 5.10.0.gb7cbaf5-master
VERSION: #26 SMP Tue Nov 3 14:10:35 CET 2020
MACHINE: x86_64 (3591 Mhz)
MEMORY: 16 GB
PANIC: ""
PID: 4631
COMMAND: "f_waiter"
TASK: ffff88818a1fb900 [THREAD_INFO: ffff88818a1fb900]
CPU: 1
STATE: TASK_RUNNING (PANIC)
crash.rt> bt
PID: 4631 TASK: ffff88818a1fb900 CPU: 1 COMMAND: "f_waiter"
#0 [ffff88816a0b3a58] machine_kexec at ffffffff8104b2dc
#1 [ffff88816a0b3aa0] __crash_kexec at ffffffff810fc97a
#2 [ffff88816a0b3b60] crash_kexec at ffffffff810fda55
#3 [ffff88816a0b3b70] oops_end at ffffffff81021813
#4 [ffff88816a0b3b90] do_trap at ffffffff8101eaec
#5 [ffff88816a0b3be0] do_error_trap at ffffffff8101ebd5
#6 [ffff88816a0b3c20] exc_invalid_op at ffffffff816d8bdb
#7 [ffff88816a0b3c40] asm_exc_invalid_op at ffffffff81800a62
#8 [ffff88816a0b3cc8] fixup_pi_state_owner at ffffffff810f065c
#9 [ffff88816a0b3d58] futex_wait_requeue_pi.constprop.0 at ffffffff810f1fcb
#10 [ffff88816a0b3ec8] do_futex at ffffffff810f2482
#11 [ffff88816a0b3ed8] __x64_sys_futex at ffffffff810f2ab5
#12 [ffff88816a0b3f40] do_syscall_64 at ffffffff816d88c3
#13 [ffff88816a0b3f50] entry_SYSCALL_64_after_hwframe at ffffffff8180007c
RIP: 00007f665b94f839 RSP: 00007f665b056e88 RFLAGS: 00000212
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f665b94f839
RDX: 0000000000000509 RSI: 000000000000008b RDI: 00000000006020c0
RBP: 00007f665b056ed0 R8: 00000000006020c4 R9: 0000000000000000
R10: 00007f665b056ef0 R11: 0000000000000212 R12: 00007ffd42284c3e
R13: 00007ffd42284c3f R14: 0000000000000000 R15: 00007ffd42284c40
ORIG_RAX: 00000000000000ca CS: 0033 SS: 002b
crash.rt> gdb list *0xffffffff810f065c
0xffffffff810f065c is in fixup_pi_state_owner (kernel/futex.c:2386).
2381
2382 /*
2383 * Since we just failed the trylock; there must be an owner.
2384 */
2385 newowner = rt_mutex_owner(&pi_state->pi_mutex);
2386 BUG_ON(!newowner);
2387 } else {
2388 WARN_ON_ONCE(argowner != current);
2389 if (oldowner == current) {
2390 /*
crash.rt>
On Wed, 2020-11-04 at 01:56 +0100, Mike Galbraith wrote:
> On Tue, 2020-11-03 at 17:31 -0600, Gratian Crisan wrote:
> > Hi all,
> >
> > I apologize for waking up the futex demons (and replying to my own
> > email), but ...
> >
> > Gratian Crisan writes:
> > >
> > > Brandon and I have been debugging a nasty race that leads to
> > > BUG_ON(!newowner) in fixup_pi_state_owner() in futex.c. So far
> > > we've only been able to reproduce the issue on 4.9.y-rt kernels.
> > > We are still testing if this is a problem for later RT branches.
> >
> > I was able to reproduce the BUG_ON(!newowner) in fixup_pi_state_owner()
> > with a 5.10.0-rc1-rt1 kernel (currently testing 5.10.0-rc2-rt4).
>
> My box says it's generic.
---
kernel/futex.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2383,7 +2383,18 @@ static int fixup_pi_state_owner(u32 __us
* Since we just failed the trylock; there must be an owner.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+
+ /*
+ * Why? Because I know what I'm doing with these beasts? Nope,
+ * but what the hell, a busy restart loop let f_boosted become
+ * owner, so go for it. Box still boots, works, no longer makes
+ * boom with fbomb_v2, and as an added bonus, didn't even blow
+ * futextests all up. Maybe it'll help... or not, we'll see.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {
On Tue, Nov 03 2020 at 17:31, Gratian Crisan wrote:
> I apologize for waking up the futex demons (and replying to my own
> email), but ...
I was staring at it already but couldn't wrap my head around it.
> Gratian Crisan writes:
> I was able to reproduce the BUG_ON(!newowner) in fixup_pi_state_owner()
> with a 5.10.0-rc1-rt1 kernel (currently testing 5.10.0-rc2-rt4).
>
> I've captured the reproducer, boot messages, OOPS, full event ftrace
> dump and some kgdb info here: https://github.com/gratian/traces
>
> The abbreviated OOPS (w/o the ftrace dump since it's too large to
> include inline; see link[1]):
Let me start at that then.
Thanks,
tglx
On Wed, Nov 04 2020 at 08:42, Mike Galbraith wrote:
> On Wed, 2020-11-04 at 01:56 +0100, Mike Galbraith wrote:
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2383,7 +2383,18 @@ static int fixup_pi_state_owner(u32 __us
> * Since we just failed the trylock; there must be an owner.
> */
> newowner = rt_mutex_owner(&pi_state->pi_mutex);
> - BUG_ON(!newowner);
> +
> + /*
> + * Why? Because I know what I'm doing with these beasts? Nope,
> + * but what the hell, a busy restart loop let f_boosted become
> + * owner, so go for it. Box still boots, works, no longer makes
> + * boom with fbomb_v2, and as an added bonus, didn't even blow
> + * futextests all up. Maybe it'll help... or not, we'll see.
> + */
> + if (unlikely(!newowner)) {
> + err = -EAGAIN;
> + goto handle_err;
Yes, that cures it, but does not really explain why newowner is
NULL. Lemme stare more.
On Wed, Nov 04 2020 at 11:24, Thomas Gleixner wrote:
> On Wed, Nov 04 2020 at 08:42, Mike Galbraith wrote:
>> On Wed, 2020-11-04 at 01:56 +0100, Mike Galbraith wrote:
>> --- a/kernel/futex.c
>> +++ b/kernel/futex.c
>> @@ -2383,7 +2383,18 @@ static int fixup_pi_state_owner(u32 __us
>> * Since we just failed the trylock; there must be an owner.
>> */
>> newowner = rt_mutex_owner(&pi_state->pi_mutex);
>> - BUG_ON(!newowner);
>> +
>> + /*
>> + * Why? Because I know what I'm doing with these beasts? Nope,
>> + * but what the hell, a busy restart loop let f_boosted become
>> + * owner, so go for it. Box still boots, works, no longer makes
>> + * boom with fbomb_v2, and as an added bonus, didn't even blow
>> + * futextests all up. Maybe it'll help... or not, we'll see.
>> + */
>> + if (unlikely(!newowner)) {
>> + err = -EAGAIN;
>> + goto handle_err;
>
> Yes, that cures it, but does not really explain why newowner is
> NULL. Lemme stare more.
Aside from that, it's going to create inconsistent state in the worst
case. There is something really fishy in the trace Gratian provided....
On Wed, Nov 04 2020 at 12:17, Thomas Gleixner wrote:
> On Wed, Nov 04 2020 at 11:24, Thomas Gleixner wrote:
>> On Wed, Nov 04 2020 at 08:42, Mike Galbraith wrote:
>>> On Wed, 2020-11-04 at 01:56 +0100, Mike Galbraith wrote:
>>> --- a/kernel/futex.c
>>> +++ b/kernel/futex.c
>>> @@ -2383,7 +2383,18 @@ static int fixup_pi_state_owner(u32 __us
>>> * Since we just failed the trylock; there must be an owner.
>>> */
>>> newowner = rt_mutex_owner(&pi_state->pi_mutex);
>>> - BUG_ON(!newowner);
>>> +
>>> + /*
>>> + * Why? Because I know what I'm doing with these beasts? Nope,
>>> + * but what the hell, a busy restart loop let f_boosted become
>>> + * owner, so go for it. Box still boots, works, no longer makes
>>> + * boom with fbomb_v2, and as an added bonus, didn't even blow
>>> + * futextests all up. Maybe it'll help... or not, we'll see.
>>> + */
>>> + if (unlikely(!newowner)) {
>>> + err = -EAGAIN;
>>> + goto handle_err;
>>
>> Yes, that cures it, but does not really explain why newowner is
>> NULL. Lemme stare more.
>
> Aside of that it's going to create inconsistent state in the worst
> case. There is something really fishy in the trace Gratian provided....
Ok. With some more info added to the trace I could convince myself that
Gratian's trace is not so fishy at all and that Mike's fix is actually
the right thing to do.
I'll post that with a proper comment and changelog. Mike, can I add your
Signed-off-by to the thing?
On Wed, 2020-11-04 at 14:26 +0100, Thomas Gleixner wrote:
>
> I'll post that with a proper comment and changelog. Mike, can I add your
> Signed-off-by to the thing?
I suppose.
-Mike
From: Mike Galbraith <[email protected]>
Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
This is one possible chain of events leading to this:
Task Prio Operation
T1 120 lock(F)
T2 120 lock(F) -> blocks (top waiter)
T3 50 (RT) lock(F) -> boosts T3 and blocks (new top waiter)
XX timeout/ -> wakes T2
signal
T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter
and the lower priority T2 cannot steal the lock.
-> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
The comment states that this is invalid and rt_mutex_real_owner() must
return a non NULL owner when the trylock failed, but in case of a queued
and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
state. The higher priority waiter has simply not yet managed to take over
the rtmutex.
The BUG_ON() is therefore wrong and this is just another retry condition in
fixup_pi_state_owner().
Drop the locks, so that T3 can make progress, and then try the fixup again.
Gratian provided a great analysis, traces and a reproducer. The analysis is
to the point, but it confused the hell out of that tglx dude who had to
page in all the futex horrors again. Condensed version is above.
[ tglx: Wrote comment and changelog ]
Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
Reported-by: Gratian Crisan <[email protected]>
Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
---
kernel/futex.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2380,10 +2380,22 @@ static int fixup_pi_state_owner(u32 __us
}
/*
- * Since we just failed the trylock; there must be an owner.
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ ret = -EAGAIN;
+ goto handle_err;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {
On Wed, Nov 04 2020 at 16:12, Thomas Gleixner wrote:
> From: Mike Galbraith <[email protected]>
>
> Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
> This is one possible chain of events leading to this:
>
> Task Prio Operation
> T1 120 lock(F)
> T2 120 lock(F) -> blocks (top waiter)
> T3 50 (RT) lock(F) -> boosts T3 and blocks (new top waiter)
boosts T1 obviously as Sebastian just pointed out to me. /me pulls the
futex induced brain damage excuse ...
On Wed, 2020-11-04 at 16:12 +0100, Thomas Gleixner wrote:
> From: Mike Galbraith <[email protected]>
Hrmph. How about a Suggested-by, or just take it. That the dinky diag hit
the mark (not _entirely_ by accident, but.. ;) is irrelevant; you did
all of the work to make it a patch.
-Mike
> Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
> This is one possible chain of events leading to this:
>
> Task Prio Operation
> T1 120 lock(F)
> T2 120 lock(F) -> blocks (top waiter)
> T3 50 (RT) lock(F) -> boosts T3 and blocks (new top waiter)
> XX timeout/ -> wakes T2
> signal
> T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
> T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter
> and the lower priority T2 cannot steal the lock.
> -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
>
> The comment states that this is invalid and rt_mutex_real_owner() must
> return a non NULL owner when the trylock failed, but in case of a queued
> and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
> state. The higher priority waiter has simply not yet managed to take over
> the rtmutex.
>
> The BUG_ON() is therefore wrong and this is just another retry condition in
> fixup_pi_state_owner().
>
> Drop the locks, so that T3 can make progress, and then try the fixup again.
>
> Gratian provided a great analysis, traces and a reproducer. The analysis is
> to the point, but it confused the hell out of that tglx dude who had to
> page in all the futex horrors again. Condensed version is above.
>
> [ tglx: Wrote comment and changelog ]
>
> Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
> Reported-by: Gratian Crisan <[email protected]>
> Signed-off-by: Mike Galbraith <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: [email protected]
> Link: https://lore.kernel.org/r/[email protected]
> ---
> kernel/futex.c | 16 ++++++++++++++--
> 1 file changed, 14 insertions(+), 2 deletions(-)
>
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2380,10 +2380,22 @@ static int fixup_pi_state_owner(u32 __us
> }
>
> /*
> - * Since we just failed the trylock; there must be an owner.
> + * The trylock just failed, so either there is an owner or
> + * there is a higher priority waiter than this one.
> */
> newowner = rt_mutex_owner(&pi_state->pi_mutex);
> - BUG_ON(!newowner);
> + /*
> + * If the higher priority waiter has not yet taken over the
> + * rtmutex then newowner is NULL. We can't return here with
> + * that state because it's inconsistent vs. the user space
> + * state. So drop the locks and try again. It's a valid
> + * situation and not any different from the other retry
> + * conditions.
> + */
> + if (unlikely(!newowner)) {
> + ret = -EAGAIN;
> + goto handle_err;
> + }
> } else {
> WARN_ON_ONCE(argowner != current);
> if (oldowner == current) {
Thomas Gleixner writes:
> From: Mike Galbraith <[email protected]>
>
> Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
> This is one possible chain of events leading to this:
>
> Task Prio Operation
> T1 120 lock(F)
> T2 120 lock(F) -> blocks (top waiter)
> T3 50 (RT) lock(F) -> boosts T3 and blocks (new top waiter)
> XX timeout/ -> wakes T2
> signal
> T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
> T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter
> and the lower priority T2 cannot steal the lock.
> -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
>
> The comment states that this is invalid and rt_mutex_real_owner() must
> return a non NULL owner when the trylock failed, but in case of a queued
> and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
> state. The higher priority waiter has simply not yet managed to take over
> the rtmutex.
>
> The BUG_ON() is therefore wrong and this is just another retry condition in
> fixup_pi_state_owner().
>
> Drop the locks, so that T3 can make progress, and then try the fixup again.
>
> Gratian provided a great analysis, traces and a reproducer. The analysis is
> to the point, but it confused the hell out of that tglx dude who had to
> page in all the futex horrors again. Condensed version is above.
>
> [ tglx: Wrote comment and changelog ]
>
> Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
> Reported-by: Gratian Crisan <[email protected]>
> Signed-off-by: Mike Galbraith <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: [email protected]
> Link: https://lore.kernel.org/r/[email protected]
LGTM, no crashes in my testing today.
-Gratian
> ---
> kernel/futex.c | 16 ++++++++++++++--
> 1 file changed, 14 insertions(+), 2 deletions(-)
>
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2380,10 +2380,22 @@ static int fixup_pi_state_owner(u32 __us
> }
>
> /*
> - * Since we just failed the trylock; there must be an owner.
> + * The trylock just failed, so either there is an owner or
> + * there is a higher priority waiter than this one.
> */
> newowner = rt_mutex_owner(&pi_state->pi_mutex);
> - BUG_ON(!newowner);
> + /*
> + * If the higher priority waiter has not yet taken over the
> + * rtmutex then newowner is NULL. We can't return here with
> + * that state because it's inconsistent vs. the user space
> + * state. So drop the locks and try again. It's a valid
> + * situation and not any different from the other retry
> + * conditions.
> + */
> + if (unlikely(!newowner)) {
> + ret = -EAGAIN;
> + goto handle_err;
> + }
> } else {
> WARN_ON_ONCE(argowner != current);
> if (oldowner == current) {
The following commit has been merged into the locking/urgent branch of tip:
Commit-ID: 63c1b4db662a0967dd7839a2fbaa5300e553901d
Gitweb: https://git.kernel.org/tip/63c1b4db662a0967dd7839a2fbaa5300e553901d
Author: Mike Galbraith <[email protected]>
AuthorDate: Wed, 04 Nov 2020 16:12:44 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Fri, 06 Nov 2020 22:24:58 +01:00
futex: Handle transient "ownerless" rtmutex state correctly
Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
This is one possible chain of events leading to this:
Task Prio       Operation
T1   120        lock(F)
T2   120        lock(F)   -> blocks (top waiter)
T3   50 (RT)    lock(F)   -> boosts T1 and blocks (new top waiter)
XX              timeout/  -> wakes T2
                signal
T1   50         unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
T2   120        cleanup   -> try_to_take_rt_mutex() fails because T3 is the top waiter
                             and the lower priority T2 cannot steal the lock.
                          -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
The comment states that this is invalid and rt_mutex_real_owner() must
return a non NULL owner when the trylock failed, but in case of a queued
and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
state. The higher priority waiter has simply not yet managed to take over
the rtmutex.
The BUG_ON() is therefore wrong and this is just another retry condition in
fixup_pi_state_owner().
Drop the locks, so that T3 can make progress, and then try the fixup again.
Gratian provided a great analysis, traces and a reproducer. The analysis is
to the point, but it confused the hell out of that tglx dude who had to
page in all the futex horrors again. Condensed version is above.
[ tglx: Wrote comment and changelog ]
Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
Reported-by: Gratian Crisan <[email protected]>
Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
---
kernel/futex.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f8614ef..7406914 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2380,10 +2380,22 @@ retry:
}
/*
- * Since we just failed the trylock; there must be an owner.
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ ret = -EAGAIN;
+ goto handle_err;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {
On Fri, 2020-11-06 at 21:26 +0000, tip-bot2 for Mike Galbraith wrote:
>
> ---
> kernel/futex.c | 16 ++++++++++++++--
> 1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index f8614ef..7406914 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -2380,10 +2380,22 @@ retry:
> }
>
> /*
> - * Since we just failed the trylock; there must be an owner.
> + * The trylock just failed, so either there is an owner or
> + * there is a higher priority waiter than this one.
> */
> newowner = rt_mutex_owner(&pi_state->pi_mutex);
> - BUG_ON(!newowner);
> + /*
> + * If the higher priority waiter has not yet taken over the
> + * rtmutex then newowner is NULL. We can't return here with
> + * that state because it's inconsistent vs. the user space
> + * state. So drop the locks and try again. It's a valid
> + * situation and not any different from the other retry
> + * conditions.
> + */
> + if (unlikely(!newowner)) {
> + ret = -EAGAIN;
^^^
My box just discovered an unnoticed typo. That 'ret' should read 'err'
so we goto retry, else fbomb_v2 proggy will trigger gripeage.
[ 44.089233] fuse: init (API version 7.32)
[ 78.485163] ------------[ cut here ]------------
[ 78.485171] WARNING: CPU: 1 PID: 4557 at kernel/futex.c:2482 fixup_pi_state_owner.isra.17+0x125/0x350
[ 78.485171] ------------[ cut here ]------------
[ 78.485174] WARNING: CPU: 2 PID: 4559 at kernel/futex.c:1486 do_futex+0x920/0xaf0
<snip>
-Mike
On Sat, Nov 07 2020 at 18:05, Mike Galbraith wrote:
> On Fri, 2020-11-06 at 21:26 +0000, tip-bot2 for Mike Galbraith wrote:
>> + if (unlikely(!newowner)) {
>> + ret = -EAGAIN;
> ^^^
>
> My box just discovered an unnoticed typo. That 'ret' should read 'err'
> so we goto retry, else fbomb_v2 proggy will trigger gripeage.
Bah. I noticed and wanted to fix it before committing and then
forgot. Fixed now.
The following commit has been merged into the locking/urgent branch of tip:
Commit-ID: 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627
Gitweb: https://git.kernel.org/tip/9f5d1c336a10c0d24e83e40b4c1b9539f7dba627
Author: Mike Galbraith <[email protected]>
AuthorDate: Wed, 04 Nov 2020 16:12:44 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Sat, 07 Nov 2020 22:07:04 +01:00
futex: Handle transient "ownerless" rtmutex state correctly
Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
This is one possible chain of events leading to this:
Task Prio       Operation
T1   120        lock(F)
T2   120        lock(F)   -> blocks (top waiter)
T3   50 (RT)    lock(F)   -> boosts T1 and blocks (new top waiter)
XX              timeout/  -> wakes T2
                signal
T1   50         unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
T2   120        cleanup   -> try_to_take_rt_mutex() fails because T3 is the top waiter
                             and the lower priority T2 cannot steal the lock.
                          -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
The comment states that this is invalid and rt_mutex_real_owner() must
return a non NULL owner when the trylock failed, but in case of a queued
and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
state. The higher priority waiter has simply not yet managed to take over
the rtmutex.
The BUG_ON() is therefore wrong and this is just another retry condition in
fixup_pi_state_owner().
Drop the locks, so that T3 can make progress, and then try the fixup again.
Gratian provided a great analysis, traces and a reproducer. The analysis is
to the point, but it confused the hell out of that tglx dude who had to
page in all the futex horrors again. Condensed version is above.
[ tglx: Wrote comment and changelog ]
Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
Reported-by: Gratian Crisan <[email protected]>
Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]
---
kernel/futex.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f8614ef..ac32887 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2380,10 +2380,22 @@ retry:
}
/*
- * Since we just failed the trylock; there must be an owner.
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
*/
newowner = rt_mutex_owner(&pi_state->pi_mutex);
- BUG_ON(!newowner);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
} else {
WARN_ON_ONCE(argowner != current);
if (oldowner == current) {