Dear RT folks!
I'm pleased to announce the v5.9.1-rt19 patch set.
Changes since v5.9.1-rt18:
- Mike Galbraith reported a possible circular locking dependency with
  a seqcount. A patch from upstream that solves the issue has been
  backported (a short usage sketch of the affected initializers follows
  the Known issues section).
- David Runge reported a crash in the block layer.
- The migrate-disable series by Peter Zijlstra has been updated to v4.
- Mike Galbraith reported a possible hang due to the new printk code. As
  suggested by John Ogness, the printk patch that was causing the problem
  has been dropped.
- The rtmutex clean-up in v5.9-rc8-rt12 restructured the code path and
  removed the blk_schedule_flush_plug() invocation from the locking
  path. It turns out that it is still required and has been added back
  (see the sketch right after this list).
- A small rtmutex-related clean-up:
  - Remove rt_mutex_lock_killable(); it has no users.
  - Implement _mutex_lock_io() via _mutex_lock_io_nested() to avoid
    duplicated code.
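
To illustrate the restored blk_schedule_flush_plug() invocation, here is a
minimal sketch of the situation it handles. This is not part of the patch:
my_dev and my_submit() are made-up names, only the block-plug, bio and mutex
calls are the real in-kernel API.

    #include <linux/blkdev.h>
    #include <linux/bio.h>
    #include <linux/mutex.h>

    struct my_dev {
            struct mutex lock;
    };

    static void my_submit(struct my_dev *dev, struct bio *bio)
    {
            struct blk_plug plug;

            blk_start_plug(&plug);
            submit_bio(bio);        /* I/O is held back in the per-task plug */
            /*
             * mutex_lock() is a sleeping lock on PREEMPT_RT. The new
             * _mutex_lock_blk_flush() helper flushes the plug here, before
             * ->pi_blocked_on is set; at schedule() time it would be too
             * late if one of the flush callbacks itself needs a sleeping
             * lock (see the comment in the mutex-rt.c hunk below).
             */
            mutex_lock(&dev->lock);
            /* ... critical section ... */
            mutex_unlock(&dev->lock);
            blk_finish_plug(&plug);
    }
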
Known issues
- It has been pointed out that, due to the printk rework, the internal
  buffer representation changed. This is only an issue if tools
  like `crash' are used to extract the printk buffer from a kernel memory
  image.
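
Back to the seqcount item in the list of changes above, here is a minimal
usage sketch of the initializers that the backported seqlock.h change turns
from inline functions into macros. struct foo is made up; the spinlock and
seqcount calls are the real in-kernel API. The macro form keeps the lockdep
annotation done by seqcount_init() per initialization site instead of one
class shared by every caller of the old inline function.

    #include <linux/spinlock.h>
    #include <linux/seqlock.h>

    struct foo {
            spinlock_t          lock;
            seqcount_spinlock_t seq;    /* seqcount associated with ->lock */
    };

    static void foo_init(struct foo *f)
    {
            spin_lock_init(&f->lock);
            /* Now a macro, see the seqlock.h hunk in the delta patch below. */
            seqcount_spinlock_init(&f->seq, &f->lock);
    }
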
The delta patch against v5.9.1-rt18 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/incr/patch-5.9.1-rt18-rt19.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.9.1-rt19
The RT patch against v5.9.1 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9.1-rt19.patch.xz
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patches-5.9.1-rt19.tar.xz
Sebastian
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e37aa31332b70..99d2fb51e0e84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -647,7 +647,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
int cpu = raw_smp_processor_id();
- if (!IS_ENABLED(CONFIG_SMP) ||
+ if (!IS_ENABLED(CONFIG_SMP) || IS_ENABLED(CONFIG_PREEMPT_RT) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
return false;
diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
index 7179367bfb5e2..f0b2e07cd5c57 100644
--- a/include/linux/mutex_rt.h
+++ b/include/linux/mutex_rt.h
@@ -29,7 +29,6 @@ struct mutex {
extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
extern void __lockfunc _mutex_lock(struct mutex *lock);
-extern void __lockfunc _mutex_lock_io(struct mutex *lock);
extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
@@ -46,7 +45,7 @@ extern void __lockfunc _mutex_unlock(struct mutex *lock);
#define mutex_lock_killable(l) _mutex_lock_killable(l)
#define mutex_trylock(l) _mutex_trylock(l)
#define mutex_unlock(l) _mutex_unlock(l)
-#define mutex_lock_io(l) _mutex_lock_io(l);
+#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0);
#define __mutex_owner(l) ((l)->lock.owner)
@@ -77,7 +76,7 @@ do { \
# define mutex_lock_killable_nested(l, s) \
_mutex_lock_killable(l)
# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
-# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
+# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
#endif
# define mutex_init(mutex) \
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 5308cd7ddddf0..b02009f530263 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -112,7 +112,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock);
#endif
extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
-extern int rt_mutex_lock_killable(struct rt_mutex *lock);
extern int rt_mutex_trylock(struct rt_mutex *lock);
extern void rt_mutex_unlock(struct rt_mutex *lock);
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index f73c7eb68f27c..76e44e6c01004 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -173,6 +173,19 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
* @lock: Pointer to the associated lock
*/
+#define seqcount_LOCKNAME_init(s, _lock, lockname) \
+ do { \
+ seqcount_##lockname##_t *____s = (s); \
+ seqcount_init(&____s->seqcount); \
+ __SEQ_LOCK(____s->lock = (_lock)); \
+ } while (0)
+
+#define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock)
+#define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock)
+#define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock);
+#define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex);
+#define seqcount_ww_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, ww_mutex);
+
/*
* SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers
* seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t
@@ -190,13 +203,6 @@ typedef struct seqcount_##lockname { \
__SEQ_LOCK(locktype *lock); \
} seqcount_##lockname##_t; \
\
-static __always_inline void \
-seqcount_##lockname##_init(seqcount_##lockname##_t *s, locktype *lock) \
-{ \
- seqcount_init(&s->seqcount); \
- __SEQ_LOCK(s->lock = lock); \
-} \
- \
static __always_inline seqcount_t * \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \
{ \
@@ -284,8 +290,8 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu
__SEQ_LOCK(.lock = (assoc_lock)) \
}
-#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
+#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c
index 35b06711997dd..2b849e6b9b4ae 100644
--- a/kernel/locking/mutex-rt.c
+++ b/kernel/locking/mutex-rt.c
@@ -65,6 +65,7 @@
#include <linux/fs.h>
#include <linux/futex.h>
#include <linux/hrtimer.h>
+#include <linux/blkdev.h>
#include "rtmutex_common.h"
@@ -85,55 +86,24 @@ void __mutex_do_init(struct mutex *mutex, const char *name,
}
EXPORT_SYMBOL(__mutex_do_init);
+static int _mutex_lock_blk_flush(struct mutex *lock, int state)
+{
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+ return __rt_mutex_lock_state(&lock->lock, state);
+}
+
void __lockfunc _mutex_lock(struct mutex *lock)
{
mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(_mutex_lock);
-void __lockfunc _mutex_lock_io(struct mutex *lock)
-{
- int token;
-
- token = io_schedule_prepare();
- _mutex_lock(lock);
- io_schedule_finish(token);
-}
-EXPORT_SYMBOL_GPL(_mutex_lock_io);
-
-int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
-{
- int ret;
-
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
- return ret;
-}
-EXPORT_SYMBOL(_mutex_lock_interruptible);
-
-int __lockfunc _mutex_lock_killable(struct mutex *lock)
-{
- int ret;
-
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
- return ret;
-}
-EXPORT_SYMBOL(_mutex_lock_killable);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
-{
- mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
- __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(_mutex_lock_nested);
-
void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
{
int token;
@@ -147,10 +117,42 @@ void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
}
EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
{
mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
- __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(_mutex_lock_nest_lock);
@@ -159,7 +161,7 @@ int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass
int ret;
mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
- ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
+ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE);
if (ret)
mutex_release(&lock->dep_map, _RET_IP_);
return ret;
@@ -171,7 +173,7 @@ int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
int ret;
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
+ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE);
if (ret)
mutex_release(&lock->dep_map, _RET_IP_);
return ret;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ef22e1b52f8c8..cc0d7e99be00a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -2034,21 +2034,6 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
return __rt_mutex_slowtrylock(lock);
}
-/**
- * rt_mutex_lock_killable - lock a rt_mutex killable
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- */
-int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
-{
- return rt_mutex_lock_state(lock, 0, TASK_KILLABLE);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
-
int __sched __rt_mutex_trylock(struct rt_mutex *lock)
{
#ifdef CONFIG_PREEMPT_RT
diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
index bca7a448206d6..ab05ce0903537 100644
--- a/kernel/locking/rwsem-rt.c
+++ b/kernel/locking/rwsem-rt.c
@@ -3,6 +3,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
+#include <linux/blkdev.h>
#include "rtmutex_common.h"
@@ -87,6 +88,13 @@ static int __sched __down_read_common(struct rw_semaphore *sem, int state)
if (__down_read_trylock(sem))
return 0;
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
might_sleep();
raw_spin_lock_irq(&m->wait_lock);
/*
@@ -209,6 +217,13 @@ static int __sched __down_write_common(struct rw_semaphore *sem, int state)
struct rt_mutex *m = &sem->rtmutex;
unsigned long flags;
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
/* Take the rtmutex as a first step */
if (__rt_mutex_lock_state(m, state))
return -EINTR;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 78a277ea5c351..0c56873396a99 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3140,66 +3140,6 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
}
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
-/**
- * pr_flush() - Wait for printing threads to catch up.
- *
- * @timeout_ms: The maximum time (in ms) to wait.
- * @reset_on_progress: Reset the timeout if forward progress is seen.
- *
- * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
- * represents infinite waiting.
- *
- * If @reset_on_progress is true, the timeout will be reset whenever any
- * printer has been seen to make some forward progress.
- *
- * Context: Any context if @timeout_ms is 0. Otherwise process context and
- * may sleep if a printer is not caught up.
- * Return: true if all enabled printers are caught up.
- */
-static bool pr_flush(int timeout_ms, bool reset_on_progress)
-{
- int remaining = timeout_ms;
- struct console *con;
- u64 last_diff = 0;
- u64 printk_seq;
- u64 diff;
- u64 seq;
-
- seq = prb_next_seq(prb);
-
- for (;;) {
- diff = 0;
-
- for_each_console(con) {
- if (!(con->flags & CON_ENABLED))
- continue;
- printk_seq = atomic64_read(&con->printk_seq);
- if (printk_seq < seq)
- diff += seq - printk_seq;
- }
-
- if (diff != last_diff && reset_on_progress)
- remaining = timeout_ms;
-
- if (!diff || remaining == 0)
- break;
-
- if (remaining < 0) {
- msleep(100);
- } else if (remaining < 100) {
- msleep(remaining);
- remaining = 0;
- } else {
- msleep(100);
- remaining -= 100;
- }
-
- last_diff = diff;
- }
-
- return (diff == 0);
-}
-
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
@@ -3222,12 +3162,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
sync_mode = true;
pr_info("enabled sync mode\n");
}
-
- /*
- * Give the printing threads time to flush, allowing up to 1
- * second of no printing forward progress before giving up.
- */
- pr_flush(1000, true);
}
rcu_read_lock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d54f1e7ef867..34158d8ecf194 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1903,9 +1903,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
- struct completion *done;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1937,8 +1944,10 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
@@ -1947,7 +1956,7 @@ static int migration_cpu_stop(void *data)
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1957,6 +1966,8 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1966,23 +1977,71 @@ static int migration_cpu_stop(void *data)
if (is_migration_disabled(p))
goto out;
- if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
- p->wake_cpu = arg->dest_cpu;
-
- if (arg->done) {
+ if (pending) {
p->migration_pending = NULL;
complete = true;
}
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
+ if (task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
+ else
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
out:
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
- local_irq_enable();
+ task_rq_unlock(rq, p, &rf);
if (complete)
- complete_all(arg->done);
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
return 0;
}
@@ -2095,13 +2154,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
__do_set_cpus_allowed(p, new_mask, 0);
}
-struct set_affinity_pending {
- refcount_t refs;
- struct completion done;
- struct cpu_stop_work stop_work;
- struct migration_arg arg;
-};
-
/*
* This function is wildly self concurrent; here be dragons.
*
@@ -2173,8 +2225,8 @@ struct set_affinity_pending {
* pending affinity completion is preceded an uninstallion of
* p->migration_pending done with p->pi_lock held.
*/
-static int affine_move_task(struct rq *rq, struct rq_flags *rf,
- struct task_struct *p, int dest_cpu, unsigned int flags)
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
struct migration_arg arg = {
@@ -2240,13 +2292,18 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
if (WARN_ON_ONCE(!pending))
return -EINVAL;
- arg.done = &pending->done;
-
if (flags & SCA_MIGRATE_ENABLE) {
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
- pending->arg = arg;
+
+ pending->arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1,
+ .pending = pending,
+ };
+
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
@@ -2370,7 +2427,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
p->nr_cpus_allowed != 1);
}
- return affine_move_task(rq, &rf, p, dest_cpu, flags);
+ return affine_move_task(rq, p, &rf, dest_cpu, flags);
out:
task_rq_unlock(rq, p, &rf);
@@ -3609,6 +3666,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
}
diff --git a/localversion-rt b/localversion-rt
index 9e7cd66d9f44f..483ad771f201a 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt18
+-rt19
On Tue, Oct 27, 2020 at 11:00:49AM +0100, Sebastian Andrzej Siewior wrote:
> On 2020-10-27 10:36:16 [+0100], Daniel Wagner wrote:
> > On Sat, Oct 24, 2020 at 11:18:38AM +0200, Sebastian Andrzej Siewior wrote:
> > > I'm pleased to announce the v5.9.1-rt19 patch set.
> >
> > FWIW, all tests pass in my lab (by avoiding doing the same stupid
> > mistake as last time...)
>
> glad to hear.
Well, one thing I need to figure out is how to get pi_stress working
correctly on my machines (that is for all -rt trees, so it's nothing
new). It consistently triggers RCU stall warnings with hackbench as
workload. Clark told me the test only works if there are enough CPUs to
run on. Need to look into it.
On 2020-10-27 11:25:47 [+0100], Daniel Wagner wrote:
> Well, one thing I need to figure out is how to get pi_stress working
> correctly on my machines (that is for all -rt trees, so it's nothing
> new). It consistently triggers RCU stall warnings with hackbench as
> workload. Clark told me the test only works if there are enough CPUs to
> run on. Need to look into it.
Is it running as an RT task? Do you have RCU-boosting enabled?
Otherwise it looks like, if you throw enough non-RT load on the system,
RCU cannot catch up, which does not sound good.
Sebastian
On Tue, Oct 27, 2020 at 11:28:51AM +0100, Sebastian Andrzej Siewior wrote:
> Is it running as an RT task?
root@c2d:~/rt-tests# ./pi_stress
Starting PI Stress Test
Number of thread groups: 1
Duration of test run: infinite
Number of inversions per group: unlimited
Admin thread SCHED_FIFO priority 4
1 groups of 3 threads will be created
High thread SCHED_FIFO priority 3
Med thread SCHED_FIFO priority 2
Low thread SCHED_FIFO priority 1
It says so, let me double check if those tasks really run with SCHED_FIFO.
> Do you have RCU-boosting enabled?
Yes
#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
CONFIG_PREEMPT_RCU=y
CONFIG_RCU_EXPERT=y
CONFIG_SRCU=y
CONFIG_TREE_SRCU=y
CONFIG_TASKS_RCU_GENERIC=y
CONFIG_TASKS_RCU=y
CONFIG_TASKS_RUDE_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
CONFIG_RCU_FANOUT=64
CONFIG_RCU_FANOUT_LEAF=16
CONFIG_RCU_BOOST=y
CONFIG_RCU_BOOST_DELAY=500
CONFIG_RCU_NOCB_CPU=y
# CONFIG_TASKS_TRACE_RCU_READ_MB is not set
# end of RCU Subsystem
> Otherwise it looks like, if you throw enough non-RT load on the system,
> RCU cannot catch up, which does not sound good.
I think this is what Clark tried to tell me. If I understood him
correctly the test tool is not correct though.
On Tue, Oct 27, 2020 at 11:34:11AM +0100, Daniel Wagner wrote:
> It says so, let me double check if those tasks really run with SCHED_FIFO.
I just got an RCU stall without any background load. Anyway, just for
completeness here is the ps output:
root@c2d:~# ps -eLo pid,tid,class,rtprio,ni,pri,psr,pcpu,stat,wchan:14,comm
PID TID CLS RTPRIO NI PRI PSR %CPU STAT WCHAN COMMAND
1 1 TS - 0 19 0 0.0 Ss - systemd
2 2 TS - 0 19 0 0.0 S - kthreadd
3 3 TS - -20 39 0 0.0 I< - rcu_gp
4 4 TS - -20 39 0 0.0 I< - rcu_par_gp
6 6 TS - -20 39 0 0.0 I< - kworker/0:0H-events_highpri
8 8 TS - -20 39 0 0.0 I< - mm_percpu_wq
9 9 TS - 0 19 0 0.0 S - ksoftirqd/0
10 10 FF 1 - 41 0 0.0 S - rcuc/0
11 11 FF 1 - 41 0 0.0 I - rcu_preempt
12 12 FF 1 - 41 0 0.0 S - rcub/0
13 13 FF 99 - 139 0 0.0 S - migration/0
14 14 TS - 0 19 0 0.0 S - cpuhp/0
15 15 TS - 0 19 1 0.0 S - cpuhp/1
16 16 FF 99 - 139 1 0.0 S - migration/1
17 17 FF 1 - 41 1 0.0 R - rcuc/1
18 18 TS - 0 19 1 0.0 R - ksoftirqd/1
19 19 TS - 0 19 1 0.0 I - kworker/1:0-events_freezable_power_
20 20 TS - -20 39 1 0.0 I< - kworker/1:0H-kblockd
21 21 TS - 0 19 1 0.0 S - kdevtmpfs
22 22 TS - -20 39 0 0.0 I< - netns
23 23 TS - 0 19 1 0.0 S - rcu_tasks_kthre
24 24 TS - 0 19 1 0.0 S - rcu_tasks_rude_
25 25 TS - 0 19 0 0.0 S - kauditd
26 26 TS - 0 19 1 0.0 I - kworker/1:1-events_freezable_power_
27 27 TS - 0 19 0 0.0 I - kworker/0:1-ata_sff
28 28 TS - 0 19 1 0.0 S - oom_reaper
29 29 TS - -20 39 1 0.0 I< - writeback
30 30 TS - 0 19 0 0.0 S - kcompactd0
50 50 TS - -20 39 1 0.0 I< - kblockd
51 51 FF 50 - 90 1 0.0 S - irq/9-acpi
52 52 TS - -20 39 1 0.0 I< - ata_sff
53 53 TS - -20 39 0 0.0 I< - md
54 54 TS - -20 39 0 0.0 I< - rpciod
55 55 TS - -20 39 0 0.0 I< - kworker/u9:0-xprtiod
56 56 TS - -20 39 0 0.0 I< - xprtiod
57 57 TS - -20 39 1 0.0 I< - cfg80211
58 58 TS - 0 19 0 0.0 S - kswapd0
59 59 TS - -20 39 1 0.0 I< - nfsiod
61 61 TS - -20 39 1 0.0 I< - acpi_thermal_pm
63 63 FF 50 - 90 0 0.0 S - card0-crtc0
64 64 FF 50 - 90 0 0.0 S - card0-crtc1
65 65 FF 50 - 90 0 0.0 S - irq/16-i915
66 66 TS - -20 39 0 0.0 I< - kworker/0:1H-kblockd
67 67 FF 50 - 90 1 0.0 S - irq/14-ata_piix
68 68 FF 50 - 90 0 0.0 S - irq/15-ata_piix
69 69 TS - 0 19 1 0.0 S - scsi_eh_0
70 70 TS - -20 39 1 0.0 I< - scsi_tmf_0
71 71 TS - 0 19 1 0.0 S - scsi_eh_1
72 72 TS - -20 39 0 0.0 I< - scsi_tmf_1
78 78 FF 50 - 90 1 0.0 S - irq/23-ehci_hcd
79 79 FF 50 - 90 1 0.0 S - irq/23-uhci_hcd
80 80 FF 50 - 90 0 0.0 S - irq/19-uhci_hcd
81 81 FF 50 - 90 1 0.0 S - irq/18-uhci_hcd
82 82 FF 50 - 90 0 0.0 S - irq/16-uhci_hcd
83 83 FF 50 - 90 0 0.0 S - irq/1-i8042
84 84 FF 50 - 90 1 0.0 S - irq/8-rtc0
85 85 FF 50 - 90 0 0.0 S - irq/19-i801_smb
86 86 TS - -20 39 1 0.0 I< - ipv6_addrconf
88 88 TS - 0 19 0 0.2 S - pr/ttyS0
89 89 TS - 0 19 0 0.0 S - pr/netcon0
93 93 TS - -20 39 1 0.0 I< - kworker/1:1H-kblockd
94 94 FF 50 - 90 0 0.0 S - irq/26-eth0
95 95 TS - 0 19 1 0.0 S - scsi_eh_2
96 96 TS - -20 39 1 0.0 I< - scsi_tmf_2
97 97 TS - 0 19 0 0.0 S - usb-storage
105 105 TS - -20 39 1 0.0 I< - kworker/u9:1-xprtiod
106 106 TS - 0 19 0 0.0 S - NFSv4 callback
114 114 TS - 0 19 0 0.0 I - kworker/u8:8-events_unbound
158 158 TS - 0 19 0 0.0 Ss - systemd-journal
167 167 TS - 0 19 0 0.0 Ss - systemd-udevd
175 175 TS - 0 19 0 0.0 Ss - systemd-network
195 195 TS - 0 19 0 0.0 Ssl - systemd-timesyn
195 200 TS - 0 19 0 0.0 Ssl - sd-resolve
224 224 TS - 0 19 0 0.0 Ss - sshd
226 226 TS - 0 19 1 0.0 Ss+ - agetty
227 227 TS - 0 19 0 0.0 Ss+ - agetty
228 228 TS - 0 19 1 0.0 Ss+ - agetty
232 232 TS - 0 19 0 0.0 Ss+ - agetty
233 233 TS - 0 19 1 0.0 Ss+ - agetty
234 234 TS - 0 19 0 0.0 Ss+ - agetty
236 236 TS - 0 19 1 0.0 Ss+ - agetty
241 241 FF 50 - 90 1 0.0 S - irq/4-ttyS0
263 263 TS - 0 19 0 0.0 I - kworker/u8:47-rpciod
266 266 TS - 0 19 1 0.0 R - kworker/1:2+events_freezable_power_
267 267 TS - 0 19 0 0.0 I - kworker/0:0-events_power_efficient
269 269 TS - 0 19 0 0.0 Ss - sshd
275 275 TS - 0 19 0 0.0 Ss - bash
282 282 FF 4 - 44 0 0.0 Sl+ - pi_stress
282 283 FF 1 - 41 1 42.0 Rl+ - pi_stress
282 284 FF 2 - 42 1 22.1 Sl+ - pi_stress
282 285 FF 3 - 43 1 35.7 Rl+ - pi_stress
297 297 TS - 0 19 1 0.0 R - kworker/1:3
303 303 TS - 0 19 0 0.0 I - kworker/0:2-ata_sff
308 308 TS - 0 19 1 0.0 R - kworker/1:4
312 312 TS - 0 19 0 0.0 I - kworker/0:3-ata_sff
316 316 TS - 0 19 0 0.0 I - kworker/u8:0-rpciod
317 317 TS - 0 19 0 0.0 I - kworker/u8:1-rpciod
320 320 TS - 0 19 0 1.0 Ss - sshd
326 326 TS - 0 19 0 0.0 Ss - bash
329 329 TS - 0 19 0 0.0 R+ - ps
On 2020-10-27 11:34:11 [+0100], Daniel Wagner wrote:
> On Tue, Oct 27, 2020 at 11:28:51AM +0100, Sebastian Andrzej Siewior wrote:
> > Is it running as an RT task?
>
> root@c2d:~/rt-tests# ./pi_stress
> Starting PI Stress Test
> Number of thread groups: 1
> Duration of test run: infinite
> Number of inversions per group: unlimited
> Admin thread SCHED_FIFO priority 4
> 1 groups of 3 threads will be created
> High thread SCHED_FIFO priority 3
> Med thread SCHED_FIFO priority 2
> Low thread SCHED_FIFO priority 1
>
> It says so, let me double check if those tasks really run with SCHED_FIFO.
urgh. You wrote pi_stress and I read stress-ng. Okay this explains it.
> > Otherwise it looks like, if you throw enough non-RT load on the system,
> > RCU cannot catch up, which does not sound good.
>
> I think this is what Clark tried to tell me. If I understood him
> correctly the test tool is not correct though.
a dummy RCU section might help. But otherwise it is correct :/
Sebastian