2011-04-05 15:31:41

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 13/21] sched: Add p->pi_lock to task_rq_lock()

In order to be able to call set_task_cpu() while either holding
p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
stabilize task_rq().

This makes task_rq_lock() acquire both locks, and have
__task_rq_lock() validate that p->pi_lock is held. This increases the
locking overhead for most scheduler syscalls but allows reduction of
rq->lock contention for some scheduler hot paths (ttwu).

Signed-off-by: Peter Zijlstra <[email protected]>
Reviewed-by: Frank Rowand <[email protected]>
---
kernel/sched.c | 103 ++++++++++++++++++++++++++-------------------------------
1 file changed, 47 insertions(+), 56 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -600,7 +600,7 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
@@ -610,7 +610,7 @@ static inline struct task_group *task_gr
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
+ lockdep_is_held(&p->pi_lock));
tg = container_of(css, struct task_group, css);

return autogroup_task_group(p, tg);
@@ -926,23 +926,15 @@ static inline void finish_lock_switch(st
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
- return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;

+ lockdep_assert_held(&p->pi_lock);
+
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -953,22 +945,22 @@ static inline struct rq *__task_rq_lock(
}

/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;

for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}

@@ -978,10 +970,13 @@ static void __task_rq_unlock(struct rq *
raw_spin_unlock(&rq->lock);
}

-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
@@ -2178,6 +2173,11 @@ void set_task_cpu(struct task_struct *p,
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
#endif

trace_sched_migrate_task(p, new_cpu);
@@ -2273,7 +2273,7 @@ unsigned long wait_task_inactive(struct
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

/*
* If it changed from the expected state, bail out now.
@@ -2639,6 +2639,7 @@ static void __sched_fork(struct task_str
*/
void sched_fork(struct task_struct *p, int clone_flags)
{
+ unsigned long flags;
int cpu = get_cpu();

__sched_fork(p);
@@ -2689,9 +2690,9 @@ void sched_fork(struct task_struct *p, i
*
* Silence PROVE_RCU.
*/
- rcu_read_lock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu);
- rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -2740,7 +2741,7 @@ void wake_up_new_task(struct task_struct
set_task_cpu(p, cpu);

p->state = TASK_RUNNING;
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
#endif

rq = task_rq_lock(p, &flags);
@@ -2751,7 +2752,7 @@ void wake_up_new_task(struct task_struct
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
put_cpu();
}

@@ -3476,12 +3477,12 @@ void sched_exec(void)
likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };

- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}

#endif
@@ -3518,7 +3519,7 @@ unsigned long long task_delta_exec(struc

rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -3536,7 +3537,7 @@ unsigned long long task_sched_runtime(st

rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -3560,7 +3561,7 @@ unsigned long long thread_group_sched_ru
rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -4675,16 +4676,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
const struct sched_class *prev_class;

BUG_ON(prio < 0 || prio > MAX_PRIO);

- lockdep_assert_held(&p->pi_lock);
-
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);

trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
@@ -4709,7 +4707,7 @@ void rt_mutex_setprio(struct task_struct
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

check_class_changed(rq, p, prev_class, oldprio);
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
}

#endif
@@ -4757,7 +4755,7 @@ void set_user_nice(struct task_struct *p
resched_task(rq->curr);
}
out_unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);

@@ -4979,20 +4977,17 @@ static int __sched_setscheduler(struct t
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the apropriate
* runqueue lock must be held.
*/
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);

/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EINVAL;
}

@@ -5005,8 +5000,7 @@ static int __sched_setscheduler(struct t
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EPERM;
}
}
@@ -5015,8 +5009,7 @@ static int __sched_setscheduler(struct t
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
on_rq = p->on_rq;
@@ -5038,8 +5031,7 @@ static int __sched_setscheduler(struct t
activate_task(rq, p, 0);

check_class_changed(rq, p, prev_class, oldprio);
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);

rt_mutex_adjust_pi(p);

@@ -5620,7 +5612,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, p

rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -5843,8 +5835,7 @@ int set_cpus_allowed_ptr(struct task_str
unsigned int dest_cpu;
int ret = 0;

- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);

if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
@@ -5872,15 +5863,13 @@ int set_cpus_allowed_ptr(struct task_str
if (need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);

return ret;
}
@@ -5908,6 +5897,7 @@ static int __migrate_task(struct task_st
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

+ raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
@@ -5930,6 +5920,7 @@ static int __migrate_task(struct task_st
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&p->pi_lock);
return ret;
}

@@ -8656,7 +8647,7 @@ void sched_move_task(struct task_struct
if (on_rq)
enqueue_task(rq, tsk, 0);

- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, tsk, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */



2011-04-14 08:37:18

by Peter Zijlstra

[permalink] [raw]
Subject: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

Commit-ID: 0122ec5b02f766c355b3168df53a6c038a24fa0d
Gitweb: http://git.kernel.org/tip/0122ec5b02f766c355b3168df53a6c038a24fa0d
Author: Peter Zijlstra <[email protected]>
AuthorDate: Tue, 5 Apr 2011 17:23:51 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 14 Apr 2011 08:52:38 +0200

sched: Add p->pi_lock to task_rq_lock()

In order to be able to call set_task_cpu() while either holding
p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
stabilize task_rq().

This makes task_rq_lock() acquire both locks, and have
__task_rq_lock() validate that p->pi_lock is held. This increases the
locking overhead for most scheduler syscalls but allows reduction of
rq->lock contention for some scheduler hot paths (ttwu).

Reviewed-by: Frank Rowand <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 103 +++++++++++++++++++++++++------------------------------
1 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 6b269b7..f155127 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -599,7 +599,7 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
@@ -609,7 +609,7 @@ static inline struct task_group *task_group(struct task_struct *p)
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
+ lockdep_is_held(&p->pi_lock));
tg = container_of(css, struct task_group, css);

return autogroup_task_group(p, tg);
@@ -924,23 +924,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
- return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;

+ lockdep_assert_held(&p->pi_lock);
+
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -951,22 +943,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
}

/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;

for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}

@@ -976,10 +968,13 @@ static void __task_rq_unlock(struct rq *rq)
raw_spin_unlock(&rq->lock);
}

-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
@@ -2175,6 +2170,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
#endif

trace_sched_migrate_task(p, new_cpu);
@@ -2270,7 +2270,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

/*
* If it changed from the expected state, bail out now.
@@ -2652,6 +2652,7 @@ static void __sched_fork(struct task_struct *p)
*/
void sched_fork(struct task_struct *p, int clone_flags)
{
+ unsigned long flags;
int cpu = get_cpu();

__sched_fork(p);
@@ -2702,9 +2703,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
*
* Silence PROVE_RCU.
*/
- rcu_read_lock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu);
- rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -2753,7 +2754,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
set_task_cpu(p, cpu);

p->state = TASK_RUNNING;
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
#endif

rq = task_rq_lock(p, &flags);
@@ -2765,7 +2766,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
put_cpu();
}

@@ -3490,12 +3491,12 @@ void sched_exec(void)
likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };

- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}

#endif
@@ -3532,7 +3533,7 @@ unsigned long long task_delta_exec(struct task_struct *p)

rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -3550,7 +3551,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)

rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -3574,7 +3575,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

return ns;
}
@@ -4693,16 +4694,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
const struct sched_class *prev_class;

BUG_ON(prio < 0 || prio > MAX_PRIO);

- lockdep_assert_held(&p->pi_lock);
-
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);

trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
@@ -4727,7 +4725,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

check_class_changed(rq, p, prev_class, oldprio);
- task_rq_unlock(rq, &flags);
+ __task_rq_unlock(rq);
}

#endif
@@ -4775,7 +4773,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(rq->curr);
}
out_unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);

@@ -5003,20 +5001,17 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);

/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EINVAL;
}

@@ -5040,8 +5035,7 @@ recheck:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EPERM;
}
}
@@ -5050,8 +5044,7 @@ recheck:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
on_rq = p->on_rq;
@@ -5073,8 +5066,7 @@ recheck:
activate_task(rq, p, 0);

check_class_changed(rq, p, prev_class, oldprio);
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);

rt_mutex_adjust_pi(p);

@@ -5666,7 +5658,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);

rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -5889,8 +5881,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
unsigned int dest_cpu;
int ret = 0;

- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);

if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
@@ -5918,15 +5909,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (need_migrate_task(p)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);

return ret;
}
@@ -5954,6 +5943,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

+ raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
@@ -5976,6 +5966,7 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&p->pi_lock);
return ret;
}

@@ -8702,7 +8693,7 @@ void sched_move_task(struct task_struct *tsk)
if (on_rq)
enqueue_task(rq, tsk, 0);

- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, tsk, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */

2011-06-01 13:59:26

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

Hi,

git bisect blames this commit for a problem I have with v3.0-rc1:
If I printk large amounts of data, the machine locks up.
As the commit does not revert cleanly on top of 3.0, I haven't been
able to double check.
The test I use is simple, just add something like

for (i=0; i < 10000; ++i) printk("test %d\n", i);

and trigger it, in most cases I can see the first 10 printks before
I have to power cycle the machine (sysrq-b does not work anymore).
Attached my .config.

-Arne



On 14.04.2011 10:36, tip-bot for Peter Zijlstra wrote:
> Commit-ID: 0122ec5b02f766c355b3168df53a6c038a24fa0d
> Gitweb: http://git.kernel.org/tip/0122ec5b02f766c355b3168df53a6c038a24fa0d
> Author: Peter Zijlstra <[email protected]>
> AuthorDate: Tue, 5 Apr 2011 17:23:51 +0200
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Thu, 14 Apr 2011 08:52:38 +0200
>
> sched: Add p->pi_lock to task_rq_lock()
>
> In order to be able to call set_task_cpu() while either holding
> p->pi_lock or task_rq(p)->lock we need to hold both locks in order to
> stabilize task_rq().
>
> This makes task_rq_lock() acquire both locks, and have
> __task_rq_lock() validate that p->pi_lock is held. This increases the
> locking overhead for most scheduler syscalls but allows reduction of
> rq->lock contention for some scheduler hot paths (ttwu).
>
> Reviewed-by: Frank Rowand <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Cc: Mike Galbraith <[email protected]>
> Cc: Nick Piggin <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Link: http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> kernel/sched.c | 103 +++++++++++++++++++++++++------------------------------
> 1 files changed, 47 insertions(+), 56 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 6b269b7..f155127 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -599,7 +599,7 @@ static inline int cpu_of(struct rq *rq)
> * Return the group to which this tasks belongs.
> *
> * We use task_subsys_state_check() and extend the RCU verification
> - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
> + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
> * holds that lock for each task it moves into the cgroup. Therefore
> * by holding that lock, we pin the task to the current cgroup.
> */
> @@ -609,7 +609,7 @@ static inline struct task_group *task_group(struct task_struct *p)
> struct cgroup_subsys_state *css;
>
> css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
> - lockdep_is_held(&task_rq(p)->lock));
> + lockdep_is_held(&p->pi_lock));
> tg = container_of(css, struct task_group, css);
>
> return autogroup_task_group(p, tg);
> @@ -924,23 +924,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
> #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
>
> /*
> - * Check whether the task is waking, we use this to synchronize ->cpus_allowed
> - * against ttwu().
> - */
> -static inline int task_is_waking(struct task_struct *p)
> -{
> - return unlikely(p->state == TASK_WAKING);
> -}
> -
> -/*
> - * __task_rq_lock - lock the runqueue a given task resides on.
> - * Must be called interrupts disabled.
> + * __task_rq_lock - lock the rq @p resides on.
> */
> static inline struct rq *__task_rq_lock(struct task_struct *p)
> __acquires(rq->lock)
> {
> struct rq *rq;
>
> + lockdep_assert_held(&p->pi_lock);
> +
> for (;;) {
> rq = task_rq(p);
> raw_spin_lock(&rq->lock);
> @@ -951,22 +943,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
> }
>
> /*
> - * task_rq_lock - lock the runqueue a given task resides on and disable
> - * interrupts. Note the ordering: we can safely lookup the task_rq without
> - * explicitly disabling preemption.
> + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
> */
> static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
> + __acquires(p->pi_lock)
> __acquires(rq->lock)
> {
> struct rq *rq;
>
> for (;;) {
> - local_irq_save(*flags);
> + raw_spin_lock_irqsave(&p->pi_lock, *flags);
> rq = task_rq(p);
> raw_spin_lock(&rq->lock);
> if (likely(rq == task_rq(p)))
> return rq;
> - raw_spin_unlock_irqrestore(&rq->lock, *flags);
> + raw_spin_unlock(&rq->lock);
> + raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
> }
> }
>
> @@ -976,10 +968,13 @@ static void __task_rq_unlock(struct rq *rq)
> raw_spin_unlock(&rq->lock);
> }
>
> -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
> +static inline void
> +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
> __releases(rq->lock)
> + __releases(p->pi_lock)
> {
> - raw_spin_unlock_irqrestore(&rq->lock, *flags);
> + raw_spin_unlock(&rq->lock);
> + raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
> }
>
> /*
> @@ -2175,6 +2170,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
> */
> WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
> !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
> +
> +#ifdef CONFIG_LOCKDEP
> + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
> + lockdep_is_held(&task_rq(p)->lock)));
> +#endif
> #endif
>
> trace_sched_migrate_task(p, new_cpu);
> @@ -2270,7 +2270,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
> ncsw = 0;
> if (!match_state || p->state == match_state)
> ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
>
> /*
> * If it changed from the expected state, bail out now.
> @@ -2652,6 +2652,7 @@ static void __sched_fork(struct task_struct *p)
> */
> void sched_fork(struct task_struct *p, int clone_flags)
> {
> + unsigned long flags;
> int cpu = get_cpu();
>
> __sched_fork(p);
> @@ -2702,9 +2703,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
> *
> * Silence PROVE_RCU.
> */
> - rcu_read_lock();
> + raw_spin_lock_irqsave(&p->pi_lock, flags);
> set_task_cpu(p, cpu);
> - rcu_read_unlock();
> + raw_spin_unlock_irqrestore(&p->pi_lock, flags);
>
> #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
> if (likely(sched_info_on()))
> @@ -2753,7 +2754,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
> set_task_cpu(p, cpu);
>
> p->state = TASK_RUNNING;
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
> #endif
>
> rq = task_rq_lock(p, &flags);
> @@ -2765,7 +2766,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
> if (p->sched_class->task_woken)
> p->sched_class->task_woken(rq, p);
> #endif
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
> put_cpu();
> }
>
> @@ -3490,12 +3491,12 @@ void sched_exec(void)
> likely(cpu_active(dest_cpu)) && need_migrate_task(p)) {
> struct migration_arg arg = { p, dest_cpu };
>
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
> stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
> return;
> }
> unlock:
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
> }
>
> #endif
> @@ -3532,7 +3533,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
>
> rq = task_rq_lock(p, &flags);
> ns = do_task_delta_exec(p, rq);
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
>
> return ns;
> }
> @@ -3550,7 +3551,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
>
> rq = task_rq_lock(p, &flags);
> ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
>
> return ns;
> }
> @@ -3574,7 +3575,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
> rq = task_rq_lock(p, &flags);
> thread_group_cputime(p, &totals);
> ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
>
> return ns;
> }
> @@ -4693,16 +4694,13 @@ EXPORT_SYMBOL(sleep_on_timeout);
> */
> void rt_mutex_setprio(struct task_struct *p, int prio)
> {
> - unsigned long flags;
> int oldprio, on_rq, running;
> struct rq *rq;
> const struct sched_class *prev_class;
>
> BUG_ON(prio < 0 || prio > MAX_PRIO);
>
> - lockdep_assert_held(&p->pi_lock);
> -
> - rq = task_rq_lock(p, &flags);
> + rq = __task_rq_lock(p);
>
> trace_sched_pi_setprio(p, prio);
> oldprio = p->prio;
> @@ -4727,7 +4725,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
> enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
>
> check_class_changed(rq, p, prev_class, oldprio);
> - task_rq_unlock(rq, &flags);
> + __task_rq_unlock(rq);
> }
>
> #endif
> @@ -4775,7 +4773,7 @@ void set_user_nice(struct task_struct *p, long nice)
> resched_task(rq->curr);
> }
> out_unlock:
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
> }
> EXPORT_SYMBOL(set_user_nice);
>
> @@ -5003,20 +5001,17 @@ recheck:
> /*
> * make sure no PI-waiters arrive (or leave) while we are
> * changing the priority of the task:
> - */
> - raw_spin_lock_irqsave(&p->pi_lock, flags);
> - /*
> + *
> * To be able to change p->policy safely, the appropriate
> * runqueue lock must be held.
> */
> - rq = __task_rq_lock(p);
> + rq = task_rq_lock(p, &flags);
>
> /*
> * Changing the policy of the stop threads its a very bad idea
> */
> if (p == rq->stop) {
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
> return -EINVAL;
> }
>
> @@ -5040,8 +5035,7 @@ recheck:
> if (rt_bandwidth_enabled() && rt_policy(policy) &&
> task_group(p)->rt_bandwidth.rt_runtime == 0 &&
> !task_group_is_autogroup(task_group(p))) {
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
> return -EPERM;
> }
> }
> @@ -5050,8 +5044,7 @@ recheck:
> /* recheck policy now with rq lock held */
> if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
> policy = oldpolicy = -1;
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
> goto recheck;
> }
> on_rq = p->on_rq;
> @@ -5073,8 +5066,7 @@ recheck:
> activate_task(rq, p, 0);
>
> check_class_changed(rq, p, prev_class, oldprio);
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
>
> rt_mutex_adjust_pi(p);
>
> @@ -5666,7 +5658,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
>
> rq = task_rq_lock(p, &flags);
> time_slice = p->sched_class->get_rr_interval(rq, p);
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, p, &flags);
>
> rcu_read_unlock();
> jiffies_to_timespec(time_slice, &t);
> @@ -5889,8 +5881,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> unsigned int dest_cpu;
> int ret = 0;
>
> - raw_spin_lock_irqsave(&p->pi_lock, flags);
> - rq = __task_rq_lock(p);
> + rq = task_rq_lock(p, &flags);
>
> if (!cpumask_intersects(new_mask, cpu_active_mask)) {
> ret = -EINVAL;
> @@ -5918,15 +5909,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> if (need_migrate_task(p)) {
> struct migration_arg arg = { p, dest_cpu };
> /* Need help from migration thread: drop lock and wait. */
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
> stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
> tlb_migrate_finish(p->mm);
> return 0;
> }
> out:
> - __task_rq_unlock(rq);
> - raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> + task_rq_unlock(rq, p, &flags);
>
> return ret;
> }
> @@ -5954,6 +5943,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
> rq_src = cpu_rq(src_cpu);
> rq_dest = cpu_rq(dest_cpu);
>
> + raw_spin_lock(&p->pi_lock);
> double_rq_lock(rq_src, rq_dest);
> /* Already moved. */
> if (task_cpu(p) != src_cpu)
> @@ -5976,6 +5966,7 @@ done:
> ret = 1;
> fail:
> double_rq_unlock(rq_src, rq_dest);
> + raw_spin_unlock(&p->pi_lock);
> return ret;
> }
>
> @@ -8702,7 +8693,7 @@ void sched_move_task(struct task_struct *tsk)
> if (on_rq)
> enqueue_task(rq, tsk, 0);
>
> - task_rq_unlock(rq, &flags);
> + task_rq_unlock(rq, tsk, &flags);
> }
> #endif /* CONFIG_CGROUP_SCHED */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/


Attachments:
config (77.74 kB)

2011-06-01 16:31:56

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Wed, 2011-06-01 at 15:58 +0200, Arne Jansen wrote:
> git bisect blames this commit for a problem I have with v3.0-rc1:
> If I printk large amounts of data, the machine locks up.
> As the commit does not revert cleanly on top of 3.0, I haven't been
> able to double check.
> The test I use is simple, just add something like
>
> for (i=0; i < 10000; ++i) printk("test %d\n", i);
>
> and trigger it, in most cases I can see the first 10 printks before
> I have to power cycle the machine (sysrq-b does not work anymore).
> Attached my .config.

I've made me a module that does the above, I've also changed my .config
to match yours (smp=y, sched-cgroup=y, autogroup=n, preempt=n, no_hz=y),
but sadly I cannot reproduce, I get all 10k prints on my serial line.

Even without serial line it works (somehow booting without visible
console is scary as hell :)

Which makes me ask, how are you observing your console?

Because those 10k lines aren't even near the amount of crap a regular
boot spews out on this box, although I guess the tight loop might
generate it slightly faster than a regular boot does.

2011-06-01 17:21:28

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 01.06.2011 18:35, Peter Zijlstra wrote:
> On Wed, 2011-06-01 at 15:58 +0200, Arne Jansen wrote:
>> git bisect blames this commit for a problem I have with v3.0-rc1:
>> If I printk large amounts of data, the machine locks up.
>> As the commit does not revert cleanly on top of 3.0, I haven't been
>> able to double check.
>> The test I use is simple, just add something like
>>
>> for (i=0; i< 10000; ++i) printk("test %d\n", i);
>>
>> and trigger it, in most cases I can see the first 10 printks before
>> I have to power cycle the machine (sysrq-b does not work anymore).
>> Attached my .config.
>
> I've made me a module that does the above, I've also changed my .config
> to match yours (smp=y, sched-cgroup=y, autogroup=n, preempt=n, no_hz=y),
> but sadly I cannot reproduce, I get all 10k prints on my serial line.
>
> Even without serial line it works (somehow booting without visible
> console is scary as hell :)
>
> Which makes me ask, how are you observing your console?
>

They don't go out to the serial line, I only observe them with a
tail -f on messages. Default log level doesn't go the console here.

> Because those 10k lines aren't even near the amount of crap a regular
> boot spews out on this box, although I guess the tight loop might
> generate it slightly faster than a regular boot does.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-01 18:05:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Wed, 2011-06-01 at 19:20 +0200, Arne Jansen wrote:
> On 01.06.2011 18:35, Peter Zijlstra wrote:
> > On Wed, 2011-06-01 at 15:58 +0200, Arne Jansen wrote:
> >> git bisect blames this commit for a problem I have with v3.0-rc1:
> >> If I printk large amounts of data, the machine locks up.
> >> As the commit does not revert cleanly on top of 3.0, I haven't been
> >> able to double check.
> >> The test I use is simple, just add something like
> >>
> >> for (i=0; i< 10000; ++i) printk("test %d\n", i);
> >>
> >> and trigger it, in most cases I can see the first 10 printks before
> >> I have to power cycle the machine (sysrq-b does not work anymore).
> >> Attached my .config.
> >
> > I've made me a module that does the above, I've also changed my .config
> > to match yours (smp=y, sched-cgroup=y, autogroup=n, preempt=n, no_hz=y),
> > but sadly I cannot reproduce, I get all 10k prints on my serial line.
> >
> > Even without serial line it works (somehow booting without visible
> > console is scary as hell :)
> >
> > Which makes me ask, how are you observing your console?
> >
>
> They don't go out to the serial line, I only observe them with a
> tail -f on messages. Default log level doesn't go the console here.

Right ok, so I used your exact .config, added a few drivers needed for
my hardware and indeed, it doesn't even finish booting and gets stuck
someplace.

Sadly it looks like even the NMI watchdog is dead,.. /me goes try and
make sense of this.

2011-06-01 18:41:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Wed, 2011-06-01 at 20:09 +0200, Peter Zijlstra wrote:
> On Wed, 2011-06-01 at 19:20 +0200, Arne Jansen wrote:
> > On 01.06.2011 18:35, Peter Zijlstra wrote:
> > > On Wed, 2011-06-01 at 15:58 +0200, Arne Jansen wrote:
> > >> git bisect blames this commit for a problem I have with v3.0-rc1:
> > >> If I printk large amounts of data, the machine locks up.
> > >> As the commit does not revert cleanly on top of 3.0, I haven't been
> > >> able to double check.
> > >> The test I use is simple, just add something like
> > >>
> > >> for (i=0; i< 10000; ++i) printk("test %d\n", i);
> > >>
> > >> and trigger it, in most cases I can see the first 10 printks before
> > >> I have to power cycle the machine (sysrq-b does not work anymore).
> > >> Attached my .config.
> > >
> > > I've made me a module that does the above, I've also changed my .config
> > > to match yours (smp=y, sched-cgroup=y, autogroup=n, preempt=n, no_hz=y),
> > > but sadly I cannot reproduce, I get all 10k prints on my serial line.
> > >
> > > Even without serial line it works (somehow booting without visible
> > > console is scary as hell :)
> > >
> > > Which makes me ask, how are you observing your console?
> > >
> >
> > They don't go out to the serial line, I only observe them with a
> > tail -f on messages. Default log level doesn't go the console here.
>
> Right ok, so I used your exact .config, added a few drivers needed for
> my hardware and indeed, it doesn't even finish booting and gets stuck
> someplace.
>
> Sadly it looks like even the NMI watchdog is dead,.. /me goes try and
> make sense of this.

Sadly both 0122ec5b02f766c355b3168df53a6c038a24fa0d^1 and
0122ec5b02f766c355b3168df53a6c038a24fa0d itself boot just fine and run
the test module without problems.

I will have to re-bisect this.

2011-06-01 19:31:43

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 01.06.2011 20:44, Peter Zijlstra wrote:
> On Wed, 2011-06-01 at 20:09 +0200, Peter Zijlstra wrote:
>> On Wed, 2011-06-01 at 19:20 +0200, Arne Jansen wrote:
>>> On 01.06.2011 18:35, Peter Zijlstra wrote:
>>>> On Wed, 2011-06-01 at 15:58 +0200, Arne Jansen wrote:
>>>>> git bisect blames this commit for a problem I have with v3.0-rc1:
>>>>> If I printk large amounts of data, the machine locks up.
>>>>> As the commit does not revert cleanly on top of 3.0, I haven't been
>>>>> able to double check.
>>>>> The test I use is simple, just add something like
>>>>>
>>>>> for (i=0; i< 10000; ++i) printk("test %d\n", i);
>>>>>
>>>>> and trigger it, in most cases I can see the first 10 printks before
>>>>> I have to power cycle the machine (sysrq-b does not work anymore).
>>>>> Attached my .config.
>>>>
>>>> I've made me a module that does the above, I've also changed my .config
>>>> to match yours (smp=y, sched-cgroup=y, autogroup=n, preempt=n, no_hz=y),
>>>> but sadly I cannot reproduce, I get all 10k prints on my serial line.
>>>>
>>>> Even without serial line it works (somehow booting without visible
>>>> console is scary as hell :)
>>>>
>>>> Which makes me ask, how are you observing your console?
>>>>
>>>
>>> They don't go out to the serial line, I only observe them with a
>>> tail -f on messages. Default log level doesn't go the console here.
>>
>> Right ok, so I used your exact .config, added a few drivers needed for
>> my hardware and indeed, it doesn't even finish booting and gets stuck
>> someplace.
>>
>> Sadly it looks like even the NMI watchdog is dead,.. /me goes try and
>> make sense of this.
>
> Sadly both 0122ec5b02f766c355b3168df53a6c038a24fa0d^1 and
> 0122ec5b02f766c355b3168df53a6c038a24fa0d itself boot just fine and run
> the test module without problems.

I can only partially confirm this:

2acca55ed98ad9b9aa25e7e587ebe306c0313dc7 runs fine
0122ec5b02f766c355b3168df53a6c038a24fa0d freezes after line 189
ab2515c4b98f7bc4fa11cad9fa0f811d63a72a26 freezes after line 39

>
> I will have to re-bisect this.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-01 21:09:33

by Linus Torvalds

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

Boot-time hang - maybe due to the mis-merge that re-introduced the
infinite media change signals for ide-cd?

I just pushed out a fix, it may not have mirrored out yet.

I dunno. Worth checking out before spending a lot of time bisecting.

Linus

2011-06-03 09:16:00

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Thu, 2011-06-02 at 06:09 +0900, Linus Torvalds wrote:
> Boot-time hang - maybe due to the mis-merge that re-introduced the
> infinite media change signals for ide-cd?
>
> I just pushed out a fix, it may not have mirrored out yet.
>
> I dunno. Worth checking out before spending a lot of time bisecting.

Right, so that wasn't it. I haven't done a full bisect yet because I
noticed it died on a usb suspend line every single time and that machine
only had a single usb device, a memory stick, in it. So I simply pulled
the stick and voila it booted. So something is screwy with usb suspend
or something.

This of course means that I'm now completely unable to reproduce the
issue at hand :/

Maybe if I try another box..

Anyway, Arne, how long did you wait before power cycling the box? The
NMI watchdog should trigger in about a minute or so if it will trigger
at all (its enabled in your config).

2011-06-03 10:03:05

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 03.06.2011 11:15, Peter Zijlstra wrote:
> On Thu, 2011-06-02 at 06:09 +0900, Linus Torvalds wrote:
>> Boot-time hang - maybe due to the mis-merge that re-introduced the
>> infinite media change signals for ide-cd?
>>
>> I just pushed out a fix, it may not have mirrored out yet.
>>
>> I dunno. Worth checking out before spending a lot of time bisecting.
>
> Right, so that wasn't it. I haven't done a full bisect yet because I
> noticed it died on a usb suspend line every single time and that machine
> only had a single usb device, a memory stick, in it. So I simply pulled
> the stick and voila it booted. So something is screwy with usb suspend
> or something.
>
> This of course means that I'm now completely unable to reproduce the
> issue at hand :/
>
> Maybe if I try another box..
>
> Anyway, Arne, how long did you wait before power cycling the box? The
> NMI watchdog should trigger in about a minute or so if it will trigger
> at all (its enabled in your config).

No, it doesn't trigger, but the hang is not as complete as I first
thought. A running iostat via ssh continues to give output for a while,
the serial console still reacts to return and prompts for login. But
after a while more and more locks up. The console locks as soon as I
sysrq-t.
Maybe it has also something to do with the place where I added the
printks (btrfs_scan_one_device). Also the 10k-print gets triggered
several times (though I only see 10 lines of output). Maybe you can
send me your test-module and I'll try that, so we have more equal
conditions.
What also might help: the maschine I'm testing with is a quad-core
X3450 with 8GB RAM.

2011-06-03 10:31:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
> On 03.06.2011 11:15, Peter Zijlstra wrote:

> > Anyway, Arne, how long did you wait before power cycling the box? The
> > NMI watchdog should trigger in about a minute or so if it will trigger
> > at all (its enabled in your config).
>
> No, it doesn't trigger,

Bummer.

> but the hang is not as complete as I first
> thought. A running iostat via ssh continues to give output for a while,
> the serial console still reacts to return and prompts for login. But
> after a while more and more locks up. The console locks as soon as I
> sysrq-t.

OK, that seems to suggest one CPU is stuck, and once you try something
that touches the CPU everything grinds to a halt. Does something like
sysrq-l work? That would send NMIs to the other CPUs.

Anyway, good to know using serial doesn't make it go away, that means
its not too timing sensitive.

> Maybe it has also something to do with the place where I added the
> printks (btrfs_scan_one_device).

printk() should work pretty much anywhere these days, and filesystem
code in particular shouldn't be ran from any weird and wonderful
contexts afaik.

> Also the 10k-print gets triggered
> several times (though I only see 10 lines of output). Maybe you can
> send me your test-module and I'll try that, so we have more equal
> conditions.

Sure, see below.

> What also might help: the maschine I'm testing with is a quad-core
> X3450 with 8GB RAM.

/me & wikipedia, that's a nehalem box, ok I'm testing on a westmere
(don't have a nehalem).

---
kernel/Makefile | 1 +
kernel/test.c | 23 +++++++++++++++++++++++
2 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfc..65eff6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-m += test.o
obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
diff --git a/kernel/test.c b/kernel/test.c
index e69de29..8005395 100644
--- a/kernel/test.c
+++ b/kernel/test.c
@@ -0,0 +1,23 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+MODULE_LICENSE("GPL");
+
+static void
+test_cleanup(void)
+{
+}
+
+static int __init
+test_init(void)
+{
+ int i;
+
+ for (i = 0; i < 10000; i++)
+ printk("test %d\n", i);
+
+ return 0;
+}
+
+module_init(test_init);
+module_exit(test_cleanup);

2011-06-03 11:52:21

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 03.06.2011 12:30, Peter Zijlstra wrote:
> On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
>> On 03.06.2011 11:15, Peter Zijlstra wrote:
>
> Bummer.
>
>> but the hang is not as complete as I first
>> thought. A running iostat via ssh continues to give output for a while,
>> the serial console still reacts to return and prompts for login. But
>> after a while more and more locks up. The console locks as soon as I
>> sysrq-t.
>
> OK, that seems to suggest one CPU is stuck, and once you try something
> that touches the CPU everything grinds to a halt. Does something like
> sysrq-l work? That would send NMIs to the other CPUs.
>
> Anyway, good to know using serial doesn't make it go away, that means
> its not too timing sensitive.
>
>
>> Also the 10k-print gets triggered
>> several times (though I only see 10 lines of output). Maybe you can
>> send me your test-module and I'll try that, so we have more equal
>> conditions.
>
> Sure, see below.
>

Your module also triggers it. On first test directly on first try, on
second test only on the 3rd try. When it hangs sysrq-l doesn't give
any output. I double checked without a hang, and then it dumps
something.


>> What also might help: the maschine I'm testing with is a quad-core
>> X3450 with 8GB RAM.
>
> /me & wikipedia, that's a nehalem box, ok I'm testing on a westmere
> (don't have a nehalem).


>
> ---
> kernel/Makefile | 1 +
> kernel/test.c | 23 +++++++++++++++++++++++
> 2 files changed, 24 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 2d64cfc..65eff6c 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -80,6 +80,7 @@ obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
> obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
> obj-$(CONFIG_SECCOMP) += seccomp.o
> obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
> +obj-m += test.o
> obj-$(CONFIG_TREE_RCU) += rcutree.o
> obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
> obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
> diff --git a/kernel/test.c b/kernel/test.c
> index e69de29..8005395 100644
> --- a/kernel/test.c
> +++ b/kernel/test.c
> @@ -0,0 +1,23 @@
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +
> +MODULE_LICENSE("GPL");
> +
> +static void
> +test_cleanup(void)
> +{
> +}
> +
> +static int __init
> +test_init(void)
> +{
> + int i;
> +
> + for (i = 0; i < 10000; i++)
> + printk("test %d\n", i);
> +
> + return 0;
> +}
> +
> +module_init(test_init);
> +module_exit(test_cleanup);
>

2011-06-03 12:44:26

by Linus Torvalds

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Fri, Jun 3, 2011 at 7:02 PM, Arne Jansen <[email protected]> wrote:
>
> No, it doesn't trigger, but the hang is not as complete as I first
> thought. A running iostat via ssh continues to give output for a while,
> the serial console still reacts to return and prompts for login. But
> after a while more and more locks up. The console locks as soon as I
> sysrq-t.

Is it the tty rescheduling bug?

That would explain the printk's mattering.

Remove the schedule_work() call from flush_to_ldisc() in
drivers/tty/tty_buffer.c and see if the problem goes away. See the
other discussion thread on lkml ("tty breakage in X (Was: tty vs
workqueue oddities)")

Hmm?

Linus

2011-06-03 13:06:03

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 03.06.2011 14:44, Linus Torvalds wrote:
> On Fri, Jun 3, 2011 at 7:02 PM, Arne Jansen <[email protected]> wrote:
>>
>> No, it doesn't trigger, but the hang is not as complete as I first
>> thought. A running iostat via ssh continues to give output for a while,
>> the serial console still reacts to return and prompts for login. But
>> after a while more and more locks up. The console locks as soon as I
>> sysrq-t.
>
> Is it the tty rescheduling bug?
>
> That would explain the printk's mattering.
>
> Remove the schedule_work() call from flush_to_ldisc() in
> drivers/tty/tty_buffer.c and see if the problem goes away. See the
> other discussion thread on lkml ("tty breakage in X (Was: tty vs
> workqueue oddities)")
>
> Hmm?

No change. Also git bisect quite clearly points to
0122ec5b02f766c and ab2515c4b98f7bc4, both are older than
b1c43f82c5aa2654 mentioned in the other thread.

-Arne


> Linus
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-04 21:30:28

by Linus Torvalds

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Fri, Jun 3, 2011 at 10:05 PM, Arne Jansen <[email protected]> wrote:
>
> No change. Also git bisect quite clearly points to
> 0122ec5b02f766c and ab2515c4b98f7bc4, both are older than
> b1c43f82c5aa2654 mentioned in the other thread.

Ok, I haven't heard anything further on this. Ingo? Peter?

We're getting to the point where we just need to revert the thing,
since I'm not getting the feeling that there are any fixes
forthcoming, and I'd like -rc2 to not have this kind of bisected bug.

Ingo? Those two commits no longer revert cleanly, presumably due to
other changes in the area (but I didn't check). Can you do a patch to
do the reverts, and then you can try to re-do the thing later once you
figure out what's wrong.

Linus

2011-06-04 22:05:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Sun, 2011-06-05 at 06:29 +0900, Linus Torvalds wrote:
> On Fri, Jun 3, 2011 at 10:05 PM, Arne Jansen <[email protected]> wrote:
> >
> > No change. Also git bisect quite clearly points to
> > 0122ec5b02f766c and ab2515c4b98f7bc4, both are older than
> > b1c43f82c5aa2654 mentioned in the other thread.
>
> Ok, I haven't heard anything further on this. Ingo? Peter?

I'm a bit stumped, and not being able to reproduce at all :/

> We're getting to the point where we just need to revert the thing,
> since I'm not getting the feeling that there are any fixes
> forthcoming, and I'd like -rc2 to not have this kind of bisected bug.

Agreed.

> Ingo? Those two commits no longer revert cleanly, presumably due to
> other changes in the area (but I didn't check). Can you do a patch to
> do the reverts, and then you can try to re-do the thing later once you
> figure out what's wrong.

Yeah, that wants a whole lot of reverting, from the offending commit up
to and including 317f394160e9beb97d19a84c39b7e5eb3d7815a8.

2011-06-04 22:51:07

by Linus Torvalds

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Sun, Jun 5, 2011 at 7:08 AM, Peter Zijlstra <[email protected]> wrote:
>
> Yeah, that wants a whole lot of reverting, from the offending commit up
> to and including 317f394160e9beb97d19a84c39b7e5eb3d7815a8.

Mind sending one single tested patch? I still get conflicts, even just
trying to revert the last of those (ie 317f394160e9) due to all the
other scheduler changes..

Linus

2011-06-05 06:02:13

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 05.06.2011 00:08, Peter Zijlstra wrote:
> On Sun, 2011-06-05 at 06:29 +0900, Linus Torvalds wrote:
>> On Fri, Jun 3, 2011 at 10:05 PM, Arne Jansen<[email protected]> wrote:
>>>
>>> No change. Also git bisect quite clearly points to
>>> 0122ec5b02f766c and ab2515c4b98f7bc4, both are older than
>>> b1c43f82c5aa2654 mentioned in the other thread.
>>
>> Ok, I haven't heard anything further on this. Ingo? Peter?
>
> I'm a bit stumped, and not being able to reproduce at all :/

I'm willing to take any number of round trips to get to the true
nature of the bug. From the description I have a feeling that the
offending patch might just shift the timing slightly, so even if
the problem is gone for me, it might just be buried deeper.
I can also try to reproduce it on a second machine and give you
access to it, though this might take a few days.

>
>> We're getting to the point where we just need to revert the thing,
>> since I'm not getting the feeling that there are any fixes
>> forthcoming, and I'd like -rc2 to not have this kind of bisected bug.
>
> Agreed.
>
>> Ingo? Those two commits no longer revert cleanly, presumably due to
>> other changes in the area (but I didn't check). Can you do a patch to
>> do the reverts, and then you can try to re-do the thing later once you
>> figure out what's wrong.
>
> Yeah, that wants a whole lot of reverting, from the offending commit up
> to and including 317f394160e9beb97d19a84c39b7e5eb3d7815a8.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 07:57:50

by Mike Galbraith

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On Sun, 2011-06-05 at 08:01 +0200, Arne Jansen wrote:
> On 05.06.2011 00:08, Peter Zijlstra wrote:
> > On Sun, 2011-06-05 at 06:29 +0900, Linus Torvalds wrote:
> >> On Fri, Jun 3, 2011 at 10:05 PM, Arne Jansen<[email protected]> wrote:
> >>>
> >>> No change. Also git bisect quite clearly points to
> >>> 0122ec5b02f766c and ab2515c4b98f7bc4, both are older than
> >>> b1c43f82c5aa2654 mentioned in the other thread.
> >>
> >> Ok, I haven't heard anything further on this. Ingo? Peter?
> >
> > I'm a bit stumped, and not being able to reproduce at all :/
>
> I'm willing to take any number of round trips to get to the true
> nature of the bug. From the description I have a feeling that the
> offending patch might just shift the timing slightly, so even if
> the problem is gone for me, it might just be buried deeper.
> I can also try to reproduce it on a second machine and give you
> access to it, though this might take a few days.

My x3550 M3 with your config plus a couple drivers I need is exhibiting
hard lockups with no watchdog triggered. If I use both serial and tty
consoles, I get all kinds of gripes from soft lockup and rcu when
loading Peter's test module, but that's the tty problem.

With serial console and earlyprintk=serial alone, box may or may not
boot all the way without hard locking silently. Ditto on restart after
loading/unloading Peter's test module. I always see all 10k messages
though, IFF I have no tty console active.

-Mike

2011-06-05 08:18:08

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()


* Peter Zijlstra <[email protected]> wrote:

> On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
> > On 03.06.2011 11:15, Peter Zijlstra wrote:
>
> > > Anyway, Arne, how long did you wait before power cycling the box? The
> > > NMI watchdog should trigger in about a minute or so if it will trigger
> > > at all (its enabled in your config).
> >
> > No, it doesn't trigger,
>
> Bummer.

Is there no output even when the console is configured to do an
earlyprintk? That will allow the NMI watchdog to punch through even a
printk or scheduler lockup.

Arne, you can turn this on via one of these:

earlyprintk=vga,keep
earlyprintk=serial,ttyS0,115200,keep

(the ',keep' portion is important to have it active even after the
regular console has been switched on.)

Could you also please check with the (untested) patch below applied?
This will turn off *all* printk done by the NMI watchdog and switches
it to do pure early_printk() - which does not use any locking so it
should never lock up.

[ If you keep seeing 'NMI watchdog tick' messages periodically
occuring after the lockup then i'll send a more complete patch that
shuts off the regular printk path and makes sure that all output is
early_printk() based only. ]

earlyprintk=,keep with such a patch has let me down only on the
rarest of occasions.

( Arne, please also double check on a working bootup that the NMI
watchdog is actually ticking, by checking the NMI counts in
/proc/interrupts go up slowly but surely on all CPUs. )

Thanks,

Ingo

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..7c7e33f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -234,15 +234,12 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;

- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
-
__this_cpu_write(hard_watchdog_warn, true);
return;
}

+ early_printk("NMI watchog tick %ld\n", jiffies);
+
__this_cpu_write(hard_watchdog_warn, false);
return;
}

2011-06-05 08:53:59

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 05.06.2011 10:17, Ingo Molnar wrote:
>
> * Peter Zijlstra<[email protected]> wrote:
>
>> On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
>>> On 03.06.2011 11:15, Peter Zijlstra wrote:
>>
>>>> Anyway, Arne, how long did you wait before power cycling the box? The
>>>> NMI watchdog should trigger in about a minute or so if it will trigger
>>>> at all (its enabled in your config).
>>>
>>> No, it doesn't trigger,
>>
>> Bummer.
>
> Is there no output even when the console is configured to do an
> earlyprintk? That will allow the NMI watchdog to punch through even a
> printk or scheduler lockup.
>

Just to be clear, I have no boot problems whatsoever. And I have no
problems with the serial console. It's just the regular printk locking
up when e.g. I load the test module.

> Arne, you can turn this on via one of these:
>
> earlyprintk=vga,keep

I don't have access to vga as it is a remote machine.

> earlyprintk=serial,ttyS0,115200,keep

I'll try that.

>
> (the ',keep' portion is important to have it active even after the
> regular console has been switched on.)
>
> Could you also please check with the (untested) patch below applied?
> This will turn off *all* printk done by the NMI watchdog and switches
> it to do pure early_printk() - which does not use any locking so it
> should never lock up.
>
> [ If you keep seeing 'NMI watchdog tick' messages periodically
> occuring after the lockup then i'll send a more complete patch that
> shuts off the regular printk path and makes sure that all output is
> early_printk() based only. ]
>
> earlyprintk=,keep with such a patch has let me down only on the
> rarest of occasions.
>
> ( Arne, please also double check on a working bootup that the NMI
> watchdog is actually ticking, by checking the NMI counts in
> /proc/interrupts go up slowly but surely on all CPUs. )
>
> Thanks,
>
> Ingo
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 3d0c56a..7c7e33f 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -234,15 +234,12 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
> if (__this_cpu_read(hard_watchdog_warn) == true)
> return;
>
> - if (hardlockup_panic)
> - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
> - else
> - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
> -
> __this_cpu_write(hard_watchdog_warn, true);
> return;
> }
>
> + early_printk("NMI watchog tick %ld\n", jiffies);
> +
> __this_cpu_write(hard_watchdog_warn, false);
> return;
> }

2011-06-05 09:42:04

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()


* Arne Jansen <[email protected]> wrote:

> On 05.06.2011 10:17, Ingo Molnar wrote:
> >
> >* Peter Zijlstra<[email protected]> wrote:
> >
> >>On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
> >>>On 03.06.2011 11:15, Peter Zijlstra wrote:
> >>
> >>>>Anyway, Arne, how long did you wait before power cycling the box? The
> >>>>NMI watchdog should trigger in about a minute or so if it will trigger
> >>>>at all (its enabled in your config).
> >>>
> >>>No, it doesn't trigger,
> >>
> >>Bummer.
> >
> >Is there no output even when the console is configured to do an
> >earlyprintk? That will allow the NMI watchdog to punch through even a
> >printk or scheduler lockup.
> >
>
> Just to be clear, I have no boot problems whatsoever. And I have no
> problems with the serial console. It's just the regular printk locking
> up when e.g. I load the test module.

Yes.

> > Arne, you can turn this on via one of these:
> >
> > earlyprintk=vga,keep
>
> I don't have access to vga as it is a remote machine.
>
> > earlyprintk=serial,ttyS0,115200,keep
>
> I'll try that.

Please don't forget:

> > Could you also please check with the (untested) patch below applied?
> > This will turn off *all* printk done by the NMI watchdog and switches
> > it to do pure early_printk() - which does not use any locking so it

if you get a lockup somewhere within printk then the NMI watchdog
will lock up.

Also please first check that the NMI watchdog is ticking. (the patch
will ensure that, there will be periodic prints to the serial log)

Thanks,

Ingo

2011-06-05 09:43:16

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 05.06.2011 10:17, Ingo Molnar wrote:
>
> * Peter Zijlstra<[email protected]> wrote:
>
>> On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
>>> On 03.06.2011 11:15, Peter Zijlstra wrote:
>>
>>>> Anyway, Arne, how long did you wait before power cycling the box? The
>>>> NMI watchdog should trigger in about a minute or so if it will trigger
>>>> at all (its enabled in your config).
>>>
>>> No, it doesn't trigger,
>>
>> Bummer.
>
> Is there no output even when the console is configured to do an
> earlyprintk? That will allow the NMI watchdog to punch through even a
> printk or scheduler lockup.
>
> Arne, you can turn this on via one of these:
>
> earlyprintk=vga,keep
> earlyprintk=serial,ttyS0,115200,keep

My grub conf looks like this now:
kernel /boot/vmlinuz-2.6.39-rc3+ root=LABEL=label panic=15
console=ttyS0,9600 earlyprintk=serial,ttyS0,9600,keep quiet

>
> (the ',keep' portion is important to have it active even after the
> regular console has been switched on.)
>
> Could you also please check with the (untested) patch below applied?
> This will turn off *all* printk done by the NMI watchdog and switches
> it to do pure early_printk() - which does not use any locking so it
> should never lock up.
>
> [ If you keep seeing 'NMI watchdog tick' messages periodically
> occuring after the lockup then i'll send a more complete patch that
> shuts off the regular printk path and makes sure that all output is
> early_printk() based only. ]
>
> earlyprintk=,keep with such a patch has let me down only on the
> rarest of occasions.
>
> ( Arne, please also double check on a working bootup that the NMI
> watchdog is actually ticking, by checking the NMI counts in
> /proc/interrupts go up slowly but surely on all CPUs. )

It does, but _very_ slowly. Some CPUs do not count up for tens of
minutes if the machine is idle. If I generate some load like 'make
tags', the counters go up quite quickly.
After 4 minutes and one 'make cscope' it looks like this:
NMI: 8 13 43 5 2 3
22 1 Non-maskable interrupts

But I never see a single tick on console or in dmesg, even when I
replace the early_printk with a printk.

Btw, I get one warn on boot, but it look irrelevant to me:
[ 36.064321] ------------[ cut here ]------------
[ 36.064328] WARNING: at kernel/printk.c:293 do_syslog+0xbf/0x550()
[ 36.064330] Hardware name: X8SIL
[ 36.064331] Attempt to access syslog with CAP_SYS_ADMIN but no
CAP_SYSLOG (deprecated).
[ 36.064333] Modules linked in: mpt2sas scsi_transport_sas raid_class
[ 36.064338] Pid: 21625, comm: syslog-ng Not tainted 2.6.39-rc3+ #8
[ 36.064340] Call Trace:
[ 36.064344] [<ffffffff81091f7a>] warn_slowpath_common+0x7a/0xb0
[ 36.064347] [<ffffffff81092051>] warn_slowpath_fmt+0x41/0x50
[ 36.064351] [<ffffffff8109d8a5>] ? ns_capable+0x25/0x60
[ 36.064354] [<ffffffff8109365f>] do_syslog+0xbf/0x550
[ 36.064358] [<ffffffff810c9575>] ? lock_release_holdtime+0x35/0x170
[ 36.064362] [<ffffffff811e17a7>] kmsg_open+0x17/0x20
[ 36.064366] [<ffffffff811d5f46>] proc_reg_open+0xa6/0x180
[ 36.064368] [<ffffffff811e1790>] ? kmsg_release+0x20/0x20
[ 36.064371] [<ffffffff811e1770>] ? read_vmcore+0x1d0/0x1d0
[ 36.064374] [<ffffffff811d5ea0>] ? proc_fill_super+0xb0/0xb0
[ 36.064378] [<ffffffff811790bb>] __dentry_open+0x15b/0x330
[ 36.064382] [<ffffffff8185d6e6>] ? _raw_spin_unlock+0x26/0x30
[ 36.064385] [<ffffffff81179379>] nameidata_to_filp+0x69/0x80
[ 36.064388] [<ffffffff81187a3a>] do_last+0x1da/0x840
[ 36.064391] [<ffffffff81188fdb>] path_openat+0xcb/0x3f0
[ 36.064394] [<ffffffff810ba5c5>] ? sched_clock_cpu+0xc5/0x100
[ 36.064397] [<ffffffff8118944a>] do_filp_open+0x7a/0xa0
[ 36.064400] [<ffffffff8185d6e6>] ? _raw_spin_unlock+0x26/0x30
[ 36.064402] [<ffffffff81196c12>] ? alloc_fd+0xf2/0x140
[ 36.064405] [<ffffffff8117a3d2>] do_sys_open+0x102/0x1e0
[ 36.064408] [<ffffffff8117a4db>] sys_open+0x1b/0x20
[ 36.064412] [<ffffffff81864dbb>] system_call_fastpath+0x16/0x1b
[ 36.064414] ---[ end trace df959c735174f5f7 ]---


-Arne

>
> Thanks,
>
> Ingo
>

2011-06-05 09:46:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()


* Ingo Molnar <[email protected]> wrote:

>
> * Arne Jansen <[email protected]> wrote:
>
> > On 05.06.2011 10:17, Ingo Molnar wrote:
> > >
> > >* Peter Zijlstra<[email protected]> wrote:
> > >
> > >>On Fri, 2011-06-03 at 12:02 +0200, Arne Jansen wrote:
> > >>>On 03.06.2011 11:15, Peter Zijlstra wrote:
> > >>
> > >>>>Anyway, Arne, how long did you wait before power cycling the box? The
> > >>>>NMI watchdog should trigger in about a minute or so if it will trigger
> > >>>>at all (its enabled in your config).
> > >>>
> > >>>No, it doesn't trigger,
> > >>
> > >>Bummer.
> > >
> > >Is there no output even when the console is configured to do an
> > >earlyprintk? That will allow the NMI watchdog to punch through even a
> > >printk or scheduler lockup.
> > >
> >
> > Just to be clear, I have no boot problems whatsoever. And I have no
> > problems with the serial console. It's just the regular printk locking
> > up when e.g. I load the test module.
>
> Yes.
>
> > > Arne, you can turn this on via one of these:
> > >
> > > earlyprintk=vga,keep
> >
> > I don't have access to vga as it is a remote machine.
> >
> > > earlyprintk=serial,ttyS0,115200,keep
> >
> > I'll try that.
>
> Please don't forget:
>
> > > Could you also please check with the (untested) patch below applied?
> > > This will turn off *all* printk done by the NMI watchdog and switches
> > > it to do pure early_printk() - which does not use any locking so it
>
> if you get a lockup somewhere within printk then the NMI watchdog
> will lock up.

Please use the updated patch below - the first one wasnt informative
enough and it would stop 'ticking' after a hard lockup - not good :-)

With the patch below applied you should get periodic printouts from
the NMI watchdog both before and after the hard lockup.

If the NMI watchdog does not stop ticking after the lockup i'll send
a more complete patch that allows the printout of a backtrace on
every CPU, after the lockup.

Thanks,

Ingo
--
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..d335bc7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -216,6 +216,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;

+ early_printk("CPU #%d NMI watchdog tick %ld\n", smp_processor_id(), jiffies);
+
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
@@ -234,11 +236,6 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;

- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
-
__this_cpu_write(hard_watchdog_warn, true);
return;
}

2011-06-05 09:56:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()


* Arne Jansen <[email protected]> wrote:

> >( Arne, please also double check on a working bootup that the NMI
> > watchdog is actually ticking, by checking the NMI counts in
> > /proc/interrupts go up slowly but surely on all CPUs. )
>
> It does, but _very_ slowly. Some CPUs do not count up for tens of
> minutes if the machine is idle. If I generate some load like 'make
> tags', the counters go up quite quickly.
> After 4 minutes and one 'make cscope' it looks like this:
> NMI: 8 13 43 5 2
> 3 22 1 Non-maskable interrupts
>
> But I never see a single tick on console or in dmesg, even when I
> replace the early_printk with a printk.

hm, that might be because the NMI watchdog uses halted cycles to
tick.

That's not a problem (the kernel cannot lock up while there are no
cycles ticking) but nevertheless could you work this around please
by starting 8 infinite shell loops:

for ((i=0; i<8; i++)); do while : ; do : ; done & done

?

This will saturate all cores and makes sure the NMI watchdog is
ticking everywhere.

Hopefully this wont make the bug go away :-)

This will remove one factor of uncertainty (of where the NMI watchdog
is working or not), so it simplifies debugging.

> [ 36.064321] ------------[ cut here ]------------
> [ 36.064328] WARNING: at kernel/printk.c:293 do_syslog+0xbf/0x550()
> [ 36.064330] Hardware name: X8SIL
> [ 36.064331] Attempt to access syslog with CAP_SYS_ADMIN but no
> CAP_SYSLOG (deprecated).

Yeah, unrelated, and rather annoying looking that warning. The
warning is borderline correct (it's messy to drop CAP_SYSLOG but keep
CAP_SYS_ADMIN) but still, if we warned every time userspace relies on
something the kernel provided in the past, in a somewhat messy way,
we'd never complete bootup i guess ;-)

Thanks,

Ingo

2011-06-05 10:22:30

by Arne Jansen

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()

On 05.06.2011 11:55, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>>> ( Arne, please also double check on a working bootup that the NMI
>>> watchdog is actually ticking, by checking the NMI counts in
>>> /proc/interrupts go up slowly but surely on all CPUs. )
>>
>> It does, but _very_ slowly. Some CPUs do not count up for tens of
>> minutes if the machine is idle. If I generate some load like 'make
>> tags', the counters go up quite quickly.
>> After 4 minutes and one 'make cscope' it looks like this:
>> NMI: 8 13 43 5 2
>> 3 22 1 Non-maskable interrupts
>>
>> But I never see a single tick on console or in dmesg, even when I
>> replace the early_printk with a printk.
>
> hm, that might be because the NMI watchdog uses halted cycles to
> tick.
>
> That's not a problem (the kernel cannot lock up while there are no
> cycles ticking) but nevertheless could you work this around please
> by starting 8 infinite shell loops:
>
> for ((i=0; i<8; i++)); do while : ; do : ; done& done
>
> ?
>
> This will saturate all cores and makes sure the NMI watchdog is
> ticking everywhere.
>
> Hopefully this wont make the bug go away :-)
>

OK, now we get going. I get the ticks, the bug is still there, and
all CPUs still tick after the lockup. I also added an early_printk
inside the lockup-if, and it reports hard lockups. At first for only
one or 2 CPUs, and after some time all CPUs are locked up.

-Arne

2011-06-05 11:01:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:sched/locking] sched: Add p->pi_lock to task_rq_lock()


* Arne Jansen <[email protected]> wrote:

> On 05.06.2011 11:55, Ingo Molnar wrote:
> >
> >* Arne Jansen<[email protected]> wrote:
> >
> >>>( Arne, please also double check on a working bootup that the NMI
> >>> watchdog is actually ticking, by checking the NMI counts in
> >>> /proc/interrupts go up slowly but surely on all CPUs. )
> >>
> >>It does, but _very_ slowly. Some CPUs do not count up for tens of
> >>minutes if the machine is idle. If I generate some load like 'make
> >>tags', the counters go up quite quickly.
> >>After 4 minutes and one 'make cscope' it looks like this:
> >>NMI: 8 13 43 5 2
> >>3 22 1 Non-maskable interrupts
> >>
> >>But I never see a single tick on console or in dmesg, even when I
> >>replace the early_printk with a printk.
> >
> >hm, that might be because the NMI watchdog uses halted cycles to
> >tick.
> >
> >That's not a problem (the kernel cannot lock up while there are no
> >cycles ticking) but nevertheless could you work this around please
> >by starting 8 infinite shell loops:
> >
> > for ((i=0; i<8; i++)); do while : ; do : ; done& done
> >
> >?
> >
> >This will saturate all cores and makes sure the NMI watchdog is
> >ticking everywhere.
> >
> >Hopefully this wont make the bug go away :-)
> >
>
> OK, now we get going. I get the ticks, the bug is still there, and
> all CPUs still tick after the lockup. I also added an early_printk
> inside the lockup-if, and it reports hard lockups. At first for only
> one or 2 CPUs, and after some time all CPUs are locked up.

Very good!

If you add a dump_stack() do you get a stacktrace, or do the NMI
watchdog ticks stop?

If the ticks stop this suggests a lockup within the printk code. If
you get a stack dump then we'll have good debug data.

Thanks,

Ingo

2011-06-05 11:19:55

by Ingo Molnar

[permalink] [raw]
Subject: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Ingo Molnar <[email protected]> wrote:

> If the ticks stop this suggests a lockup within the printk code.
> [...]

In which case the printk-killswitch patch below (to be applied
*instead* of the previous debugging patch i sent) should provide the
desired NMI watchdog output on the serial console.

Warning: it's entirely untested.

Thanks,

Ingo

arch/x86/kernel/early_printk.c | 2 +-
include/linux/printk.h | 4 ++++
kernel/printk.c | 18 ++++++++++++++++++
kernel/watchdog.c | 7 +++++++
4 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a35..d75fd66 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -171,7 +171,7 @@ static struct console early_serial_console = {

/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
-static int __initdata early_console_initialized;
+int early_console_initialized;

asmlinkage void early_printk(const char *fmt, ...)
{
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 0101d55..7393291 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -88,6 +88,8 @@ int no_printk(const char *fmt, ...)
return 0;
}

+extern int early_console_initialized;
+
extern asmlinkage __attribute__ ((format (printf, 1, 2)))
void early_printk(const char *fmt, ...);

@@ -114,6 +116,8 @@ extern int printk_delay_msec;
extern int dmesg_restrict;
extern int kptr_restrict;

+extern void printk_kill(void);
+
void log_buf_kexec_setup(void);
void __init setup_log_buf(int early);
#else
diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..f6193e1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -519,6 +519,19 @@ static void __call_console_drivers(unsigned start, unsigned end)
}
}

+/*
+ * This is independent of any log levels - a global
+ * kill switch that turns off all of printk.
+ *
+ * Used by the NMI watchdog if early-printk is enabled.
+ */
+static int __read_mostly printk_killswitch;
+
+void printk_kill(void)
+{
+ printk_killswitch = 1;
+}
+
static int __read_mostly ignore_loglevel;

static int __init ignore_loglevel_setup(char *str)
@@ -833,6 +846,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
size_t plen;
char special;

+ /* Return early if a debugging subsystem has killed printk output: */
+ if (unlikely(printk_killswitch))
+ return 1;
+
boot_delay_msec();
printk_delay();

@@ -1533,6 +1550,7 @@ void register_console(struct console *newcon)
for_each_console(bcon)
if (bcon->flags & CON_BOOT)
unregister_console(bcon);
+ early_console_initialized = 0;
} else {
printk(KERN_INFO "%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..6e9b109 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -234,6 +234,13 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;

+ /*
+ * If early-printk is enabled then make sure we do not
+ * lock up in printk() and kill console logging:
+ */
+ if (early_console_initialized)
+ printk_kill();
+
if (hardlockup_panic)
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
else

2011-06-05 11:36:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Ingo Molnar <[email protected]> wrote:

> > If the ticks stop this suggests a lockup within the printk code.
> > [...]
>
> In which case the printk-killswitch patch below (to be applied
> *instead* of the previous debugging patch i sent) should provide
> the desired NMI watchdog output on the serial console.
>
> Warning: it's entirely untested.

Note, since this is an SMP box, if the lockup messages show up with
this patch but are mixed up with each other then adding a spinlock
around the WARN() would probably help keeping the output serialized.

A simple:

static DEFINE_SPINLOCK(watchdog_output_lock);

...
spin_lock(&watchdog_output_lock);
...
[ the WARN_ON() logic. ]
...
spin_unlock(&watchdog_output_lock);
...

would suffice.

Thanks,

Ingo

2011-06-05 11:58:00

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 13:36, Ingo Molnar wrote:
>
> * Ingo Molnar<[email protected]> wrote:
>
>>> If the ticks stop this suggests a lockup within the printk code.
>>> [...]
>>
>> In which case the printk-killswitch patch below (to be applied
>> *instead* of the previous debugging patch i sent) should provide
>> the desired NMI watchdog output on the serial console.
>>
>> Warning: it's entirely untested.

How is the output supposed to come through? shouldn't printk revert
to early_printk instead of just returning?


>
> Note, since this is an SMP box, if the lockup messages show up with
> this patch but are mixed up with each other then adding a spinlock
> around the WARN() would probably help keeping the output serialized.
>
> A simple:
>
> static DEFINE_SPINLOCK(watchdog_output_lock);
>
> ...
> spin_lock(&watchdog_output_lock);
> ...
> [ the WARN_ON() logic. ]
> ...
> spin_unlock(&watchdog_output_lock);
> ...
>
> would suffice.
>
> Thanks,
>
> Ingo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 13:40:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> >> Warning: it's entirely untested.
>
> How is the output supposed to come through? shouldn't printk revert
> to early_printk instead of just returning?

oh, right you are.

Does the patch below work? It does early-printk within printk().

Thanks,

Ingo

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a35..211d8c2 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -170,8 +170,8 @@ static struct console early_serial_console = {
};

/* Direct interface for emergencies */
-static struct console *early_console = &early_vga_console;
-static int __initdata early_console_initialized;
+struct console *early_console = &early_vga_console;
+int early_console_initialized;

asmlinkage void early_printk(const char *fmt, ...)
{
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 0101d55..414dc34 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -88,6 +88,9 @@ int no_printk(const char *fmt, ...)
return 0;
}

+extern struct console *early_console;
+extern int early_console_initialized;
+
extern asmlinkage __attribute__ ((format (printf, 1, 2)))
void early_printk(const char *fmt, ...);

@@ -114,6 +117,8 @@ extern int printk_delay_msec;
extern int dmesg_restrict;
extern int kptr_restrict;

+extern void printk_kill(void);
+
void log_buf_kexec_setup(void);
void __init setup_log_buf(int early);
#else
diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..50684e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -519,6 +519,19 @@ static void __call_console_drivers(unsigned start, unsigned end)
}
}

+/*
+ * This is independent of any log levels - a global
+ * kill switch that turns off all of printk.
+ *
+ * Used by the NMI watchdog if early-printk is enabled.
+ */
+static int __read_mostly printk_killswitch;
+
+void printk_kill(void)
+{
+ printk_killswitch = 1;
+}
+
static int __read_mostly ignore_loglevel;

static int __init ignore_loglevel_setup(char *str)
@@ -833,6 +846,16 @@ asmlinkage int vprintk(const char *fmt, va_list args)
size_t plen;
char special;

+ /* Return early if a debugging subsystem has killed printk output: */
+ if (unlikely(printk_killswitch)) {
+ char buf[512];
+
+ printed_len = vscnprintf(buf, sizeof(buf), fmt, args);
+ early_console->write(early_console, buf, printed_len);
+
+ return printed_len;
+ }
+
boot_delay_msec();
printk_delay();

@@ -1533,6 +1556,7 @@ void register_console(struct console *newcon)
for_each_console(bcon)
if (bcon->flags & CON_BOOT)
unregister_console(bcon);
+ early_console_initialized = 0;
} else {
printk(KERN_INFO "%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..6e9b109 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -234,6 +234,13 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;

+ /*
+ * If early-printk is enabled then make sure we do not
+ * lock up in printk() and kill console logging:
+ */
+ if (early_console_initialized)
+ printk_kill();
+
if (hardlockup_panic)
panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
else

2011-06-05 13:54:40

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 15:39, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>>>> Warning: it's entirely untested.
>>
>> How is the output supposed to come through? shouldn't printk revert
>> to early_printk instead of just returning?
>
> oh, right you are.
>
> Does the patch below work? It does early-printk within printk().

Too late, I already built my on early_vprintk ;)

Here we go:

http://eischnee.de/lockup.txt

Now it's your turn :)

-Arne

>
> Thanks,
>
> Ingo
>
> diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
> index cd28a35..211d8c2 100644
> --- a/arch/x86/kernel/early_printk.c
> +++ b/arch/x86/kernel/early_printk.c
> @@ -170,8 +170,8 @@ static struct console early_serial_console = {
> };
>
> /* Direct interface for emergencies */
> -static struct console *early_console =&early_vga_console;
> -static int __initdata early_console_initialized;
> +struct console *early_console =&early_vga_console;
> +int early_console_initialized;
>
> asmlinkage void early_printk(const char *fmt, ...)
> {
> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index 0101d55..414dc34 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -88,6 +88,9 @@ int no_printk(const char *fmt, ...)
> return 0;
> }
>
> +extern struct console *early_console;
> +extern int early_console_initialized;
> +
> extern asmlinkage __attribute__ ((format (printf, 1, 2)))
> void early_printk(const char *fmt, ...);
>
> @@ -114,6 +117,8 @@ extern int printk_delay_msec;
> extern int dmesg_restrict;
> extern int kptr_restrict;
>
> +extern void printk_kill(void);
> +
> void log_buf_kexec_setup(void);
> void __init setup_log_buf(int early);
> #else
> diff --git a/kernel/printk.c b/kernel/printk.c
> index 3518539..50684e3 100644
> --- a/kernel/printk.c
> +++ b/kernel/printk.c
> @@ -519,6 +519,19 @@ static void __call_console_drivers(unsigned start, unsigned end)
> }
> }
>
> +/*
> + * This is independent of any log levels - a global
> + * kill switch that turns off all of printk.
> + *
> + * Used by the NMI watchdog if early-printk is enabled.
> + */
> +static int __read_mostly printk_killswitch;
> +
> +void printk_kill(void)
> +{
> + printk_killswitch = 1;
> +}
> +
> static int __read_mostly ignore_loglevel;
>
> static int __init ignore_loglevel_setup(char *str)
> @@ -833,6 +846,16 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> size_t plen;
> char special;
>
> + /* Return early if a debugging subsystem has killed printk output: */
> + if (unlikely(printk_killswitch)) {
> + char buf[512];
> +
> + printed_len = vscnprintf(buf, sizeof(buf), fmt, args);
> + early_console->write(early_console, buf, printed_len);
> +
> + return printed_len;
> + }
> +
> boot_delay_msec();
> printk_delay();
>
> @@ -1533,6 +1556,7 @@ void register_console(struct console *newcon)
> for_each_console(bcon)
> if (bcon->flags& CON_BOOT)
> unregister_console(bcon);
> + early_console_initialized = 0;
> } else {
> printk(KERN_INFO "%sconsole [%s%d] enabled\n",
> (newcon->flags& CON_BOOT) ? "boot" : "" ,
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 3d0c56a..6e9b109 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -234,6 +234,13 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
> if (__this_cpu_read(hard_watchdog_warn) == true)
> return;
>
> + /*
> + * If early-printk is enabled then make sure we do not
> + * lock up in printk() and kill console logging:
> + */
> + if (early_console_initialized)
> + printk_kill();
> +
> if (hardlockup_panic)
> panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
> else
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 14:06:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> On 05.06.2011 15:39, Ingo Molnar wrote:
> >
> >* Arne Jansen<[email protected]> wrote:
> >
> >>>>Warning: it's entirely untested.
> >>
> >>How is the output supposed to come through? shouldn't printk revert
> >>to early_printk instead of just returning?
> >
> >oh, right you are.
> >
> >Does the patch below work? It does early-printk within printk().
>
> Too late, I already built my on early_vprintk ;)

heh :-)

Mind posting the patch? Your tested patch is infinitely more valuable
than my not-even-build-tested patch ;-)

Thanks,

Ingo

2011-06-05 14:10:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> Here we go:
>
> http://eischnee.de/lockup.txt
>
> Now it's your turn :)

So the lockup is in:

[<ffffffff813af2d9>] do_raw_spin_lock+0x129/0x170
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
[<ffffffff81092df6>] ? vprintk+0x76/0x4a0
[<ffffffff81092df6>] vprintk+0x76/0x4a0
[<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff81859e19>] printk+0x63/0x65
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff81091f98>] warn_slowpath_common+0x38/0xb0
[<ffffffff81092025>] warn_slowpath_null+0x15/0x20
[<ffffffff8108a4bd>] try_to_wake_up+0x29d/0x350
[<ffffffff8108a5a0>] wake_up_process+0x10/0x20
[<ffffffff8185c071>] __up+0x41/0x50
[<ffffffff810b937c>] up+0x3c/0x50
[<ffffffff81092a36>] console_unlock+0x1a6/0x200
[<ffffffff81092f86>] vprintk+0x206/0x4a0
[<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
[<ffffffff810ba6db>] ? local_clock+0x4b/0x60
[<ffffffffa0012000>] ? 0xffffffffa0011fff
[<ffffffff81859e19>] printk+0x63/0x65
[<ffffffffa001201d>] test_init+0x1d/0x2b [test]
[<ffffffff810001ce>] do_one_initcall+0x3e/0x170

Somehow we end up generating a WARN_ON() within a printk()'s
try_to_wake_up() and predictably we lock up on that ...

Peter?

Arne, mind helping a bit with:

[<ffffffff81091f98>] warn_slowpath_common+0x38/0xb0
[<ffffffff81092025>] warn_slowpath_null+0x15/0x20
[<ffffffff8108a4bd>] try_to_wake_up+0x29d/0x350

which WARN_ON() does that correspond to in try_to_wake_up()?

Thanks,

Ingo

2011-06-05 14:31:36

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 16:10, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>> Here we go:
>>
>> http://eischnee.de/lockup.txt
>>
>> Now it's your turn :)
>
> So the lockup is in:
>
> [<ffffffff813af2d9>] do_raw_spin_lock+0x129/0x170
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
> [<ffffffff81092df6>] ? vprintk+0x76/0x4a0
> [<ffffffff81092df6>] vprintk+0x76/0x4a0
> [<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff81859e19>] printk+0x63/0x65
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff81091f98>] warn_slowpath_common+0x38/0xb0
> [<ffffffff81092025>] warn_slowpath_null+0x15/0x20
> [<ffffffff8108a4bd>] try_to_wake_up+0x29d/0x350
> [<ffffffff8108a5a0>] wake_up_process+0x10/0x20
> [<ffffffff8185c071>] __up+0x41/0x50
> [<ffffffff810b937c>] up+0x3c/0x50
> [<ffffffff81092a36>] console_unlock+0x1a6/0x200
> [<ffffffff81092f86>] vprintk+0x206/0x4a0
> [<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
> [<ffffffff810ba6db>] ? local_clock+0x4b/0x60
> [<ffffffffa0012000>] ? 0xffffffffa0011fff
> [<ffffffff81859e19>] printk+0x63/0x65
> [<ffffffffa001201d>] test_init+0x1d/0x2b [test]
> [<ffffffff810001ce>] do_one_initcall+0x3e/0x170
>
> Somehow we end up generating a WARN_ON() within a printk()'s
> try_to_wake_up() and predictably we lock up on that ...
>
> Peter?
>
> Arne, mind helping a bit with:
>
> [<ffffffff81091f98>] warn_slowpath_common+0x38/0xb0
> [<ffffffff81092025>] warn_slowpath_null+0x15/0x20
> [<ffffffff8108a4bd>] try_to_wake_up+0x29d/0x350
>
> which WARN_ON() does that correspond to in try_to_wake_up()?

(gdb) info line *0xffffffff8108a4bd
Line 934 of "kernel/sched.c" starts at address 0xffffffff8108a498
<try_to_wake_up+632> and ends at 0xffffffff8108a4c8 <try_to_wake_up+680>.

sched.c:934: in function __task_rq_lock
lockdep_assert_held(&p->pi_lock);

I'm currently testing on commit 0122ec5b02f766c355b3168d.

-Arne

>
> Thanks,
>
> Ingo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 14:45:21

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 16:06, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>> On 05.06.2011 15:39, Ingo Molnar wrote:
>>>
>>> * Arne Jansen<[email protected]> wrote:
>>>
>>>>>> Warning: it's entirely untested.
>>>>
>>>> How is the output supposed to come through? shouldn't printk revert
>>>> to early_printk instead of just returning?
>>>
>>> oh, right you are.
>>>
>>> Does the patch below work? It does early-printk within printk().
>>
>> Too late, I already built my on early_vprintk ;)
>
> heh :-)
>
> Mind posting the patch? Your tested patch is infinitely more valuable
> than my not-even-build-tested patch ;-)

It's basically just your patch. I attached it nevertheless...

-Arne


Attachments:
printk_kill.patch (3.85 kB)

2011-06-05 15:13:44

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> sched.c:934: in function __task_rq_lock
> lockdep_assert_held(&p->pi_lock);

Oh. Could you remove that line with the patch below - does it result
in a working system?

Now, this patch alone just removes a debugging check - but i'm not
sure the debugging check is correct - we take the pi_lock in a raw
way - which means it's not lockdep covered.

So how can lockdep_assert_held() be called on it?

Thanks,

Ingo

diff --git a/kernel/sched.c b/kernel/sched.c
index fd18f39..a32316b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -938,8 +938,6 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
{
struct rq *rq;

- lockdep_assert_held(&p->pi_lock);
-
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);

2011-06-05 15:26:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Ingo Molnar <[email protected]> wrote:

>
> * Arne Jansen <[email protected]> wrote:
>
> > sched.c:934: in function __task_rq_lock
> > lockdep_assert_held(&p->pi_lock);
>
> Oh. Could you remove that line with the patch below - does it result
> in a working system?
>
> Now, this patch alone just removes a debugging check - but i'm not
> sure the debugging check is correct - we take the pi_lock in a raw
> way - which means it's not lockdep covered.
>
> So how can lockdep_assert_held() be called on it?

Ok, i'm wrong there - it's lockdep covered.

I also reviewed all the __task_rq_lock() call sites and each of them
has the pi_lock acquired. So unless both Peter and me are blind, the
other option would be some sort of memory corruption corrupting the
runqueue.

But ... that looks so unlikely here, it's clearly heavy printk() and
console_sem twiddling that triggers the bug, not any other scheduler
activity.

Thanks,

Ingo

2011-06-05 15:32:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


one more thing, could you please add this call:

debug_show_all_locks();

to after the WARN(), in watchdog.c?

Please surround the whole printout portion by the
spin_lock()/unlock() protection code i suggested, full-lock-state
printouts are slow and other CPUs might start printing their NMI
ticks ...

With the all-locks-printed output we can double check what locks are
held.

Thanks,

Ingo

2011-06-05 15:34:57

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 17:13, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>> sched.c:934: in function __task_rq_lock
>> lockdep_assert_held(&p->pi_lock);
>
> Oh. Could you remove that line with the patch below - does it result
> in a working system?

yes.

>
> Now, this patch alone just removes a debugging check - but i'm not
> sure the debugging check is correct - we take the pi_lock in a raw
> way - which means it's not lockdep covered.
>
> So how can lockdep_assert_held() be called on it?
>
> Thanks,
>
> Ingo
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index fd18f39..a32316b 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -938,8 +938,6 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
> {
> struct rq *rq;
>
> - lockdep_assert_held(&p->pi_lock);
> -
> for (;;) {
> rq = task_rq(p);
> raw_spin_lock(&rq->lock);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 16:07:52

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 17:32, Ingo Molnar wrote:
>
> one more thing, could you please add this call:
>
> debug_show_all_locks();
>
> to after the WARN(), in watchdog.c?
>
> Please surround the whole printout portion by the
> spin_lock()/unlock() protection code i suggested, full-lock-state
> printouts are slow and other CPUs might start printing their NMI
> ticks ...
>
> With the all-locks-printed output we can double check what locks are
> held.

Hm, on first try:

INFO: lockdep is turned off.

Recompiling...

>
> Thanks,
>
> Ingo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 16:35:53

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 18:07, Arne Jansen wrote:
> On 05.06.2011 17:32, Ingo Molnar wrote:
>>
>> one more thing, could you please add this call:
>>
>> debug_show_all_locks();
>>
>> to after the WARN(), in watchdog.c?
>>
>> Please surround the whole printout portion by the
>> spin_lock()/unlock() protection code i suggested, full-lock-state
>> printouts are slow and other CPUs might start printing their NMI
>> ticks ...
>>
>> With the all-locks-printed output we can double check what locks are
>> held.
>
> Hm, on first try:
>
> INFO: lockdep is turned off.
>
> Recompiling...
>

same after a full recompile.

# grep LOCKDEP .config

CONFIG_LOCKDEP_SUPPORT=y
CONFIG_LOCKDEP=y
# CONFIG_DEBUG_LOCKDEP is not set


-Arne

>>
>> Thanks,
>>
>> Ingo
>> --
>> To unsubscribe from this list: send the line "unsubscribe
>> linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 16:50:43

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 18:35, Arne Jansen wrote:
> On 05.06.2011 18:07, Arne Jansen wrote:
>> On 05.06.2011 17:32, Ingo Molnar wrote:
>>>
>>> one more thing, could you please add this call:
>>>
>>> debug_show_all_locks();
>>>
>>> to after the WARN(), in watchdog.c?
>>>
>>> Please surround the whole printout portion by the
>>> spin_lock()/unlock() protection code i suggested, full-lock-state
>>> printouts are slow and other CPUs might start printing their NMI
>>> ticks ...
>>>
>>> With the all-locks-printed output we can double check what locks are
>>> held.
>>

btw, the output posted earlier also contains some
BUG: spinlock lockup.


>> Hm, on first try:
>>
>> INFO: lockdep is turned off.
>>
>> Recompiling...
>>
>
> same after a full recompile.
>
> # grep LOCKDEP .config
>
> CONFIG_LOCKDEP_SUPPORT=y
> CONFIG_LOCKDEP=y
> # CONFIG_DEBUG_LOCKDEP is not set
>
>
> -Arne
>
>>>
>>> Thanks,
>>>
>>> Ingo
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe
>>> linux-kernel" in
>>> the body of a message to [email protected]
>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at http://www.tux.org/lkml/
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe
>> linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 17:21:16

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> >>>With the all-locks-printed output we can double check what locks are
> >>>held.
>
> btw, the output posted earlier also contains some BUG: spinlock
> lockup.

hm, it's hard to interpret that without the spin_lock()/unlock()
logic keeping the dumps apart.

Was lockdep enabled as you started the test?

but ... if the lock is reasonably sorted then it's this one:

<0>BUG: spinlock lockup on CPU#3, modprobe/22211, ffffffff81e1c0c0
Pid: 22211, comm: modprobe Tainted: G W 2.6.39-rc3+ #19
Call Trace:
[<ffffffff813af306>] do_raw_spin_lock+0x156/0x170
[<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
[<ffffffff81092df6>] ? vprintk+0x76/0x4a0
[<ffffffff81092df6>] vprintk+0x76/0x4a0
[<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
[<ffffffff81859e19>] printk+0x63/0x65
[<ffffffff813af301>] do_raw_spin_lock+0x151/0x170
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
[<ffffffff81092df6>] ? vprintk+0x76/0x4a0
[<ffffffff81092df6>] vprintk+0x76/0x4a0
[<ffffffff8108758b>] ? cpuacct_charge+0x9b/0xb0
[<ffffffff8108750f>] ? cpuacct_charge+0x1f/0xb0
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff81859e19>] printk+0x63/0x65
[<ffffffff813af090>] spin_bug+0x70/0xf0
[<ffffffff813af2d9>] do_raw_spin_lock+0x129/0x170
[<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
[<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
[<ffffffff81092df6>] ? vprintk+0x76/0x4a0

and it occured before the lockup in the scheduler.

Which could be due to a race between disabling lockdep on one CPU and
the scheduler doing the lock-held check on another CPU.

Do you get any messages after the assert is removed, during the test?

Thanks,

Ingo

2011-06-05 17:42:27

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 19:20, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>>>>> With the all-locks-printed output we can double check what locks are
>>>>> held.
>>
>> btw, the output posted earlier also contains some BUG: spinlock
>> lockup.
>
> hm, it's hard to interpret that without the spin_lock()/unlock()
> logic keeping the dumps apart.

The locking was in place from the beginning. As the output is still
scrambled, there are other sources for BUG/WARN outside the watchdog
that trigger in parallel. Maybe we should protect the whole BUG/WARN
mechanism with a lock and send it to early_printk from the beginning,
so we don't have to wait for the watchdog to kill printk off and the
first BUG can come through.
Or just let WARN/BUG kill off printk instead of the watchdog (though
I have to get rid of that syslog-WARN on startup).

>
> Was lockdep enabled as you started the test?

At least it was in the config, but haven't double checked. ATM, it is.

>
> but ... if the lock is reasonably sorted then it's this one:
>
> <0>BUG: spinlock lockup on CPU#3, modprobe/22211, ffffffff81e1c0c0
> Pid: 22211, comm: modprobe Tainted: G W 2.6.39-rc3+ #19
> Call Trace:
> [<ffffffff813af306>] do_raw_spin_lock+0x156/0x170
> [<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
> [<ffffffff81092df6>] ? vprintk+0x76/0x4a0
> [<ffffffff81092df6>] vprintk+0x76/0x4a0
> [<ffffffff810c5f8d>] ? trace_hardirqs_off+0xd/0x10
> [<ffffffff81859e19>] printk+0x63/0x65
> [<ffffffff813af301>] do_raw_spin_lock+0x151/0x170
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
> [<ffffffff81092df6>] ? vprintk+0x76/0x4a0
> [<ffffffff81092df6>] vprintk+0x76/0x4a0
> [<ffffffff8108758b>] ? cpuacct_charge+0x9b/0xb0
> [<ffffffff8108750f>] ? cpuacct_charge+0x1f/0xb0
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff81859e19>] printk+0x63/0x65
> [<ffffffff813af090>] spin_bug+0x70/0xf0
> [<ffffffff813af2d9>] do_raw_spin_lock+0x129/0x170
> [<ffffffff8108a4bd>] ? try_to_wake_up+0x29d/0x350
> [<ffffffff8185ce71>] _raw_spin_lock+0x51/0x70
> [<ffffffff81092df6>] ? vprintk+0x76/0x4a0
>
> and it occured before the lockup in the scheduler.
>
> Which could be due to a race between disabling lockdep on one CPU and
> the scheduler doing the lock-held check on another CPU.
>
> Do you get any messages after the assert is removed, during the test?

No.

2011-06-05 19:00:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> > hm, it's hard to interpret that without the spin_lock()/unlock()
> > logic keeping the dumps apart.
>
> The locking was in place from the beginning. [...]

Ok, i was surprised it looked relatively ordered :-)

> [...] As the output is still scrambled, there are other sources for
> BUG/WARN outside the watchdog that trigger in parallel. Maybe we
> should protect the whole BUG/WARN mechanism with a lock and send it
> to early_printk from the beginning, so we don't have to wait for
> the watchdog to kill printk off and the first BUG can come through.
> Or just let WARN/BUG kill off printk instead of the watchdog
> (though I have to get rid of that syslog-WARN on startup).

I had yet another look at your lockup.txt and i think the main cause
is the WARN_ON() caused by the not-held pi_lock. The lockup there
causes other CPUs to wedge in printk, which triggers spinlock-lockup
messages there.

So i think the primary trigger is the pi_lock WARN_ON() (as your
bisection has confirmed that too), everything else comes from this.

Unfortunately i don't think we can really 'fix' the problem by
removing the assert. By all means the assert is correct: pi_lock
should be held there. If we are not holding it then we likely won't
crash in an easily visible way - it's a lot easier to trigger asserts
than to trigger obscure side-effects of locking bugs.

It is also a mystery why only printk() triggers this bug. The wakeup
done there is not particularly special, so by all means we should
have seen similar lockups elsewhere as well - not just with
printk()s. Yet we are not seeing them.

So some essential piece of the puzzle is still missing.

Thanks,

Ingo

2011-06-05 19:30:44

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 20:59, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>>> hm, it's hard to interpret that without the spin_lock()/unlock()
>>> logic keeping the dumps apart.
>>
>> The locking was in place from the beginning. [...]
>
> Ok, i was surprised it looked relatively ordered :-)
>
>> [...] As the output is still scrambled, there are other sources for
>> BUG/WARN outside the watchdog that trigger in parallel. Maybe we
>> should protect the whole BUG/WARN mechanism with a lock and send it
>> to early_printk from the beginning, so we don't have to wait for
>> the watchdog to kill printk off and the first BUG can come through.
>> Or just let WARN/BUG kill off printk instead of the watchdog
>> (though I have to get rid of that syslog-WARN on startup).
>
> I had yet another look at your lockup.txt and i think the main cause
> is the WARN_ON() caused by the not-held pi_lock. The lockup there
> causes other CPUs to wedge in printk, which triggers spinlock-lockup
> messages there.
>
> So i think the primary trigger is the pi_lock WARN_ON() (as your
> bisection has confirmed that too), everything else comes from this.
>
> Unfortunately i don't think we can really 'fix' the problem by
> removing the assert. By all means the assert is correct: pi_lock
> should be held there. If we are not holding it then we likely won't
> crash in an easily visible way - it's a lot easier to trigger asserts
> than to trigger obscure side-effects of locking bugs.
>
> It is also a mystery why only printk() triggers this bug. The wakeup
> done there is not particularly special, so by all means we should
> have seen similar lockups elsewhere as well - not just with
> printk()s. Yet we are not seeing them.

From the timing I see I'd guess it has something to do with the
scheduler kicking in during printk. I'm neither familiar with the
printk code nor with the scheduler.
If you have any ideas what I should test or add please let me know.

-Arne

>
> So some essential piece of the puzzle is still missing.
>
> Thanks,
>
> Ingo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-05 19:44:37

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> From the timing I see I'd guess it has something to do with the
> scheduler kicking in during printk. I'm neither familiar with the
> printk code nor with the scheduler.

Yeah, that's the well-known wake-up of klogd:

void console_unlock(void)
{
...
up(&console_sem);

actually ... that's not the klogd wake-up at all (!). I so suck today
at bug analysis :-)

It's the console lock()/unlock() sequence, and guess what does it:

drivers/tty/tty_io.c: console_lock();
drivers/tty/vt/selection.c: console_lock();

and the vt.c code in a dozen places.

So maybe it's some sort of tty related memory corruption that was
made *visible* via the extra assert that the scheduler is doing? The
pi_list is embedded in task struct.

This would explain why only printk() triggers it and other wakeup
patterns not.

Now, i don't really like this theory either. Why is there no other
type of corruption? And exactly why did only the task_struct::pi_lock
field get corrupted while nearby fields not? Also, none of the fields
near pi_lock are even remotely tty related.

Thanks,

Ingo

2011-06-05 20:15:37

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 21:44, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>> From the timing I see I'd guess it has something to do with the
>> scheduler kicking in during printk. I'm neither familiar with the
>> printk code nor with the scheduler.
>
> Yeah, that's the well-known wake-up of klogd:
>
> void console_unlock(void)
> {
> ...
> up(&console_sem);
>
> actually ... that's not the klogd wake-up at all (!). I so suck today
> at bug analysis :-)
>
> It's the console lock()/unlock() sequence, and guess what does it:
>
> drivers/tty/tty_io.c: console_lock();
> drivers/tty/vt/selection.c: console_lock();
>
> and the vt.c code in a dozen places.
>
> So maybe it's some sort of tty related memory corruption that was
> made *visible* via the extra assert that the scheduler is doing? The
> pi_list is embedded in task struct.
>
> This would explain why only printk() triggers it and other wakeup
> patterns not.
>
> Now, i don't really like this theory either. Why is there no other
> type of corruption? And exactly why did only the task_struct::pi_lock
> field get corrupted while nearby fields not? Also, none of the fields
> near pi_lock are even remotely tty related.

Can lockdep just get confused by the lockdep_off/on calls in printk
while scheduling is allowed? There aren't many users of lockdep_off().

I'll can try again tomorrow to get a dump of all logs from the
watchdog, but enough for today...

2011-06-06 06:56:45

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 22:15, Arne Jansen wrote:
> On 05.06.2011 21:44, Ingo Molnar wrote:
>>
>> * Arne Jansen<[email protected]> wrote:
>>
>>> From the timing I see I'd guess it has something to do with the
>>> scheduler kicking in during printk. I'm neither familiar with the
>>> printk code nor with the scheduler.
>>
>> Yeah, that's the well-known wake-up of klogd:
>>
>> void console_unlock(void)
>> {
>> ...
>> up(&console_sem);
>>
>> actually ... that's not the klogd wake-up at all (!). I so suck today
>> at bug analysis :-)
>>
>> It's the console lock()/unlock() sequence, and guess what does it:
>>
>> drivers/tty/tty_io.c: console_lock();
>> drivers/tty/vt/selection.c: console_lock();
>>
>> and the vt.c code in a dozen places.
>>
>> So maybe it's some sort of tty related memory corruption that was
>> made *visible* via the extra assert that the scheduler is doing? The
>> pi_list is embedded in task struct.
>>
>> This would explain why only printk() triggers it and other wakeup
>> patterns not.
>>
>> Now, i don't really like this theory either. Why is there no other
>> type of corruption? And exactly why did only the task_struct::pi_lock
>> field get corrupted while nearby fields not? Also, none of the fields
>> near pi_lock are even remotely tty related.
>
> Can lockdep just get confused by the lockdep_off/on calls in printk
> while scheduling is allowed? There aren't many users of lockdep_off().
>
> I'll can try again tomorrow to get a dump of all logs from the
> watchdog, but enough for today...

I just let it dump the locks in debug_show_all_locks, even though
for some reason debug_locks is false. Don't know if the result is
helpful in any way, as it might well be inaccurate.

INFO: lockdep is turned off.

Showing all locks held in the system:
2 locks held by syslog-ng/21624:
#0: (&tty->atomic_write_lock){+.+.+.}, at: [<ffffffff8142ade3>]
tty_write_lock+0x23/0x60
#1: (&tty->output_lock){+.+...}, at: [<ffffffff8142ee7a>]
n_tty_write+0x14a/0x490
1 lock held by agetty/22174:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22175:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22176:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22177:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22178:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22179:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by agetty/22180:
#0: (&tty->atomic_read_lock){+.+...}, at: [<ffffffff8142fb86>]
n_tty_read+0x5f6/0x8e0
1 lock held by tail/22197:
#0: (&rq->lock){-.-.-.}, at: [<ffffffff8185ae42>] schedule+0xe2/0x940

Some more fact that might help figure out what happens:
- I nearly always either see all 10000 messages or only 10.
Never 9, never 11. I saw 40 once, and once 190.
- If I printk only 1000 lines instead of 10000, nothing bad happens
- If /var/log/syslog is not filled with binary garbage, I also just
see the 10 lines.

-Arne

2011-06-06 07:34:32

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 05.06.2011 17:26, Ingo Molnar wrote:
>
> * Ingo Molnar <[email protected]> wrote:
>
>>
>> * Arne Jansen <[email protected]> wrote:
>>
>>> sched.c:934: in function __task_rq_lock
>>> lockdep_assert_held(&p->pi_lock);
>>
>> Oh. Could you remove that line with the patch below - does it result
>> in a working system?
>>
>> Now, this patch alone just removes a debugging check - but i'm not
>> sure the debugging check is correct - we take the pi_lock in a raw
>> way - which means it's not lockdep covered.
>>
>> So how can lockdep_assert_held() be called on it?
>
> Ok, i'm wrong there - it's lockdep covered.
>
> I also reviewed all the __task_rq_lock() call sites and each of them
> has the pi_lock acquired. So unless both Peter and me are blind, the
> other option would be some sort of memory corruption corrupting the
> runqueue.

Another small idea, can we install the assert into a pre-0122ec5b02f766c
to see if it's an older problem that just got uncovered by the assert?

-Arne

>
> But ... that looks so unlikely here, it's clearly heavy printk() and
> console_sem twiddling that triggers the bug, not any other scheduler
> activity.
>
> Thanks,
>
> Ingo

2011-06-06 08:39:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Sun, 2011-06-05 at 17:13 +0200, Ingo Molnar wrote:

> Now, this patch alone just removes a debugging check - but i'm not
> sure the debugging check is correct - we take the pi_lock in a raw
> way - which means it's not lockdep covered.

Ever since tglx did s/raw_/arch_/g raw_ is covered by lockdep.

2011-06-06 09:02:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
>
> Can lockdep just get confused by the lockdep_off/on calls in printk
> while scheduling is allowed? There aren't many users of lockdep_off().

Yes!, in that case lock_is_held() returns false, triggering the warning.
I guess there's an argument to be made in favour of the below..

---
kernel/lockdep.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 53a6895..e4129cf 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3242,7 +3242,7 @@ int lock_is_held(struct lockdep_map *lock)
int ret = 0;

if (unlikely(current->lockdep_recursion))
- return ret;
+ return 1; /* avoid false negative lockdep_assert_held */

raw_local_irq_save(flags);
check_flags(flags);

2011-06-06 09:18:44

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 11:01, Peter Zijlstra wrote:
> On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
>>
>> Can lockdep just get confused by the lockdep_off/on calls in printk
>> while scheduling is allowed? There aren't many users of lockdep_off().
>
> Yes!, in that case lock_is_held() returns false, triggering the warning.
> I guess there's an argument to be made in favour of the below..


Two questions... is there any protection between the lockdep_recursion
check and the set to one? I guess in our case it is, because it's the
scheduler that calls it, but in general?
And why is lockdep needed to check if a lock is help? Isn't it reflected
in the lock structure itself?

-Arne

>
> ---
> kernel/lockdep.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 53a6895..e4129cf 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -3242,7 +3242,7 @@ int lock_is_held(struct lockdep_map *lock)
> int ret = 0;
>
> if (unlikely(current->lockdep_recursion))
> - return ret;
> + return 1; /* avoid false negative lockdep_assert_held */
>
> raw_local_irq_save(flags);
> check_flags(flags);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-06 09:25:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 11:18 +0200, Arne Jansen wrote:
> On 06.06.2011 11:01, Peter Zijlstra wrote:
> > On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
> >>
> >> Can lockdep just get confused by the lockdep_off/on calls in printk
> >> while scheduling is allowed? There aren't many users of lockdep_off().
> >
> > Yes!, in that case lock_is_held() returns false, triggering the warning.
> > I guess there's an argument to be made in favour of the below..
>
>
> Two questions... is there any protection between the lockdep_recursion
> check and the set to one? I guess in our case it is, because it's the
> scheduler that calls it, but in general?

Yeah, its always current->lockdep_recursion, so there is no
concurrency :-)

> And why is lockdep needed to check if a lock is help? Isn't it reflected
> in the lock structure itself?

Ah, so the difference is between _who_ owns the lock. Things like
spin_is_locked() check if the lock is taken but cannot tell you who owns
it, but lock_is_held() checks if the current context owns the lock.



2011-06-06 09:53:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 11:24 +0200, Peter Zijlstra wrote:
> On Mon, 2011-06-06 at 11:18 +0200, Arne Jansen wrote:
> > On 06.06.2011 11:01, Peter Zijlstra wrote:
> > > On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
> > >>
> > >> Can lockdep just get confused by the lockdep_off/on calls in printk
> > >> while scheduling is allowed? There aren't many users of lockdep_off().
> > >
> > > Yes!, in that case lock_is_held() returns false, triggering the warning.
> > > I guess there's an argument to be made in favour of the below..
> >
> >
> > Two questions... is there any protection between the lockdep_recursion
> > check and the set to one? I guess in our case it is, because it's the
> > scheduler that calls it, but in general?
>
> Yeah, its always current->lockdep_recursion, so there is no
> concurrency :-)
>
> > And why is lockdep needed to check if a lock is help? Isn't it reflected
> > in the lock structure itself?
>
> Ah, so the difference is between _who_ owns the lock. Things like
> spin_is_locked() check if the lock is taken but cannot tell you who owns
> it, but lock_is_held() checks if the current context owns the lock.

Also, lockdep_assert_held() doesn't generate any code when lockdep is
not configured.

2011-06-06 10:00:18

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 11:01, Peter Zijlstra wrote:
> On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
>>
>> Can lockdep just get confused by the lockdep_off/on calls in printk
>> while scheduling is allowed? There aren't many users of lockdep_off().
>
> Yes!, in that case lock_is_held() returns false, triggering the warning.
> I guess there's an argument to be made in favour of the below..

As expected this apparently fixes the problem. But are we confident
enough this is the true source? If it's really that simple, printk
calling into the scheduler, why am I the only one seeing this?

-Arne

>
> ---
> kernel/lockdep.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 53a6895..e4129cf 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -3242,7 +3242,7 @@ int lock_is_held(struct lockdep_map *lock)
> int ret = 0;
>
> if (unlikely(current->lockdep_recursion))
> - return ret;
> + return 1; /* avoid false negative lockdep_assert_held */
>
> raw_local_irq_save(flags);
> check_flags(flags);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-06 10:27:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 12:00 +0200, Arne Jansen wrote:
> As expected this apparently fixes the problem. But are we confident
> enough this is the true source? If it's really that simple, printk
> calling into the scheduler, why am I the only one seeing this?

Right, so apparently you have contention on console_sem and the up()
actually does a wakeup. I'm still trying to figure out how to do that.

2011-06-06 13:10:58

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Ingo Molnar <[email protected]> wrote:

> So some essential piece of the puzzle is still missing.

Oh. I think the essential piece of the puzzle might be this code in
printk():

asmlinkage int vprintk(const char *fmt, va_list args)
{
...
lockdep_off();
if (console_trylock_for_printk(this_cpu))
console_unlock();

lockdep_on();
...

So while i right now do not see how this (ancient) piece of logic
causes trouble, could you try the patch below, does it make the
WARN()+lockup go away?

Thanks,

Ingo

diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..1b9d2be 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -859,7 +859,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -947,7 +946,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Try to acquire and then immediately release the
* console semaphore. The release will do all the
* actual magic (print out buffers, wake up klogd,
- * etc).
+ * etc).
*
* The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
@@ -956,7 +955,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
if (console_trylock_for_printk(this_cpu))
console_unlock();

- lockdep_on();
out_restore_irqs:
raw_local_irq_restore(flags);

2011-06-06 13:12:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 15:10 +0200, Ingo Molnar wrote:


> diff --git a/kernel/printk.c b/kernel/printk.c
> index 3518539..1b9d2be 100644
> --- a/kernel/printk.c
> +++ b/kernel/printk.c
> @@ -859,7 +859,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> zap_locks();
> }
>
> - lockdep_off();

At the very least you should also do: s/raw_local_irq_/local_irq/ on
this function.

> spin_lock(&logbuf_lock);
> printk_cpu = this_cpu;
>
> @@ -947,7 +946,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> * Try to acquire and then immediately release the
> * console semaphore. The release will do all the
> * actual magic (print out buffers, wake up klogd,
> - * etc).
> + * etc).
> *
> * The console_trylock_for_printk() function
> * will release 'logbuf_lock' regardless of whether it
> @@ -956,7 +955,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> if (console_trylock_for_printk(this_cpu))
> console_unlock();
>
> - lockdep_on();
> out_restore_irqs:
> raw_local_irq_restore(flags);
>

2011-06-06 13:22:08

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Mon, 2011-06-06 at 15:10 +0200, Ingo Molnar wrote:
>
>
> > diff --git a/kernel/printk.c b/kernel/printk.c
> > index 3518539..1b9d2be 100644
> > --- a/kernel/printk.c
> > +++ b/kernel/printk.c
> > @@ -859,7 +859,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> > zap_locks();
> > }
> >
> > - lockdep_off();
>
> At the very least you should also do: s/raw_local_irq_/local_irq/ on
> this function.

Right, i've also removed the preempt_disable()/enable() pair - that
looks superfluous.

Updated patch below - still untested.

Thanks,

Ingo

---
kernel/printk.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)

Index: tip/kernel/printk.c
===================================================================
--- tip.orig/kernel/printk.c
+++ tip/kernel/printk.c
@@ -836,9 +836,8 @@ asmlinkage int vprintk(const char *fmt,
boot_delay_msec();
printk_delay();

- preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();

/*
@@ -859,7 +858,6 @@ asmlinkage int vprintk(const char *fmt,
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -947,7 +945,7 @@ asmlinkage int vprintk(const char *fmt,
* Try to acquire and then immediately release the
* console semaphore. The release will do all the
* actual magic (print out buffers, wake up klogd,
- * etc).
+ * etc).
*
* The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
@@ -956,11 +954,9 @@ asmlinkage int vprintk(const char *fmt,
if (console_trylock_for_printk(this_cpu))
console_unlock();

- lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);

- preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);

2011-06-06 13:26:02

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 12:26 +0200, Peter Zijlstra wrote:
> On Mon, 2011-06-06 at 12:00 +0200, Arne Jansen wrote:
> > As expected this apparently fixes the problem. But are we confident
> > enough this is the true source? If it's really that simple, printk
> > calling into the scheduler, why am I the only one seeing this?
>
> Right, so apparently you have contention on console_sem and the up()
> actually does a wakeup. I'm still trying to figure out how to do that.

On a related note, I'm not quite sure what we need that
lockdep_{off,on}() for, I just build and booted a kernel without them
and so far life is good.

I tried lockdep splats and in-scheduler printk()s (although the latter
will still mess up the box if printk()'s up(console_sem) triggers a
wakeup for obvious reasons).

---
Subject: lockdep, printk: Remove lockdep_off from printk()
From: Peter Zijlstra <[email protected]>
Date: Mon Jun 06 13:37:22 CEST 2011

Remove the lockdep_{off,on}() usage from printk() as it appears
superfluous, a kernel with this patch on can printk lockdep output just
fine.

Signed-off-by: Peter Zijlstra <[email protected]>
---
Index: linux-2.6/kernel/printk.c
===================================================================
--- linux-2.6.orig/kernel/printk.c
+++ linux-2.6/kernel/printk.c
@@ -838,7 +838,7 @@ asmlinkage int vprintk(const char *fmt,

preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();

/*
@@ -859,7 +859,6 @@ asmlinkage int vprintk(const char *fmt,
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -956,9 +955,8 @@ asmlinkage int vprintk(const char *fmt,
if (console_trylock_for_printk(this_cpu))
console_unlock();

- lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);

preempt_enable();
return printed_len;

2011-06-06 13:32:12

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 15:21 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, 2011-06-06 at 15:10 +0200, Ingo Molnar wrote:
> >
> >
> > > diff --git a/kernel/printk.c b/kernel/printk.c
> > > index 3518539..1b9d2be 100644
> > > --- a/kernel/printk.c
> > > +++ b/kernel/printk.c
> > > @@ -859,7 +859,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> > > zap_locks();
> > > }
> > >
> > > - lockdep_off();
> >
> > At the very least you should also do: s/raw_local_irq_/local_irq/ on
> > this function.
>
> Right, i've also removed the preempt_disable()/enable() pair - that
> looks superfluous.

aside from the preempt thing, such a patch was just tested, I had a
module trigger a lockdep warning, and stuck a printk() in the middle of
ttwu (conditional so I could actually boot).

So go ahead, and merge this.

We still need the patch to lock_is_held() though, since there's a few
other lockdep_off() sites in the kernel, and at least the NTFS one needs
to be able to schedule.

2011-06-06 14:58:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Sun, 2011-06-05 at 17:13 +0200, Ingo Molnar wrote:
>
> > Now, this patch alone just removes a debugging check - but i'm not
> > sure the debugging check is correct - we take the pi_lock in a raw
> > way - which means it's not lockdep covered.
>
> Ever since tglx did s/raw_/arch_/g raw_ is covered by lockdep.

It's not lockdep covered due to the lockdep_off(), or am i missing
something?

Thanks,

Ingo

2011-06-06 15:04:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
> >
> > Can lockdep just get confused by the lockdep_off/on calls in printk
> > while scheduling is allowed? There aren't many users of lockdep_off().
>
> Yes!, in that case lock_is_held() returns false, triggering the warning.
> I guess there's an argument to be made in favour of the below..
>
> ---
> kernel/lockdep.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 53a6895..e4129cf 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -3242,7 +3242,7 @@ int lock_is_held(struct lockdep_map *lock)
> int ret = 0;
>
> if (unlikely(current->lockdep_recursion))
> - return ret;
> + return 1; /* avoid false negative lockdep_assert_held */
>
> raw_local_irq_save(flags);
> check_flags(flags);

Oh, this explains the full bug i think.

lockdep_off() causes us to not track pi_lock, and thus the assert
inside printk() called try_to_wake_up() triggers incorrectly.

The reason why Arne triggered it is probably because console_lock
*wakeups* from printk are very, very rare: almost nothing actually
locks the console. His remote system probably has some VT-intense
application (screen?) that hits console_lock more intensely.

Arne, do you use some vt-intense application there?

The real fix might be to remove the lockdep_off()/on() call from
printk(), that looks actively evil ... we had to hack through several
layers of side-effects before we found the real bug - so it's not
like the off()/on() made things more robust!

So i think what we want to apply is the lockdep_off()/on() removal,
once Arne has it tested.

Thanks,

Ingo

2011-06-06 15:08:26

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Ingo Molnar <[email protected]> wrote:

> The real fix might be to remove the lockdep_off()/on() call from
> printk(), that looks actively evil ... we had to hack through
> several layers of side-effects before we found the real bug - so
> it's not like the off()/on() made things more robust!

The other obvious fix would be to *remove* the blasted wakeup from
printk(). It's a serious debugging robustness violation and it's not
like the wakeup is super important latency-wise.

We *already* have a timer tick driven klogd wakeup poll routine. So i
doubt we'd have many problems from not doing wakeups from printk().
Opinions?

Thanks,

Ingo

2011-06-06 15:10:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 16:58 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Sun, 2011-06-05 at 17:13 +0200, Ingo Molnar wrote:
> >
> > > Now, this patch alone just removes a debugging check - but i'm not
> > > sure the debugging check is correct - we take the pi_lock in a raw
> > > way - which means it's not lockdep covered.
> >
> > Ever since tglx did s/raw_/arch_/g raw_ is covered by lockdep.
>
> It's not lockdep covered due to the lockdep_off(), or am i missing
> something?

Your initial stmt was about the raw_ part, raw_ locks are tracked by
lockdep ever since tglx renamed them to arch_ and introduced new raw_
primitives.

But yeah, the lockdep_off() stuff also disables all tracking, on top of
that it also makes lock_is_held() return an unconditional false (even if
the lock was acquired before lockdep_off and thus registered).

My patch that fixes lock_is_held() should avoid false
lockdep_assert_held() explosions and this this printk() while rq->lock
problem.

Removing lockdep_off() usage from printk() would also be nice, but Mike
triggered logbuf_lock <-> rq->lock inversion with that due to the
up(&console_sem) wakeup muck.

Ideally we'd pull the up() out from under logbuf_lock, am looking at
that.

2011-06-06 15:47:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 17:09 +0200, Peter Zijlstra wrote:
> On Mon, 2011-06-06 at 16:58 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > On Sun, 2011-06-05 at 17:13 +0200, Ingo Molnar wrote:
> > >
> > > > Now, this patch alone just removes a debugging check - but i'm not
> > > > sure the debugging check is correct - we take the pi_lock in a raw
> > > > way - which means it's not lockdep covered.
> > >
> > > Ever since tglx did s/raw_/arch_/g raw_ is covered by lockdep.
> >
> > It's not lockdep covered due to the lockdep_off(), or am i missing
> > something?
>
> Your initial stmt was about the raw_ part, raw_ locks are tracked by
> lockdep ever since tglx renamed them to arch_ and introduced new raw_
> primitives.
>
> But yeah, the lockdep_off() stuff also disables all tracking, on top of
> that it also makes lock_is_held() return an unconditional false (even if
> the lock was acquired before lockdep_off and thus registered).
>
> My patch that fixes lock_is_held() should avoid false
> lockdep_assert_held() explosions and this this printk() while rq->lock
> problem.
>
> Removing lockdep_off() usage from printk() would also be nice, but Mike
> triggered logbuf_lock <-> rq->lock inversion with that due to the
> up(&console_sem) wakeup muck.
>
> Ideally we'd pull the up() out from under logbuf_lock, am looking at
> that.

something like so,.. but then there's a comment about console_sem and
logbuf_lock interlocking in interating ways, but it fails to mention how
and why. But I think it should maybe work..

Needs more staring at, preferably by someone who actually understands
that horrid mess :/ Also, this all still doesn't make printk() work
reliably while holding rq->lock.

---
Index: linux-2.6/kernel/printk.c
===================================================================
--- linux-2.6.orig/kernel/printk.c
+++ linux-2.6/kernel/printk.c
@@ -686,6 +686,7 @@ static void zap_locks(void)

oops_timestamp = jiffies;

+ debug_locks_off();
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
@@ -782,7 +783,7 @@ static inline int can_use_console(unsign
static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
- int retval = 0;
+ int retval = 0, wake = 0;

if (console_trylock()) {
retval = 1;
@@ -795,12 +796,14 @@ static int console_trylock_for_printk(un
*/
if (!can_use_console(cpu)) {
console_locked = 0;
- up(&console_sem);
+ wake = 1;
retval = 0;
}
}
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
+ if (wake)
+ up(&console_sem);
return retval;
}
static const char recursion_bug_msg [] =
@@ -836,9 +839,8 @@ asmlinkage int vprintk(const char *fmt,
boot_delay_msec();
printk_delay();

- preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();

/*
@@ -859,7 +861,6 @@ asmlinkage int vprintk(const char *fmt,
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -956,11 +957,9 @@ asmlinkage int vprintk(const char *fmt,
if (console_trylock_for_printk(this_cpu))
console_unlock();

- lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);

- preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
@@ -1271,8 +1270,8 @@ void console_unlock(void)
if (unlikely(exclusive_console))
exclusive_console = NULL;

- up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
+ up(&console_sem);
if (wake_klogd)
wake_up_klogd();
}

2011-06-06 15:52:54

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> Needs more staring at, preferably by someone who actually
> understands that horrid mess :/ Also, this all still doesn't make
> printk() work reliably while holding rq->lock.

So, what about my suggestion to just *remove* the wakeup from there
and use the deferred wakeup mechanism that klogd uses.

That would make printk() *visibly* more robust in practice.

[ It would also open up the way to possibly make printk() NMI entry
safe - currently we lock up if we printk in an NMI or #MC context
that happens to nest inside a printk(). ]

Thanks,

Ingo

2011-06-06 16:01:10

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > Needs more staring at, preferably by someone who actually
> > understands that horrid mess :/ Also, this all still doesn't make
> > printk() work reliably while holding rq->lock.
>
> So, what about my suggestion to just *remove* the wakeup from there
> and use the deferred wakeup mechanism that klogd uses.
>
> That would make printk() *visibly* more robust in practice.

That's currently done from the jiffy tick, do you want to effectively
delay releasing the console_sem for the better part of a jiffy?

> [ It would also open up the way to possibly make printk() NMI entry
> safe - currently we lock up if we printk in an NMI or #MC context
> that happens to nest inside a printk(). ]

Well, for that to happen you also need to deal with logbuf_lock nesting.
Personally I think using printk() from NMI context is quite beyond sane.

2011-06-06 16:08:30

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > Needs more staring at, preferably by someone who actually
> > > understands that horrid mess :/ Also, this all still doesn't make
> > > printk() work reliably while holding rq->lock.
> >
> > So, what about my suggestion to just *remove* the wakeup from there
> > and use the deferred wakeup mechanism that klogd uses.
> >
> > That would make printk() *visibly* more robust in practice.
>
> That's currently done from the jiffy tick, do you want to effectively
> delay releasing the console_sem for the better part of a jiffy?

Yes, and we already do it in some other circumstances. Can you see
any problem with that? klogd is an utter slowpath anyway.

> > [ It would also open up the way to possibly make printk() NMI entry
> > safe - currently we lock up if we printk in an NMI or #MC context
> > that happens to nest inside a printk(). ]
>
> Well, for that to happen you also need to deal with logbuf_lock
> nesting. [...]

That we could do as a robustness patch: detect when the current CPU
already holds it and do not lock up on that. This would also allow
printk() to work within a crashing printk(). (assuming the second
printk() does not crash - in which case it's game over anyway)

> Personally I think using printk() from NMI context is quite beyond
> sane.

Yeah, quite so, but it *can* happen so if we can make it work as a
free side-effect of a printk()-robustness increasing patch, why not?

Thanks,

Ingo

2011-06-06 16:13:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 18:08 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
> > > * Peter Zijlstra <[email protected]> wrote:
> > >
> > > > Needs more staring at, preferably by someone who actually
> > > > understands that horrid mess :/ Also, this all still doesn't make
> > > > printk() work reliably while holding rq->lock.
> > >
> > > So, what about my suggestion to just *remove* the wakeup from there
> > > and use the deferred wakeup mechanism that klogd uses.
> > >
> > > That would make printk() *visibly* more robust in practice.
> >
> > That's currently done from the jiffy tick, do you want to effectively
> > delay releasing the console_sem for the better part of a jiffy?
>
> Yes, and we already do it in some other circumstances.

We do?

> Can you see
> any problem with that? klogd is an utter slowpath anyway.

but console_sem isn't klogd. We delay klogd and that's perfectly fine,
but afaict we don't delay console_sem.

2011-06-06 16:18:06

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Mon, 2011-06-06 at 18:08 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
> > > > * Peter Zijlstra <[email protected]> wrote:
> > > >
> > > > > Needs more staring at, preferably by someone who actually
> > > > > understands that horrid mess :/ Also, this all still doesn't make
> > > > > printk() work reliably while holding rq->lock.
> > > >
> > > > So, what about my suggestion to just *remove* the wakeup from there
> > > > and use the deferred wakeup mechanism that klogd uses.
> > > >
> > > > That would make printk() *visibly* more robust in practice.
> > >
> > > That's currently done from the jiffy tick, do you want to effectively
> > > delay releasing the console_sem for the better part of a jiffy?
> >
> > Yes, and we already do it in some other circumstances.
>
> We do?

Yes, see the whole printk_pending logic, it delays:

wake_up_interruptible(&log_wait);

to the next jiffies tick.

> > Can you see
> > any problem with that? klogd is an utter slowpath anyway.
>
> but console_sem isn't klogd. We delay klogd and that's perfectly
> fine, but afaict we don't delay console_sem.

But console_sem is really a similar special case as klogd. See, it's
about a *printk*. That's rare by definition.

If someone on the console sees it he'll be startled by at least 10
msecs ;-) So delaying the wakeup to the next jiffy really fits into
the same approach as we already do with &log_wait, hm?

This would solve a real nightmare that has plagued us ever since
printk() has done wakeups directly - i.e. like forever.

Thanks,

Ingo

2011-06-06 16:38:56

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 18:17, Ingo Molnar wrote:
>
> * Peter Zijlstra<[email protected]> wrote:
>
>> On Mon, 2011-06-06 at 18:08 +0200, Ingo Molnar wrote:
>>> * Peter Zijlstra<[email protected]> wrote:
>>>
>>>> On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
>>>>> * Peter Zijlstra<[email protected]> wrote:
>>>>>
>>>>>> Needs more staring at, preferably by someone who actually
>>>>>> understands that horrid mess :/ Also, this all still doesn't make
>>>>>> printk() work reliably while holding rq->lock.
>>>>>
>>>>> So, what about my suggestion to just *remove* the wakeup from there
>>>>> and use the deferred wakeup mechanism that klogd uses.
>>>>>
>>>>> That would make printk() *visibly* more robust in practice.
>>>>
>>>> That's currently done from the jiffy tick, do you want to effectively
>>>> delay releasing the console_sem for the better part of a jiffy?
>>>
>>> Yes, and we already do it in some other circumstances.
>>
>> We do?
>
> Yes, see the whole printk_pending logic, it delays:
>
> wake_up_interruptible(&log_wait);
>
> to the next jiffies tick.
>
>>> Can you see
>>> any problem with that? klogd is an utter slowpath anyway.
>>
>> but console_sem isn't klogd. We delay klogd and that's perfectly
>> fine, but afaict we don't delay console_sem.
>
> But console_sem is really a similar special case as klogd. See, it's
> about a *printk*. That's rare by definition.
>
> If someone on the console sees it he'll be startled by at least 10
> msecs ;-) So delaying the wakeup to the next jiffy really fits into
> the same approach as we already do with&log_wait, hm?

As long as it doesn't scramble the order of the messages, the delay
imho doesn't matter even in very printk-heavy debugging sessions.

>
> This would solve a real nightmare that has plagued us ever since
> printk() has done wakeups directly - i.e. like forever.
>
> Thanks,
>
> Ingo

2011-06-06 16:44:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 18:17 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, 2011-06-06 at 18:08 +0200, Ingo Molnar wrote:
> > > * Peter Zijlstra <[email protected]> wrote:
> > >
> > > > On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
> > > > > * Peter Zijlstra <[email protected]> wrote:
> > > > >
> > > > > > Needs more staring at, preferably by someone who actually
> > > > > > understands that horrid mess :/ Also, this all still doesn't make
> > > > > > printk() work reliably while holding rq->lock.
> > > > >
> > > > > So, what about my suggestion to just *remove* the wakeup from there
> > > > > and use the deferred wakeup mechanism that klogd uses.
> > > > >
> > > > > That would make printk() *visibly* more robust in practice.
> > > >
> > > > That's currently done from the jiffy tick, do you want to effectively
> > > > delay releasing the console_sem for the better part of a jiffy?
> > >
> > > Yes, and we already do it in some other circumstances.
> >
> > We do?
>
> Yes, see the whole printk_pending logic, it delays:
>
> wake_up_interruptible(&log_wait);
>
> to the next jiffies tick.

Again, that's not console_sem ("..delay releasing console_sem.."
"..already done.." isn't true).

> > > Can you see
> > > any problem with that? klogd is an utter slowpath anyway.
> >
> > but console_sem isn't klogd. We delay klogd and that's perfectly
> > fine, but afaict we don't delay console_sem.
>
> But console_sem is really a similar special case as klogd. See, it's
> about a *printk*. That's rare by definition.

But its not rare, its _the_ lock that serialized the whole console
layer. Pretty much everything a console does goes through that lock.

By delaying this with 10ms (CONFIG_HZ=100) per printk could really delay
the whole boot process.

> If someone on the console sees it he'll be startled by at least 10
> msecs ;-) So delaying the wakeup to the next jiffy really fits into
> the same approach as we already do with &log_wait, hm?

Not convinced yet, I mean, don't get me wrong, I'd love to rid us of the
thing, but I'm not sure delaying the release of a resource like this is
the right approach.

Ahh, what we could do is something like the below and delay both the
acquire and release of the console_sem.

---
kernel/printk.c | 86 +++++++++++++++++++++++++-----------------------------
1 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..d3bdf5a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -686,6 +686,7 @@ static void zap_locks(void)

oops_timestamp = jiffies;

+ debug_locks_off();
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
@@ -774,16 +775,13 @@ static inline int can_use_console(unsigned int cpu)
* messages from a 'printk'. Return true (and with the
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
- *
- * This gets called with the 'logbuf_lock' spinlock held and
- * interrupts disabled. It should return with 'lockbuf_lock'
- * released but interrupts still disabled.
*/
static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
int retval = 0;

+ spin_lock(&logbuf_lock);
if (console_trylock()) {
retval = 1;

@@ -803,12 +801,27 @@ static int console_trylock_for_printk(unsigned int cpu)
spin_unlock(&logbuf_lock);
return retval;
}
+
static const char recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int recursion_bug;
static int new_text_line = 1;
static char printk_buf[1024];

+static DEFINE_PER_CPU(int, printk_pending);
+
+int printk_needs_cpu(int cpu)
+{
+ if (cpu_is_offline(cpu))
+ printk_tick();
+ return __this_cpu_read(printk_pending);
+}
+
+void printk_set_pending(void)
+{
+ this_cpu_write(printk_pending, 1);
+}
+
int printk_delay_msec __read_mostly;

static inline void printk_delay(void)
@@ -836,9 +849,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
boot_delay_msec();
printk_delay();

- preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();

/*
@@ -859,7 +871,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -942,25 +953,13 @@ asmlinkage int vprintk(const char *fmt, va_list args)
if (*p == '\n')
new_text_line = 1;
}
+ spin_unlock(&logbuf_lock);

- /*
- * Try to acquire and then immediately release the
- * console semaphore. The release will do all the
- * actual magic (print out buffers, wake up klogd,
- * etc).
- *
- * The console_trylock_for_printk() function
- * will release 'logbuf_lock' regardless of whether it
- * actually gets the semaphore or not.
- */
- if (console_trylock_for_printk(this_cpu))
- console_unlock();
+ printk_set_pending();

- lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);

- preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
@@ -1201,29 +1200,6 @@ int is_console_locked(void)
return console_locked;
}

-static DEFINE_PER_CPU(int, printk_pending);
-
-void printk_tick(void)
-{
- if (__this_cpu_read(printk_pending)) {
- __this_cpu_write(printk_pending, 0);
- wake_up_interruptible(&log_wait);
- }
-}
-
-int printk_needs_cpu(int cpu)
-{
- if (cpu_is_offline(cpu))
- printk_tick();
- return __this_cpu_read(printk_pending);
-}
-
-void wake_up_klogd(void)
-{
- if (waitqueue_active(&log_wait))
- this_cpu_write(printk_pending, 1);
-}
-
/**
* console_unlock - unlock the console system
*
@@ -1273,11 +1249,29 @@ void console_unlock(void)

up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
+
if (wake_klogd)
- wake_up_klogd();
+ wake_up_interruptible(&log_wait);
}
EXPORT_SYMBOL(console_unlock);

+void printk_tick(void)
+{
+ if (!__this_cpu_read(printk_pending))
+ return;
+
+ /*
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
+ * actual magic (print out buffers, wake up klogd,
+ * etc).
+ */
+ if (console_trylock_for_printk(smp_processor_id())) {
+ console_unlock();
+ __this_cpu_write(printk_pending, 0);
+ }
+}
+
/**
* console_conditional_schedule - yield the CPU if required
*

2011-06-06 16:45:32

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 18:38, Arne Jansen wrote:
> On 06.06.2011 18:17, Ingo Molnar wrote:
>>
>> * Peter Zijlstra<[email protected]> wrote:
>>
>>> On Mon, 2011-06-06 at 18:08 +0200, Ingo Molnar wrote:
>>>> * Peter Zijlstra<[email protected]> wrote:
>>>>
>>>>> On Mon, 2011-06-06 at 17:52 +0200, Ingo Molnar wrote:
>>>>>> * Peter Zijlstra<[email protected]> wrote:
>>>>>>
>>>>>>> Needs more staring at, preferably by someone who actually
>>>>>>> understands that horrid mess :/ Also, this all still doesn't make
>>>>>>> printk() work reliably while holding rq->lock.
>>>>>>
>>>>>> So, what about my suggestion to just *remove* the wakeup from there
>>>>>> and use the deferred wakeup mechanism that klogd uses.
>>>>>>
>>>>>> That would make printk() *visibly* more robust in practice.
>>>>>
>>>>> That's currently done from the jiffy tick, do you want to effectively
>>>>> delay releasing the console_sem for the better part of a jiffy?
>>>>
>>>> Yes, and we already do it in some other circumstances.
>>>
>>> We do?
>>
>> Yes, see the whole printk_pending logic, it delays:
>>
>> wake_up_interruptible(&log_wait);
>>
>> to the next jiffies tick.
>>
>>>> Can you see
>>>> any problem with that? klogd is an utter slowpath anyway.
>>>
>>> but console_sem isn't klogd. We delay klogd and that's perfectly
>>> fine, but afaict we don't delay console_sem.
>>
>> But console_sem is really a similar special case as klogd. See, it's
>> about a *printk*. That's rare by definition.
>>
>> If someone on the console sees it he'll be startled by at least 10
>> msecs ;-) So delaying the wakeup to the next jiffy really fits into
>> the same approach as we already do with&log_wait, hm?
>
> As long as it doesn't scramble the order of the messages, the delay
> imho doesn't matter even in very printk-heavy debugging sessions.

And, as important, doesn't reduce the throughput of printk. Having only
100 wakeups/s sounds like the throughput is limited to 100xsizeof(ring
buffer).

>
>>
>> This would solve a real nightmare that has plagued us ever since
>> printk() has done wakeups directly - i.e. like forever.
>>
>> Thanks,
>>
>> Ingo
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2011-06-06 16:50:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 18:44 +0200, Peter Zijlstra wrote:
> +void printk_tick(void)
> +{
> + if (!__this_cpu_read(printk_pending))
> + return;
> +
> + /*
> + * Try to acquire and then immediately release the
> + * console semaphore. The release will do all the
> + * actual magic (print out buffers, wake up klogd,
> + * etc).
> + */
> + if (console_trylock_for_printk(smp_processor_id())) {
> + console_unlock();
> + __this_cpu_write(printk_pending, 0);
> + }
> +}

Aside from not compiling (someone stuck a ref to wake_up_klogd somewhere
in lib/) this does delay the whole of printk() output by up to a jiffy,
if the machine dies funny you could be missing large parts of the
output :/

2011-06-06 16:54:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 18:45 +0200, Arne Jansen wrote:
> > As long as it doesn't scramble the order of the messages, the delay
> > imho doesn't matter even in very printk-heavy debugging sessions.
>
> And, as important, doesn't reduce the throughput of printk. Having only
> 100 wakeups/s sounds like the throughput is limited to 100xsizeof(ring
> buffer).

Right, that's another problem.. not really sure delaying all this is
going to work :/

2011-06-06 17:04:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 18:44 +0200, Peter Zijlstra wrote:
>
> @@ -942,25 +953,13 @@ asmlinkage int vprintk(const char *fmt, va_list args)
> if (*p == '\n')
> new_text_line = 1;
> }
> + spin_unlock(&logbuf_lock);
>
> - /*
> - * Try to acquire and then immediately release the
> - * console semaphore. The release will do all the
> - * actual magic (print out buffers, wake up klogd,
> - * etc).
> - *
> - * The console_trylock_for_printk() function
> - * will release 'logbuf_lock' regardless of whether it
> - * actually gets the semaphore or not.
> - */
> - if (console_trylock_for_printk(this_cpu))
> - console_unlock();

FWIW the existing printk recursion logic is broken, console_unlock()
clears printk_cpu but console_trylock_for_printk() can release
logbuf_lock and fail the trylock of console_sem, in which case a
subsequent printk() is perfectly valid and non-recursing.

2011-06-06 17:07:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> > As long as it doesn't scramble the order of the messages, the
> > delay imho doesn't matter even in very printk-heavy debugging
> > sessions.
>
> And, as important, doesn't reduce the throughput of printk. Having
> only 100 wakeups/s sounds like the throughput is limited to
> 100xsizeof(ring buffer).

Nah.

I for example *always* kill klogd during such printk based debugging
sessions, because it's *already* very easy to overflow its buffering
abilities. Also, klogd often interferes with debugging.

So i make the log buffer big enough to contain enough debugging info.

So it's a non-issue IMHO. Linus, what do you think?

Thanks,

Ingo

2011-06-06 17:12:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 19:07 +0200, Ingo Molnar wrote:
> * Arne Jansen <[email protected]> wrote:
>
> > > As long as it doesn't scramble the order of the messages, the
> > > delay imho doesn't matter even in very printk-heavy debugging
> > > sessions.
> >
> > And, as important, doesn't reduce the throughput of printk. Having
> > only 100 wakeups/s sounds like the throughput is limited to
> > 100xsizeof(ring buffer).
>
> Nah.
>
> I for example *always* kill klogd during such printk based debugging
> sessions, because it's *already* very easy to overflow its buffering
> abilities. Also, klogd often interferes with debugging.

Also, klogd is completely irrelevant, klogd doesn't do anything useful.
Writing things to the actual console otoh is very useful (you get to see
them on the screen/serial line).

Delaying the console_sem release will delay anything touching the
console_sem, including userland stuffs.

Delaying the console_sem acquire+release will delay showing important
printk() lines on your serial.

Both suck.

2011-06-06 17:12:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> > > but console_sem isn't klogd. We delay klogd and that's
> > > perfectly fine, but afaict we don't delay console_sem.
> >
> > But console_sem is really a similar special case as klogd. See,
> > it's about a *printk*. That's rare by definition.
>
> But its not rare, its _the_ lock that serialized the whole console
> layer. Pretty much everything a console does goes through that
> lock.

Please. Think.

If console_sem was so frequently held then why on earth were you
*unable* to trigger the lockup with an artificial printk() storm and
why on earth has almost no-one else but Arne triggered it? :-)

This bug is the very proof that console_sem is seldom contended!

> Ahh, what we could do is something like the below and delay both
> the acquire and release of the console_sem.

Yeah!

> +void printk_tick(void)
> +{
> + if (!__this_cpu_read(printk_pending))
> + return;
> +
> + /*
> + * Try to acquire and then immediately release the
> + * console semaphore. The release will do all the
> + * actual magic (print out buffers, wake up klogd,
> + * etc).
> + */
> + if (console_trylock_for_printk(smp_processor_id())) {
> + console_unlock();
> + __this_cpu_write(printk_pending, 0);
> + }
> +}

Arne does this fix the hang you are seeing?

Now, we probably don't want to do this in 3.0, just to give time for
interactions to found and complaints to be worded. So we could do the
minimal fix first and queue up the bigger change for 3.1.

Hm?

Thanks,

Ingo

2011-06-06 17:13:39

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> Aside from not compiling (someone stuck a ref to wake_up_klogd
> somewhere in lib/) this does delay the whole of printk() output by
> up to a jiffy, if the machine dies funny you could be missing large
> parts of the output :/

oh, that's bad and i missed that aspect.

We want the console *output* immediately. That's non-negotiable!

What we want to delay are the various wakeups of secondary recipients
of printks ...

Thanks,

Ingo

2011-06-06 17:44:09

by Mike Galbraith

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 17:08 +0200, Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
> > The real fix might be to remove the lockdep_off()/on() call from
> > printk(), that looks actively evil ... we had to hack through
> > several layers of side-effects before we found the real bug - so
> > it's not like the off()/on() made things more robust!
>
> The other obvious fix would be to *remove* the blasted wakeup from
> printk(). It's a serious debugging robustness violation and it's not
> like the wakeup is super important latency-wise.
>
> We *already* have a timer tick driven klogd wakeup poll routine. So i
> doubt we'd have many problems from not doing wakeups from printk().
> Opinions?

Seconded! I routinely whack that damn thing.

-Mike

2011-06-06 17:57:56

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 19:11, Ingo Molnar wrote:
>
> * Peter Zijlstra<[email protected]> wrote:

>
>> +void printk_tick(void)
>> +{
>> + if (!__this_cpu_read(printk_pending))
>> + return;
>> +
>> + /*
>> + * Try to acquire and then immediately release the
>> + * console semaphore. The release will do all the
>> + * actual magic (print out buffers, wake up klogd,
>> + * etc).
>> + */
>> + if (console_trylock_for_printk(smp_processor_id())) {
>> + console_unlock();
>> + __this_cpu_write(printk_pending, 0);
>> + }
>> +}
>
> Arne does this fix the hang you are seeing?

What do you want me to test? just replace printk_tick with the
above version? If I do that, the machine doesn't even boot up
any more.

2011-06-06 18:07:54

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Arne Jansen <[email protected]> wrote:

> On 06.06.2011 19:11, Ingo Molnar wrote:
> >
> >* Peter Zijlstra<[email protected]> wrote:
>
> >
> >>+void printk_tick(void)
> >>+{
> >>+ if (!__this_cpu_read(printk_pending))
> >>+ return;
> >>+
> >>+ /*
> >>+ * Try to acquire and then immediately release the
> >>+ * console semaphore. The release will do all the
> >>+ * actual magic (print out buffers, wake up klogd,
> >>+ * etc).
> >>+ */
> >>+ if (console_trylock_for_printk(smp_processor_id())) {
> >>+ console_unlock();
> >>+ __this_cpu_write(printk_pending, 0);
> >>+ }
> >>+}
> >
> >Arne does this fix the hang you are seeing?
>
> What do you want me to test? just replace printk_tick with the
> above version? If I do that, the machine doesn't even boot up any
> more.

Yeah.

So i think we want two patches:

- The first one that minimally removes the lockdep_off()/on() dance
and fixes the regression: the patch that i sent earlier today.
I *think* that should fix the crash.

3.0 material.

- The second one that moves console_sem wakeups to the jiffies tick.
It does not push the acquiring and the console->write() calls to
jiffies context, only delays the wakeup.

3.1 material.

Thanks,

Ingo

2011-06-06 18:14:33

by Arne Jansen

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On 06.06.2011 20:07, Ingo Molnar wrote:
>
> * Arne Jansen<[email protected]> wrote:
>
>> On 06.06.2011 19:11, Ingo Molnar wrote:
>>>
>>> * Peter Zijlstra<[email protected]> wrote:
>>
>>>
>>>> +void printk_tick(void)
>>>> +{
>>>> + if (!__this_cpu_read(printk_pending))
>>>> + return;
>>>> +
>>>> + /*
>>>> + * Try to acquire and then immediately release the
>>>> + * console semaphore. The release will do all the
>>>> + * actual magic (print out buffers, wake up klogd,
>>>> + * etc).
>>>> + */
>>>> + if (console_trylock_for_printk(smp_processor_id())) {
>>>> + console_unlock();
>>>> + __this_cpu_write(printk_pending, 0);
>>>> + }
>>>> +}
>>>
>>> Arne does this fix the hang you are seeing?
>>
>> What do you want me to test? just replace printk_tick with the
>> above version? If I do that, the machine doesn't even boot up any
>> more.
>
> Yeah.
>
> So i think we want two patches:
>
> - The first one that minimally removes the lockdep_off()/on() dance
> and fixes the regression: the patch that i sent earlier today.
> I *think* that should fix the crash.

Isn't the regression just the false lockdep_assert_held(&p->pi_lock)?
The patch Peter sent earlier seems like the minimal changeset to fix
that, plus it fixes a bug that might pop up somewhere else, too.

>
> 3.0 material.
>
> - The second one that moves console_sem wakeups to the jiffies tick.
> It does not push the acquiring and the console->write() calls to
> jiffies context, only delays the wakeup.
>
> 3.1 material.
>
> Thanks,
>
> Ingo

2011-06-06 18:15:38

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 20:07 +0200, Ingo Molnar wrote:

> - The first one that minimally removes the lockdep_off()/on() dance
> and fixes the regression: the patch that i sent earlier today.
> I *think* that should fix the crash.
>
> 3.0 material.

I think the lock_is_held() fix is the urgent one, all the others are
make printk() suck less patches, ie. not so very important.

Without that lock_is_held() thing all lockdep_off() sites, inc ntfs will
still be able to trigger this problem.

2011-06-06 22:08:23

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Mon, 2011-06-06 at 20:07 +0200, Ingo Molnar wrote:
>
> > - The first one that minimally removes the lockdep_off()/on() dance
> > and fixes the regression: the patch that i sent earlier today.
> > I *think* that should fix the crash.
> >
> > 3.0 material.
>
> I think the lock_is_held() fix is the urgent one, all the others are
> make printk() suck less patches, ie. not so very important.
>
> Without that lock_is_held() thing all lockdep_off() sites, inc ntfs will
> still be able to trigger this problem.

ok. Could you please send a changeloggified patch with a
Reported-and-tested-by from Arne in a new thread, etc?

Thanks,

Ingo

2011-06-07 05:21:06

by Mike Galbraith

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 11:01 +0200, Peter Zijlstra wrote:
> On Sun, 2011-06-05 at 22:15 +0200, Arne Jansen wrote:
> >
> > Can lockdep just get confused by the lockdep_off/on calls in printk
> > while scheduling is allowed? There aren't many users of lockdep_off().
>
> Yes!, in that case lock_is_held() returns false, triggering the warning.
> I guess there's an argument to be made in favour of the below..

I've been testing/rebooting for a couple hours now, x3550 M3 lockup woes
using Arne's config are history.

All-better-by: (nah, heel sense-o-humor, _down_ i say;)

> ---
> kernel/lockdep.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 53a6895..e4129cf 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -3242,7 +3242,7 @@ int lock_is_held(struct lockdep_map *lock)
> int ret = 0;
>
> if (unlikely(current->lockdep_recursion))
> - return ret;
> + return 1; /* avoid false negative lockdep_assert_held */
>
> raw_local_irq_save(flags);
> check_flags(flags);
>

2011-06-08 15:50:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Mon, 2011-06-06 at 19:11 +0200, Peter Zijlstra wrote:
> Delaying the console_sem release will delay anything touching the
> console_sem, including userland stuffs.
>
> Delaying the console_sem acquire+release will delay showing important
> printk() lines on your serial.
>
> Both suck.

I came up with the below hackery, seems to actually boot and such on a
lockdep enabled kernel (although Ingo did report lockups with a partial
version of the patch, still need to look at that).

The idea is to use the console_sem.lock instead of the semaphore itself,
we flush the console when console_sem.count > 0, which means its
uncontended. Its more or less equivalent to down_trylock() + up(),
except it never releases the sem internal lock, and optimizes the count
fiddling away.

It doesn't require a wakeup because any real semaphore contention will
still be spinning on the spinlock instead of enqueueing itself on the
waitlist.

Its rather ugly, exposes semaphore internals in places it shouldn't,
although we could of course expose some primitives for this, but then
people might thing it'd be okay to use them etc..

/me puts on the asbestos underwear

comments?

---
kernel/printk.c | 113 +++++++++++++++++++-----------------------------------
1 files changed, 40 insertions(+), 73 deletions(-)

diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..127b003 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -686,6 +686,7 @@ static void zap_locks(void)

oops_timestamp = jiffies;

+ debug_locks_off();
/* If a crash is occurring, make sure we can't deadlock */
spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
@@ -769,40 +770,6 @@ static inline int can_use_console(unsigned int cpu)
return cpu_online(cpu) || have_callable_console();
}

-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- *
- * This gets called with the 'logbuf_lock' spinlock held and
- * interrupts disabled. It should return with 'lockbuf_lock'
- * released but interrupts still disabled.
- */
-static int console_trylock_for_printk(unsigned int cpu)
- __releases(&logbuf_lock)
-{
- int retval = 0;
-
- if (console_trylock()) {
- retval = 1;
-
- /*
- * If we can't use the console, we need to release
- * the console semaphore by hand to avoid flushing
- * the buffer. We need to hold the console semaphore
- * in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- up(&console_sem);
- retval = 0;
- }
- }
- printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
- return retval;
-}
static const char recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int recursion_bug;
@@ -823,6 +790,8 @@ static inline void printk_delay(void)
}
}

+static void __console_flush(void);
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
int printed_len = 0;
@@ -836,9 +805,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
boot_delay_msec();
printk_delay();

- preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();

/*
@@ -859,7 +827,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();
}

- lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;

@@ -942,25 +909,18 @@ asmlinkage int vprintk(const char *fmt, va_list args)
if (*p == '\n')
new_text_line = 1;
}
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);

- /*
- * Try to acquire and then immediately release the
- * console semaphore. The release will do all the
- * actual magic (print out buffers, wake up klogd,
- * etc).
- *
- * The console_trylock_for_printk() function
- * will release 'logbuf_lock' regardless of whether it
- * actually gets the semaphore or not.
- */
- if (console_trylock_for_printk(this_cpu))
- console_unlock();
+ spin_lock(&console_sem.lock);
+ if (console_sem.count > 0 && can_use_console(smp_processor_id()))
+ __console_flush();
+
+ spin_unlock(&console_sem.lock);

- lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);

- preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
@@ -1224,31 +1184,12 @@ void wake_up_klogd(void)
this_cpu_write(printk_pending, 1);
}

-/**
- * console_unlock - unlock the console system
- *
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
- *
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * If there is output waiting for klogd, we wake it up.
- *
- * console_unlock(); may be called from any context.
- */
-void console_unlock(void)
+static void __console_flush(void)
{
unsigned long flags;
unsigned _con_start, _log_end;
unsigned wake_klogd = 0;

- if (console_suspended) {
- up(&console_sem);
- return;
- }
-
console_may_schedule = 0;

for ( ; ; ) {
@@ -1271,11 +1212,37 @@ void console_unlock(void)
if (unlikely(exclusive_console))
exclusive_console = NULL;

- up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
+
if (wake_klogd)
wake_up_klogd();
}
+
+/**
+ * console_unlock - unlock the console system
+ *
+ * Releases the console_lock which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * If there is output waiting for klogd, we wake it up.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ if (console_suspended) {
+ up(&console_sem);
+ return;
+ }
+
+ __console_flush();
+
+ up(&console_sem);
+}
EXPORT_SYMBOL(console_unlock);

/**

2011-06-08 19:18:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> I came up with the below hackery, seems to actually boot and such
> on a lockdep enabled kernel (although Ingo did report lockups with
> a partial version of the patch, still need to look at that).
>
> The idea is to use the console_sem.lock instead of the semaphore
> itself, we flush the console when console_sem.count > 0, which
> means its uncontended. Its more or less equivalent to
> down_trylock() + up(), except it never releases the sem internal
> lock, and optimizes the count fiddling away.
>
> It doesn't require a wakeup because any real semaphore contention
> will still be spinning on the spinlock instead of enqueueing itself
> on the waitlist.
>
> Its rather ugly, exposes semaphore internals in places it
> shouldn't, although we could of course expose some primitives for
> this, but then people might thing it'd be okay to use them etc..
>
> /me puts on the asbestos underwear

Hm, the no-wakeup aspect seems rather useful.

Could we perhaps remove console_sem and replace it with a mutex and
do something like this with a mutex and its ->wait_lock?

We'd have two happy side effects:

- we'd thus remove one of the last core kernel semaphore users
- we'd gain lockdep coverage for console locking as a bonus ...

Thanks,

Ingo

2011-06-08 19:35:02

by Linus Torvalds

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, Jun 8, 2011 at 12:17 PM, Ingo Molnar <[email protected]> wrote:
>
> Hm, the no-wakeup aspect seems rather useful.

I like the patch, but I would like to get it much better abstracted
out. Make some kind of

void atomic_down();
int atomic_down_trylock();
void atomic_up();

interfaces that basically get the semaphore in an "atomic" mode that
leaves the semaphore spinlock locked in the locked region.

So they would basically be spinlocks that can then be mixed with
normal sleeping semaphore usage.

> Could we perhaps remove console_sem and replace it with a mutex and
> do something like this with a mutex and its ->wait_lock?

That would be horrible.

The reason it works well for semaphores is that the semaphores have no
architecture-specific fast-path any more, and everything is done
within the spinlock.

With a mutex? Not good. We have several different mutex
implementations, along with fastpaths that never touch the spinlock at
all etc. And that is very much on purpose, because the spinlock
approach is noticeably slower and needs more atomic accesses. In
contrast, the semaphores are "legacy interfaces" and aren't considered
high-performance locking any more.

Linus

2011-06-08 19:45:49

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, 2011-06-08 at 21:17 +0200, Ingo Molnar wrote:
> Hm, the no-wakeup aspect seems rather useful.
>
> Could we perhaps remove console_sem and replace it with a mutex and
> do something like this with a mutex and its ->wait_lock?
>
> We'd have two happy side effects:
>
> - we'd thus remove one of the last core kernel semaphore users
> - we'd gain lockdep coverage for console locking as a bonus ...

The mutex thing is more complex due to the mutex fast path, the
advantage of the semaphore is its simple implementation that always
takes the internal lock.

I guess I can make it happen, but its a tad more tricky.

2011-06-08 20:28:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, 2011-06-08 at 12:27 -0700, Linus Torvalds wrote:
> Make some kind of
>
> void atomic_down();
> int atomic_down_trylock();
> void atomic_up();

atomic_down() is a tad iffy, it would have to wait for an actual
semaphore owner, which might sleep etc.. So I skipped it.

The other two are implemented here, and assume IRQs are disabled, we
could add _irq and _irqsave versions of both, but since there are no
users I avoided the effort.

---
include/linux/semaphore.h | 3 +++
kernel/semaphore.c | 36 +++++++++++++++++++++++++++++++++++-
2 files changed, 38 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/semaphore.h
===================================================================
--- linux-2.6.orig/include/linux/semaphore.h
+++ linux-2.6/include/linux/semaphore.h
@@ -43,4 +43,7 @@ extern int __must_check down_trylock(str
extern int __must_check down_timeout(struct semaphore *sem, long jiffies);
extern void up(struct semaphore *sem);

+extern int atomic_down_trylock(struct semaphore *sem);
+extern void atomic_up(struct semaphore *sem);
+
#endif /* __LINUX_SEMAPHORE_H */
Index: linux-2.6/kernel/semaphore.c
===================================================================
--- linux-2.6.orig/kernel/semaphore.c
+++ linux-2.6/kernel/semaphore.c
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
* down_trylock - try to acquire the semaphore, without waiting
* @sem: the semaphore to be acquired
*
- * Try to acquire the semaphore atomically. Returns 0 if the mutex has
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
* been acquired successfully or 1 if it it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
@@ -143,6 +143,29 @@ int down_trylock(struct semaphore *sem)
EXPORT_SYMBOL(down_trylock);

/**
+ * atomic_down_trylock - try to acquire the semaphore internal lock
+ * #sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore internal lock, blocking all other semaphore
+ * operations. Returns 0 if the trylock has been acquired successfully or
+ * 1 if it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * NOTE: assumes IRQs are disabled.
+ */
+int atomic_down_trylock(struct semaphore *sem)
+{
+ spin_lock(&sem->lock);
+ if (sem->count > 0)
+ return 0;
+
+ spin_unlock(&sem->lock);
+ return 1;
+}
+
+/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
* @jiffies: how long to wait before failing
@@ -188,6 +211,17 @@ void up(struct semaphore *sem)
}
EXPORT_SYMBOL(up);

+/**
+ * atomic_up - release the semaphore internal lock
+ * @sem: the semaphore to release the internal lock of
+ *
+ * Release the semaphore internal lock.
+ */
+void atomic_up(struct semaphore *sem)
+{
+ spin_unlock(&sem->lock);
+}
+
/* Functions for the contended case */

struct semaphore_waiter {

2011-06-08 20:53:21

by Ingo Molnar

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages


* Peter Zijlstra <[email protected]> wrote:

> On Wed, 2011-06-08 at 21:17 +0200, Ingo Molnar wrote:
> > Hm, the no-wakeup aspect seems rather useful.
> >
> > Could we perhaps remove console_sem and replace it with a mutex and
> > do something like this with a mutex and its ->wait_lock?
> >
> > We'd have two happy side effects:
> >
> > - we'd thus remove one of the last core kernel semaphore users
> > - we'd gain lockdep coverage for console locking as a bonus ...
>
> The mutex thing is more complex due to the mutex fast path, the
> advantage of the semaphore is its simple implementation that always
> takes the internal lock.
>
> I guess I can make it happen, but its a tad more tricky.

Hm, i thought it would be possible to only express it via the
slowpath: if mutex_trylock() succeeds then *all* execution goes into
the slowpath so we don't have to take all the fastpaths into account.

If that's notpossible then i think you and Linus are right that it's
not worth creating all the per arch fastpath special cases for
something like this.

The non-removal of the console_sem is sad though. Sniff.

Thanks,

Ingo

2011-06-08 20:54:49

by Linus Torvalds

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, Jun 8, 2011 at 1:32 PM, Peter Zijlstra <[email protected]> wrote:
>
> atomic_down() is a tad iffy, it would have to wait for an actual
> semaphore owner, which might sleep etc.. So I skipped it.

I think sleeping would be fine: the "atomic" part is about the code
the semaphore protects, not about the down() itself.

But the way you made the semantics be (caller has to disable
interrupts) for the other helpers, that doesn't really work.

Linus

2011-06-08 20:54:33

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, 8 Jun 2011, Peter Zijlstra wrote:

> On Wed, 2011-06-08 at 12:27 -0700, Linus Torvalds wrote:
> > Make some kind of
> >
> > void atomic_down();
> > int atomic_down_trylock();
> > void atomic_up();
>
> atomic_down() is a tad iffy, it would have to wait for an actual
> semaphore owner, which might sleep etc.. So I skipped it.
>
> The other two are implemented here, and assume IRQs are disabled, we
> could add _irq and _irqsave versions of both, but since there are no
> users I avoided the effort.
>
> ---
> include/linux/semaphore.h | 3 +++
> kernel/semaphore.c | 36 +++++++++++++++++++++++++++++++++++-

Can we please confine this to kernel/printk.c ?

I can see the creative abuse of this already.

Thanks,

tglx

2011-06-08 21:45:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, 2011-06-08 at 22:52 +0200, Ingo Molnar wrote:
> Hm, i thought it would be possible to only express it via the
> slowpath: if mutex_trylock() succeeds then *all* execution goes into
> the slowpath so we don't have to take all the fastpaths into account.

Right, but you first have to take wait_lock, then do the trylock, but
that's complicated for asm/mutex-null.h because trylock will then also
try to obtain the wait_lock.

You can do it by creating ___mutex_trylock_slowpath() which contains the
meat of __mutex_trylock_slowpath() and then implement
atomic_mutex_trylock{_irq,_irqsave,} using that, not releasing wait_lock
on success.

Shouldn't be too bad, but it ain't too pretty either.

Furthermore, like I said in my initial patch, I share Thomas' worry
about 'creative' usage of these primitives.

2011-06-08 21:57:20

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [debug patch] printk: Add a printk killswitch to robustify NMI watchdog messages

On Wed, 8 Jun 2011, Peter Zijlstra wrote:

> On Wed, 2011-06-08 at 22:52 +0200, Ingo Molnar wrote:
> > Hm, i thought it would be possible to only express it via the
> > slowpath: if mutex_trylock() succeeds then *all* execution goes into
> > the slowpath so we don't have to take all the fastpaths into account.
>
> Right, but you first have to take wait_lock, then do the trylock, but
> that's complicated for asm/mutex-null.h because trylock will then also
> try to obtain the wait_lock.
>
> You can do it by creating ___mutex_trylock_slowpath() which contains the
> meat of __mutex_trylock_slowpath() and then implement
> atomic_mutex_trylock{_irq,_irqsave,} using that, not releasing wait_lock
> on success.
>
> Shouldn't be too bad, but it ain't too pretty either.
>
> Furthermore, like I said in my initial patch, I share Thomas' worry
> about 'creative' usage of these primitives.

We are way better off with the semaphore abuse confined to printk.c.

A mutex would give us lockdep coverage, but due to the strict owner
semantics - which we have already proven in -rt by converting it to a
mutex - we can annotate console_sem lockdep wise and still keep the
nifty semaphore abuse.

Further I don't have any worries about -rt either as a RT task using
printf is doomed anyway and we should not encourage that by making it
somehow more deterministic.

Thanks,

tglx