Every now and then I end up with an undebuggable issue because multiple
CPUs hit something at the same time and everything is interleaved:
CR: 48000082 XER: 00000000
,RI
c0000003dc72fd10
,LE
d0000000065b84e8
Instruction dump:
MSR: 8000000100029033
Very annoying.
Some architectures already have their own recursive locking for oopses
and we have another version for serialising dump_stack.
Create a common version and use it everywhere (oopses, BUGs, WARNs,
dump_stack, soft lockups and hard lockups). A few test cases were
used to verify the series:
A trivial module to create concurrent WARNs, BUGs and oopses:
http://ozlabs.org/~anton/junkcode/warnstorm.tar.gz
And one to create concurrent soft and hard lockups:
http://ozlabs.org/~anton/junkcode/badguy.tar.gz
Anton Blanchard (7):
Add die_spin_lock_{irqsave,irqrestore}
powerpc: Use die_spin_lock_{irqsave,irqrestore}
arm: Use die_spin_lock_{irqsave,irqrestore}
x86: Use die_spin_lock_{irqsave,irqrestore}
watchdog: Serialise soft lockup errors with
die_spin_lock_{irqsave,irqrestore}
dump_stack: Serialise dump_stack with
die_spin_lock_{irqsave,irqrestore}
powerpc: Serialise BUG and WARNs with
die_spin_lock_{irqsave,irqrestore}
arch/arm/kernel/traps.c | 26 ++---------------
arch/powerpc/kernel/traps.c | 68 ++++++++++++++++++++++++++-------------------
arch/x86/kernel/dumpstack.c | 26 ++---------------
include/linux/die_lock.h | 23 +++++++++++++++
kernel/watchdog.c | 4 +++
lib/Makefile | 1 +
lib/die_lock.c | 43 ++++++++++++++++++++++++++++
lib/dump_stack.c | 40 +++-----------------------
8 files changed, 120 insertions(+), 111 deletions(-)
create mode 100644 include/linux/die_lock.h
create mode 100644 lib/die_lock.c
--
2.1.0
Many architectures have their own oops locking code that allows
the lock to be taken recursively. Create a common version.
The interface is deliberately specific to die/oops handling rather than
a generic locking primitive, so it can't be abused in other parts of the kernel.
Signed-off-by: Anton Blanchard <[email protected]>
---
include/linux/die_lock.h | 23 +++++++++++++++++++++++
lib/Makefile | 1 +
lib/die_lock.c | 43 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 67 insertions(+)
create mode 100644 include/linux/die_lock.h
create mode 100644 lib/die_lock.c
diff --git a/include/linux/die_lock.h b/include/linux/die_lock.h
new file mode 100644
index 0000000..540d09d
--- /dev/null
+++ b/include/linux/die_lock.h
@@ -0,0 +1,23 @@
+#ifndef __LINUX_DIE_LOCK_H
+#define __LINUX_DIE_LOCK_H
+
+#include <linux/typecheck.h>
+
+/**
+ * die_spin_lock_irqsave - lock die spinlock
+ * @flags: interrupt state is saved here
+ *
+ * The die spinlock is used to serialise output during oopses, BUGs and
+ * WARNs. It can be taken recursively so that nested oopses will not
+ * lock up.
+ */
+unsigned long __die_spin_lock_irqsave(void);
+#define die_spin_lock_irqsave(flags) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = __die_spin_lock_irqsave(); \
+ } while (0)
+
+void die_spin_unlock_irqrestore(unsigned long flags);
+
+#endif /* __LINUX_DIE_LOCK_H */
diff --git a/lib/Makefile b/lib/Makefile
index 3c3b30b..7d87a80 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -28,6 +28,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
+obj-y += die_lock.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += kstrtox.o
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
diff --git a/lib/die_lock.c b/lib/die_lock.c
new file mode 100644
index 0000000..5d2de2e
--- /dev/null
+++ b/lib/die_lock.c
@@ -0,0 +1,43 @@
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned long __die_spin_lock_irqsave(void)
+{
+ unsigned long flags;
+ int cpu;
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ if (!arch_spin_trylock(&die_lock)) {
+ if (cpu != die_owner)
+ arch_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+
+ return flags;
+}
+
+/**
+ * die_spin_unlock_irqrestore - Unlock die spinlock
+ * @flags: interrupt state to restore
+ *
+ * Unlock die spinlock and restore interrupt state. This must be
+ * paired with die_spin_lock_irqsave.
+ */
+void die_spin_unlock_irqrestore(unsigned long flags)
+{
+ die_nest_count--;
+ if (!die_nest_count) {
+ die_owner = -1;
+ /* Nest count reaches zero, release the lock. */
+ arch_spin_unlock(&die_lock);
+ }
+ raw_local_irq_restore(flags);
+}
--
2.1.0
Replace the powerpc specific oops locking with the common one.
Signed-off-by: Anton Blanchard <[email protected]>
---
arch/powerpc/kernel/traps.c | 24 +++---------------------
1 file changed, 3 insertions(+), 21 deletions(-)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 19e4744..4cc1e72 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -36,6 +36,7 @@
#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
+#include <linux/die_lock.h>
#include <asm/emulated_ops.h>
#include <asm/pgtable.h>
@@ -109,14 +110,10 @@ static void pmac_backlight_unblank(void)
static inline void pmac_backlight_unblank(void) { }
#endif
-static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
static int die_counter;
static unsigned __kprobes long oops_begin(struct pt_regs *regs)
{
- int cpu;
unsigned long flags;
if (debugger(regs))
@@ -124,17 +121,7 @@ static unsigned __kprobes long oops_begin(struct pt_regs *regs)
oops_enter();
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!arch_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- arch_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
+ die_spin_lock_irqsave(flags);
console_verbose();
bust_spinlocks(1);
if (machine_is(powermac))
@@ -146,15 +133,10 @@ static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs,
int signr)
{
bust_spinlocks(0);
- die_owner = -1;
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
- die_nest_count--;
oops_exit();
printk("\n");
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- arch_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
+ die_spin_unlock_irqrestore(flags);
crash_fadump(regs, "die oops");
--
2.1.0
Replace the ARM specific oops locking with the common one.
Signed-off-by: Anton Blanchard <[email protected]>
---
arch/arm/kernel/traps.c | 26 +++-----------------------
1 file changed, 3 insertions(+), 23 deletions(-)
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 788e23f..3e3469d 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/irq.h>
+#include <linux/die_lock.h>
#include <linux/atomic.h>
#include <asm/cacheflush.h>
@@ -263,28 +264,12 @@ static int __die(const char *str, int err, struct pt_regs *regs)
return 0;
}
-static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
static unsigned long oops_begin(void)
{
- int cpu;
unsigned long flags;
oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!arch_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- arch_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
+ die_spin_lock_irqsave(flags);
console_verbose();
bust_spinlocks(1);
return flags;
@@ -296,13 +281,8 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
crash_kexec(regs);
bust_spinlocks(0);
- die_owner = -1;
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- arch_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
+ die_spin_unlock_irqrestore(flags);
oops_exit();
if (in_interrupt())
--
2.1.0
Replace the x86 specific oops locking with the common one.
Signed-off-by: Anton Blanchard <[email protected]>
---
arch/x86/kernel/dumpstack.c | 26 +++-----------------------
1 file changed, 3 insertions(+), 23 deletions(-)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7..b635f73 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -15,6 +15,7 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <linux/die_lock.h>
#include <asm/stacktrace.h>
@@ -196,28 +197,12 @@ void show_stack(struct task_struct *task, unsigned long *sp)
show_stack_log_lvl(task, NULL, sp, bp, "");
}
-static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
unsigned long oops_begin(void)
{
- int cpu;
unsigned long flags;
oops_enter();
-
- /* racy, but better than risking deadlock. */
- raw_local_irq_save(flags);
- cpu = smp_processor_id();
- if (!arch_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
- /* nested oops. should stop eventually */;
- else
- arch_spin_lock(&die_lock);
- }
- die_nest_count++;
- die_owner = cpu;
+ die_spin_lock_irqsave(flags);
console_verbose();
bust_spinlocks(1);
return flags;
@@ -231,13 +216,8 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
crash_kexec(regs);
bust_spinlocks(0);
- die_owner = -1;
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
- die_nest_count--;
- if (!die_nest_count)
- /* Nest count reaches zero, release the lock. */
- arch_spin_unlock(&die_lock);
- raw_local_irq_restore(flags);
+ die_spin_unlock_irqrestore(flags);
oops_exit();
if (!signr)
--
2.1.0
A simple kernel module was used to create concurrent soft and
hard lockups:
http://ozlabs.org/~anton/junkcode/badguy.tar.gz
Signed-off-by: Anton Blanchard <[email protected]>
---
kernel/watchdog.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf118..dd161e3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <linux/die_lock.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -313,6 +314,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ unsigned long flags;
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -384,6 +386,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
}
+ die_spin_lock_irqsave(flags);
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -394,6 +397,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
show_regs(regs);
else
dump_stack();
+ die_spin_unlock_irqrestore(flags);
if (softlockup_all_cpu_backtrace) {
/* Avoid generating two back traces for current
--
2.1.0
Remove another version of a recursive lock in dump_stack.
Signed-off-by: Anton Blanchard <[email protected]>
---
lib/dump_stack.c | 40 ++++------------------------------------
1 file changed, 4 insertions(+), 36 deletions(-)
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index 6745c62..f64ee3c 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -7,7 +7,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>
-#include <linux/atomic.h>
+#include <linux/die_lock.h>
static void __dump_stack(void)
{
@@ -20,44 +20,12 @@ static void __dump_stack(void)
*
* Architectures can override this implementation by implementing its own.
*/
-#ifdef CONFIG_SMP
-static atomic_t dump_lock = ATOMIC_INIT(-1);
-
asmlinkage __visible void dump_stack(void)
{
- int was_locked;
- int old;
- int cpu;
-
- /*
- * Permit this cpu to perform nested stack dumps while serialising
- * against other CPUs
- */
- preempt_disable();
-
-retry:
- cpu = smp_processor_id();
- old = atomic_cmpxchg(&dump_lock, -1, cpu);
- if (old == -1) {
- was_locked = 0;
- } else if (old == cpu) {
- was_locked = 1;
- } else {
- cpu_relax();
- goto retry;
- }
-
- __dump_stack();
+ unsigned long flags;
- if (!was_locked)
- atomic_set(&dump_lock, -1);
-
- preempt_enable();
-}
-#else
-asmlinkage __visible void dump_stack(void)
-{
+ die_spin_lock_irqsave(flags);
__dump_stack();
+ die_spin_unlock_irqrestore(flags);
}
-#endif
EXPORT_SYMBOL(dump_stack);
--
2.1.0
A simple kernel module was used to create concurrent WARNs and BUGs:
http://ozlabs.org/~anton/junkcode/warnstorm.tar.gz
Signed-off-by: Anton Blanchard <[email protected]>
---
arch/powerpc/kernel/traps.c | 44 ++++++++++++++++++++++++++++++++++++--------
1 file changed, 36 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 4cc1e72..f779557 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1114,6 +1114,39 @@ static int emulate_math(struct pt_regs *regs)
static inline int emulate_math(struct pt_regs *regs) { return -1; }
#endif
+static bool __report_bug(unsigned long bugaddr, struct pt_regs *regs)
+{
+ const struct bug_entry *bug;
+ unsigned long flags;
+ int err;
+
+ if (!is_valid_bugaddr(bugaddr))
+ return false;
+
+ bug = find_bug(bugaddr);
+ if (!bug)
+ return false;
+
+ if (is_warning_bug(bug)) {
+ die_spin_lock_irqsave(flags);
+ report_bug(bugaddr, regs);
+ die_spin_unlock_irqrestore(flags);
+
+ regs->nip += 4;
+
+ return true;
+ }
+
+ flags = oops_begin(regs);
+ report_bug(bugaddr, regs);
+ err = SIGTRAP;
+ if (__die("Exception in kernel mode", regs, err))
+ err = 0;
+ oops_end(flags, regs, err);
+
+ return true;
+}
+
void __kprobes program_check_exception(struct pt_regs *regs)
{
enum ctx_state prev_state = exception_enter();
@@ -1138,11 +1171,9 @@ void __kprobes program_check_exception(struct pt_regs *regs)
== NOTIFY_STOP)
goto bail;
- if (!(regs->msr & MSR_PR) && /* not user-mode */
- report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
- regs->nip += 4;
+ if (!user_mode(regs) && __report_bug(regs->nip, regs))
goto bail;
- }
+
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
goto bail;
}
@@ -1157,11 +1188,8 @@ void __kprobes program_check_exception(struct pt_regs *regs)
* - A tend is illegally attempted.
* - writing a TM SPR when transactional.
*/
- if (!user_mode(regs) &&
- report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
- regs->nip += 4;
+ if (!user_mode(regs) && __report_bug(regs->nip, regs))
goto bail;
- }
/* If usermode caused this, it's done something illegal and
* gets a SIGILL slap on the wrist. We call it an illegal
* operand to distinguish from the instruction just being bad
--
2.1.0
* Anton Blanchard <[email protected]> wrote:
> Every now and then I end up with an undebuggable issue
> because multiple CPUs hit something at the same time and
> everything is interleaved:
>
> CR: 48000082 XER: 00000000
> ,RI
> c0000003dc72fd10
> ,LE
> d0000000065b84e8
> Instruction dump:
> MSR: 8000000100029033
>
> Very annoying.
>
> Some architectures already have their own recursive
> locking for oopses and we have another version for
> serialising dump_stack.
>
> Create a common version and use it everywhere (oopses,
> BUGs, WARNs, dump_stack, soft lockups and hard lockups).
Dunno. I've had cases where the simultaneity of the oopses
(i.e. their garbled nature) gave me the clue about the type
of race to expect.
To still get that information: instead of taking a
serializing spinlock (or in addition to it), it would be
nice to at least preserve the true time order of the
incidents, at minimum by generating a global count for
oopses/warnings (a bit like the oops count # currently),
and to gather it first - before taking any spinlocks.
Thanks,
Ingo
* Anton Blanchard <[email protected]> wrote:
> +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
> +static int die_owner = -1;
> +static unsigned int die_nest_count;
> +
> +unsigned long __die_spin_lock_irqsave(void)
> +{
> + unsigned long flags;
> + int cpu;
> +
> + /* racy, but better than risking deadlock. */
> + raw_local_irq_save(flags);
> +
> + cpu = smp_processor_id();
> + if (!arch_spin_trylock(&die_lock)) {
> + if (cpu != die_owner)
> + arch_spin_lock(&die_lock);
So why not trylock and time out here after a few seconds,
instead of indefinitely suppressing some potentially vital
output due to some other CPU crashing/locking with the lock
held?
> + }
> + die_nest_count++;
> + die_owner = cpu;
> +
> + return flags;
I suspect this would work in most cases.
If we fix the deadlock potential, and get a true global
ordering of various oopses/warnings as they triggered (or
at least timestamping them), then I'm sold on this I guess,
it will likely improve things.
Thanks,
Ingo
* Ingo Molnar <[email protected]> wrote:
>
> * Anton Blanchard <[email protected]> wrote:
>
> > +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
> > +static int die_owner = -1;
> > +static unsigned int die_nest_count;
> > +
> > +unsigned long __die_spin_lock_irqsave(void)
> > +{
> > + unsigned long flags;
> > + int cpu;
> > +
> > + /* racy, but better than risking deadlock. */
> > + raw_local_irq_save(flags);
> > +
> > + cpu = smp_processor_id();
> > + if (!arch_spin_trylock(&die_lock)) {
> > + if (cpu != die_owner)
> > + arch_spin_lock(&die_lock);
>
> So why not trylock and time out here after a few seconds,
> instead of indefinitely suppressing some potentially vital
> output due to some other CPU crashing/locking with the lock
> held?
[...]
> If we fix the deadlock potential, and get a true global
> ordering of various oopses/warnings as they triggered (or
> at least timestamping them), [...]
If we had a global 'trouble counter' we could use that to
refine the spin-looping timeout: instead of using a pure
timeout of a few seconds, we could say 'a timeout of a few
seconds while the counter does not increase'.
I.e. only override the locking/ordering if the owner CPU
does not seem to be able to make progress with printing the
oops/warning.
Thanks,
Ingo
>> Some architectures already have their own recursive
>> locking for oopses and we have another version for
>> serialising dump_stack.
>>
>> Create a common version and use it everywhere (oopses,
>> BUGs, WARNs, dump_stack, soft lockups and hard lockups).
>
> Dunno. I've had cases where the simultaneity of the oopses
> (i.e. their garbled nature) gave me the clue about the type
> of race to expect.
>
one of the questions is if you want to serialize, or if you just want to label.
If you take a cookie (could just be a monotonic increasing number) at
the start of the oops
and then prefix/postfix the stack printing with that number, you don't
serialize (risk of locking up),
but you can pretty trivially see which line came from where.
if you do the monotonic increasing number approach, you even get an
ordering out of it.
it does mean changing the dump_stack() and co function fingerprint to
take an extra argument,
but that is not TOO insane.
From: Ingo Molnar
...
> So why not trylock and time out here after a few seconds,
> instead of indefinitely suppressing some potentially vital
> output due to some other CPU crashing/locking with the lock
> held?
I've used that for status requests that usually lock a table
to get a consistent view.
If trylock times out assume that the data is actually stable
and access it anyway. Remember the pid doing the access and
next time it tries to acquire the same lock do a trylock with
no timeout.
That way if (when) there is a locking fubar (or a driver oops
with a lock held) at least some of the relevant status commands
will work.
David
[Mangled non-ASCII mailing-list footer omitted — original bytes were irrecoverably mis-encoded.]
On Tue, Feb 24, 2015 at 01:39:46AM -0800, Arjan van de Ven wrote:
> one of the questions is if you want to serialize, or if you just want
> to label. If you take a cookie (could just be a monotonic increasing
> number) at the start of the oops and then prefix/postfix the stack
> printing with that number, you don't serialize (risk of locking up),
> but you can pretty trivially see which line came from where.
> if you do the monotonic increasing number approach, you even get an
> ordering out of it. it does mean changing the dump_stack() and co
> function fingerprint to take an extra argument, but that is not TOO
> insane.
I like that idea, but it relies on ensuring that each line is printed
by one printk() statement - which in itself is a good idea.
I'd actually like a version of print_hex_dump() which we could use for
stack and code dumping - the existing print_hex_dump() assumes that it's
fine to dereference the pointer, whereas for stack and code dumping,
we can't always make that assumption. That's a separate issue though.
--
FTTC broadband for 0.8mile line: currently at 10.5Mbps down 400kbps up
according to speedtest.net.