Currently if an active CPU fails to respond to a roundup request the
CPU that requested the roundup will become stuck. This needlessly
reduces the robustness of the debugger.
This patch introduces a timeout allowing the system state to be examined
even when the system contains unresponsive processors. It also modifies
kdb's cpu command to make it censor attempts to switch to unresponsive
processors and to report their state as (D)ead.
Signed-off-by: Daniel Thompson <[email protected]>
Cc: Jason Wessel <[email protected]>
Cc: Mike Travis <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Dimitri Sivanich <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
---
kernel/debug/debug_core.c | 9 +++++++--
kernel/debug/kdb/kdb_main.c | 4 +++-
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b..acd7497 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -471,6 +471,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +596,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760..49f2425 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2157,6 +2157,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2212,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
--
1.9.3
On 07/01/2014 09:16 AM, Daniel Thompson wrote:
> Currently if an active CPU fails to respond to a roundup request the
> CPU that requested the roundup will become stuck. This needlessly
> reduces the robustness of the debugger.
>
> This patch introduces a timeout allowing the system state to be examined
> even when the system contains unresponsive processors. It also modifies
> kdb's cpu command to make it censor attempts to switch to unresponsive
> processors and to report their state as (D)ead.
It seems reasonable to allow entry on the master core because there certainly could be useful information to be had with respect to how you got there in the first place, but I wonder about the case for resuming the system. In general if you couldn't sync in the the first place, the system is dead. My opinion is that we probably should explicitly disallow a resume or single step at that point.
Jason.
Currently if an active CPU fails to respond to a roundup request the
CPU that requested the roundup will become stuck. This needlessly
reduces the robustness of the debugger.
This patch introduces a timeout allowing the system state to be examined
even when the system contains unresponsive processors. It also modifies
kdb's cpu command to make it censor attempts to switch to unresponsive
processors and to report their state as (D)ead.
Signed-off-by: Daniel Thompson <[email protected]>
Cc: Jason Wessel <[email protected]>
Cc: Mike Travis <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Dimitri Sivanich <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
---
Notes:
Changes since v1:
- Set CATASTROPHIC if the system contains unresponsive processors
(Jason Wessel)
kernel/debug/debug_core.c | 9 +++++++--
kernel/debug/kdb/kdb_debugger.c | 4 ++++
kernel/debug/kdb/kdb_main.c | 4 +++-
3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1adf62b..acd7497 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -471,6 +471,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int cpu;
int trace_on = 0;
int online_cpus = num_online_cpus();
+ u64 time_left;
kgdb_info[ks->cpu].enter_kgdb++;
kgdb_info[ks->cpu].exception_state |= exception_state;
@@ -595,9 +596,13 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
- atomic_read(&slaves_in_kgdb)) != online_cpus)
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
cpu_relax();
+ if (!time_left)
+ pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8859ca3..15e1a7a 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -129,6 +129,10 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760..49f2425 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2157,6 +2157,8 @@ static void kdb_cpu_status(void)
for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
if (!cpu_online(i)) {
state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
if (kdb_task_state_char(KDB_TSK(i)) == 'I')
@@ -2210,7 +2212,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !cpu_online(cpunum))
+ if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
--
1.9.3