2003-05-03 01:40:00

by Chuck Ebbert

[permalink] [raw]
Subject: [PATCH][Experimental] Debugging i386 using hardware task switching


This patch implements i386 debugging with hardware task switching. So far
it can take snapshots of either kernel or user CPU state and can be used to
time events, again in kernel or user space. Overhead for taking a snapshot
is about 1000 clocks on PPro 200. Snapshots are taken by placing an 'int 3'
in the code, causing a switch to the debugger TSS which automatically
saves the CPU state in the kernel TSS. The handler then reads the TSC and
computes clocks elapsed since last invocation, then returns immediately and
the invoking code continues at the next instruction. Everything is per-cpu
including a 1-page ring buffer for storing events (not currently used.)

So far it has survived two concurrent copies of this program executing
on SMP:

main() {
for (;;) asm("int $3;");
}

The timer shows 0x3e4 clocks per loop here on uniprocessor and the system
behaves just fine while this is running.

It also works in kernel mode, tested with this patch, which times how
long __ide_dma_end takes to execute (0x6b0 clocks including debugger overhead):
================================================================================
diff -u --exclude-from=/home/me/.exclude -r 68r/drivers/ide/pci/pdc202xx_old.c 68d/drivers/ide/pci/pdc202xx_old.c
--- 68r/drivers/ide/pci/pdc202xx_old.c Sat Mar 29 09:16:33 2003
+++ 68d/drivers/ide/pci/pdc202xx_old.c Fri May 2 20:25:46 2003
@@ -557,6 +557,8 @@

static int pdc202xx_old_ide_dma_end(ide_drive_t *drive)
{
+ int ret;
+
if (drive->addressing == 1) {
ide_hwif_t *hwif = HWIF(drive);
// unsigned long high_16 = pci_resource_start(hwif->pci_dev, 4);
@@ -568,7 +570,12 @@
clock = hwif->INB(high_16 + 0x11);
hwif->OUTB(clock & ~(hwif->channel ? 0x08:0x02), high_16+0x11);
}
- return __ide_dma_end(drive);
+
+ asm("int $3;");
+ ret = __ide_dma_end(drive);
+ asm("int $3;");
+
+ return ret;
}

static int pdc202xx_old_ide_dma_test_irq(ide_drive_t *drive)
================================================================================

Known bugs:

Invoking this with currently-invalid segment registers
will cause Bad Things to happen.

Patch itself is against vanilla 2.5.68:

================================================================================
arch/i386/kernel/Makefile | 2
arch/i386/kernel/cpu/common.c | 13 +++++-
arch/i386/kernel/debug.c | 85 ++++++++++++++++++++++++++++++++++++++++++
arch/i386/kernel/traps.c | 4 +
include/asm-i386/processor.h | 8 ++-
include/asm-i386/segment.h | 4 +
6 files changed, 108 insertions(+), 8 deletions(-)
================================================================================
diff -u --exclude-from=/home/me/.exclude -rN a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Sat Mar 29 09:16:21 2003
+++ b/arch/i386/kernel/Makefile Wed Apr 30 18:49:50 2003
@@ -7,7 +7,7 @@
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o
+ doublefault.o debug.o

obj-y += cpu/
obj-y += timers/
diff -u --exclude-from=/home/me/.exclude -rN a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c Sun Apr 20 06:26:50 2003
+++ b/arch/i386/kernel/cpu/common.c Wed Apr 30 22:23:15 2003
@@ -20,6 +20,11 @@

extern int disable_pse;

+extern unsigned long debug_stack[NR_CPUS][256];
+extern struct tss_struct debug_tss[NR_CPUS];
+extern struct tss_struct init_debug_tss;
+extern void debug_fn(void);
+
static void default_init(struct cpuinfo_x86 * c)
{
/* Not much we can do here... */
@@ -491,13 +496,17 @@

load_esp0(t, thread->esp0);
set_tss_desc(cpu,t);
- cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff;
load_TR_desc();
load_LDT(&init_mm.context);

/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
- cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff;
+
+ /* Set up debug TSS pointer in the GDT */
+ memcpy(&debug_tss[cpu], &init_debug_tss, sizeof(init_debug_tss));
+ /* FIXME const 256 */
+ debug_tss[cpu].esp0 = debug_tss[cpu].esp = debug_stack[cpu] + 256;
+ __set_tss_desc(cpu, GDT_ENTRY_DEBUG_TSS, debug_tss + cpu);

/* Clear %fs and %gs. */
asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
diff -u --exclude-from=/home/me/.exclude -rN a/arch/i386/kernel/debug.c b/arch/i386/kernel/debug.c
--- a/arch/i386/kernel/debug.c Wed Dec 31 19:00:00 1969
+++ b/arch/i386/kernel/debug.c Fri May 2 15:25:45 2003
@@ -0,0 +1,85 @@
+#include <linux/mm.h>
+#include <linux/threads.h>
+
+#include <asm/desc.h>
+
+/* FIXME #define size */
+static unsigned long debug_buffer[NR_CPUS][1024]; /* one page per cpu */
+/*
+ * Buffer layout:
+ * 0-15 work area
+ * 16-end ring buffer for events
+ */
+
+unsigned long debug_stack[NR_CPUS][256];
+struct tss_struct debug_tss[NR_CPUS] __cacheline_aligned;
+
+void debug_fn(void)
+{
+ /*
+ * Initialize the handler here. Save as much state as possible
+ * in regs since they are restored by task switch.
+ *
+ * ebp -> per-cpu buffer
+ * edi -> offset into buffer
+ * esi -> #calls
+ * ebx -> GDT
+ * edx:eax -> last tsc
+ * ecx -> 32-bit tsc delta
+ */
+
+ asm("movl %0,%%ebp": :"r" (debug_buffer + smp_processor_id()));
+
+ asm("sgdtl 2(%ebp);"
+ "movl 4(%ebp),%ebx;"
+ "movl $16,%edi;"); /* edi -> GDT */
+
+ /*
+ * Now handle the event. May want to init separately with int3
+ * from kernel during setup. If so, remove this goto.
+ */
+ goto debug_handler;
+
+debug_return:
+ /*
+ * Verify the kernel TSS:
+ * - invalid fs,gs -> 0
+ * - " ds,es -> __USER_DS
+ * - " cs,ss -> ?
+ */
+
+ /* FIXME: kernel TSS is at GDT+128 */
+
+ asm("iret");
+
+debug_handler:
+ /*
+ * Do something...
+ */
+
+ asm("movl %eax,(%ebp)"); /* save old tsc */
+ asm("rdtsc"); /* FIXME: must be supported */
+ asm("movl %eax,%ecx");
+ asm("subl (%ebp),%ecx"); /* ecx = tsc delta */
+ asm("incl %esi"); /* esi = counter */
+
+ goto debug_return;
+}
+
+struct tss_struct init_debug_tss = {
+ .esp0 = 0, /* percpu */
+ .ss0 = __KERNEL_DS,
+ .ldt = __KERNEL_LDT,
+ .bitmap = INVALID_IO_BITMAP_OFFSET,
+ .io_bitmap = { [0 ... IO_BITMAP_SIZE ] = ~0 },
+
+ .eip = (unsigned long) debug_fn,
+ .eflags = 0x00000082, /* interrupts off */
+ .esp = 0, /* percpu */
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+
+ .__cr3 = __pa(swapper_pg_dir)
+};
diff -u --exclude-from=/home/me/.exclude -rN a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
--- a/arch/i386/kernel/traps.c Sun Apr 20 06:26:50 2003
+++ b/arch/i386/kernel/traps.c Wed Apr 30 22:29:18 2003
@@ -853,7 +853,9 @@
set_trap_gate(0,&divide_error);
set_intr_gate(1,&debug);
set_intr_gate(2,&nmi);
- set_system_gate(3,&int3); /* int3-5 can be called from all */
+ /* set_system_gate(3,&int3); */ /* int3-5 can be called from all */
+ set_task_gate(3,GDT_ENTRY_DEBUG_TSS);
+ idt_table[3].b |= 0x6000; /* DPL 3 */
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -u --exclude-from=/home/me/.exclude -rN a/include/asm-i386/processor.h b/include/asm-i386/processor.h
--- a/include/asm-i386/processor.h Sun Apr 20 06:26:52 2003
+++ b/include/asm-i386/processor.h Wed Apr 30 20:31:38 2003
@@ -177,8 +177,10 @@
return edx;
}

-#define load_cr3(pgdir) \
- asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir)))
+#define load_cr3(pgdir) { \
+ init_tss[smp_processor_id()].__cr3 = __pa(pgdir); \
+ asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir))); \
+ }


/*
@@ -417,7 +419,7 @@
.ss0 = __KERNEL_DS, \
.esp1 = sizeof(init_tss[0]) + (long)&init_tss[0], \
.ss1 = __KERNEL_CS, \
- .ldt = GDT_ENTRY_LDT, \
+ .ldt = __KERNEL_LDT, \
.bitmap = INVALID_IO_BITMAP_OFFSET, \
.io_bitmap = { [ 0 ... IO_BITMAP_SIZE ] = ~0 }, \
}
diff -u --exclude-from=/home/me/.exclude -rN a/include/asm-i386/segment.h b/include/asm-i386/segment.h
--- a/include/asm-i386/segment.h Tue Mar 4 22:29:04 2003
+++ b/include/asm-i386/segment.h Wed Apr 30 19:06:01 2003
@@ -42,7 +42,7 @@
* 27 - unused
* 28 - unused
* 29 - unused
- * 30 - unused
+ * 30 - TSS for debugger
* 31 - TSS for double fault handler
*/
#define GDT_ENTRY_TLS_ENTRIES 3
@@ -67,10 +67,12 @@

#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
+#define __KERNEL_LDT (GDT_ENTRY_LDT * 8)

#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)

+#define GDT_ENTRY_DEBUG_TSS 30
#define GDT_ENTRY_DOUBLEFAULT_TSS 31

/*
------
Chuck


2003-05-03 13:21:47

by Chuck Ebbert

[permalink] [raw]
Subject: Re:[PATCH][Experimental] Debugging i386 using hardware task switching

Here is some output from testing the TSS-based int3 handler.

You need the usermode program at the end of this message to get this
output (I added the *'s at the right to match up with the comments
below.) This dump was after running this program for a while, then
breaking out with ctrl+c:

main () { for (;;) asm("int $3;"); }

*1 - CS:EIP: last int3 was from usermode near the start of an ELF program
*2 - SS:ESP: near the top of usermode stack
*3 - EAX-EBP: whatever glibc puts here at start of a C program
*4 - ECX: last two int3's on this CPU were 1012 clocks apart (incl. overhead)
*5 - ESI: int3 has been invoked 675251 times since boot on this CPU
*6 - LINK: last task to do int3 was kernel TSS


GDT entry #16: kernel TSS at c0353800, 236 bytes:

CS:0073 <GDT#14,RPL3> EIP:080483d5 eflags:00000292 <IF:1> *1
SS0:0068 <GDT#13,RPL0> ESP0:c2766000
SS1:0060 <GDT#12,RPL0> ESP1:c0353a00
SS2:0000 <GDT#00,RPL0> ESP2:00000000
SS:007b <GDT#15,RPL3> ESP:bffffb68 *2
DS:007b <GDT#15,RPL3> ES:007b <GDT#15,RPL3>
FS:0000 <GDT#00,RPL0> GS:0000 <GDT#00,RPL0>
EAX:00000001 EBX:4213030c ECX:42130f08 EDX:bffffbdc *3
ESI:40013020 EDI:bffffbd4 EBP:bffffb68 BITMAP:8000
LDT:0088 <GDT#17,RPL0> CR3:02938000 LINK:0000 <GDT#00,RPL0>


GDT entry #30: debugger TSS at c0355800, 236 bytes:

CS:0060 <GDT#12,RPL0> EIP:c010f7e6 eflags:00000002 <IF:0>
SS0:0068 <GDT#13,RPL0> ESP0:c03ae420
SS1:0000 <GDT#00,RPL0> ESP1:00000000
SS2:0000 <GDT#00,RPL0> ESP2:00000000
SS:0068 <GDT#13,RPL0> ESP:c03ae420
DS:007b <GDT#15,RPL3> ES:007b <GDT#15,RPL3>
FS:0000 <GDT#00,RPL0> GS:0000 <GDT#00,RPL0>
EAX:7a754010 EBX:c0306360 ECX:000003f4 EDX:000000ea *4
ESI:000a4db3 EDI:00000010 EBP:c03ad020 BITMAP:8000 *5
LDT:0088 <GDT#17,RPL0> CR3:00101000 LINK:0080 <GDT#16,RPL0> *6



#define PROG_NAME "tss"
#define PROG_TITLE "dump i386 CPU kernel and debug TSS"
#define VERSION "0.63"
/* v0.63: new program, derived from desc 0.63 */
#define AUTHOR "Chuck Ebbert <[email protected]>"
#define LICENSE "Released under the GPL"

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>

typedef struct { /* from Linux */
unsigned short back_link,__blh;
unsigned long esp0;
unsigned short ss0,__ss0h;
unsigned long esp1;
unsigned short ss1,__ss1h;
unsigned long esp2;
unsigned short ss2,__ss2h;
unsigned long __cr3;
unsigned long eip;
unsigned long eflags;
unsigned long eax,ecx,edx,ebx;
unsigned long esp;
unsigned long ebp;
unsigned long esi;
unsigned long edi;
unsigned short es, __esh;
unsigned short cs, __csh;
unsigned short ss, __ssh;
unsigned short ds, __dsh;
unsigned short fs, __fsh;
unsigned short gs, __gsh;
unsigned short ldt, __ldth;
unsigned short trace, bitmap;
} __attribute__ ((packed)) tss_t;

typedef struct {
unsigned short limit;
unsigned int base;
} __attribute__ ((packed)) dtr_t;

typedef struct {
unsigned a,b;
} __attribute__ ((packed)) desc_t;

/* read/write from kmem */
#define BUILD_IO_FN(fn_name, fn_fn) \
static int fn_name (int fd, int offset, void *buf, int size) { \
if (lseek(fd, offset, 0) != offset) return 0; \
if (fn_fn(fd, buf, size) != size) return 0; \
return size; \
}
BUILD_IO_FN(rkm, read)

#define perr(err) { perror(err), exit(1); }
/* read/write wrapper with err handling */
#define xkm(f, p1, p2, p3, p4, msg) { \
int _p4 = (p4); if ((f)((p1), (p2), (p3), _p4) != _p4) perr(msg); }
#define TEST_BIT(v,b) ( !!((v) & (1 << (b))) )
#define nl printf("\n")

int kmem;

void dump_sel (char *nm, unsigned short s) {
printf("%s:%04x <%s#%02d,RPL%d>",
nm, s, (s & 4) ? "LDT" : "GDT", s >> 3, s & 3);
}
void dump_table (char * name, unsigned base, unsigned num) {
unsigned cdt, db, dl, a, b, t, i, f;
desc_t desc;
tss_t tss;

for (i = 16; i <= 30; i += 14) { /* just kernel and debug TSS */
xkm(rkm, kmem, base + 8*i, &desc, sizeof(desc), "rkm gdtldt");
a = desc.a, b = desc.b, f = (b & 0xf0ff00) >> 8, t = f & 0x1f;
if ((t != 0x09) & (t != 0x0b)) continue; /* not a 32b TSS */

dl = (a & 0xffff) | (b & 0x0f0000);
db = (b & 0xff000000) | ((b & 0xff) << 16) | (a >> 16);
xkm(rkm, kmem, db, &tss, sizeof(tss), "rkm tss");
printf("\nGDT entry #%d: %s TSS at %08x, %d bytes:\n\n",
i, (i == 16) ? "kernel" : "debugger", db, dl + 1);
dump_sel(" CS", tss.cs);
printf(" EIP:%08x eflags:%08x <IF:%d>\n",
tss.eip, tss.eflags, TEST_BIT(tss.eflags,9));
dump_sel(" SS0", tss.ss0);
printf(" ESP0:%08x\n", tss.esp0);
dump_sel(" SS1", tss.ss1);
printf(" ESP1:%08x\n", tss.esp1);
dump_sel(" SS2", tss.ss2);
printf(" ESP2:%08x\n", tss.esp2);
dump_sel(" SS", tss.ss);
printf(" ESP:%08x\n", tss.esp);
dump_sel(" DS", tss.ds);
dump_sel(" ES", tss.es), nl;
dump_sel(" FS", tss.fs);
dump_sel(" GS", tss.gs), nl;
printf(" EAX:%08x EBX:%08x ECX:%08x EDX:%08x\n",
tss.eax, tss.ebx, tss.ecx, tss.edx);
printf(" ESI:%08x EDI:%08x EBP:%08x"
" BITMAP:%04x\n",
tss.esi, tss.edi, tss.ebp, tss.bitmap);
dump_sel(" LDT", tss.ldt);
printf(" CR3:%08x", tss.__cr3);
dump_sel(" LINK", tss.back_link), nl, nl;
}
nl;
}
main() {
dtr_t dtr;
kmem = open("/dev/kmem", O_RDONLY);
if (kmem < 0) perr("open /dev/kmem");
asm("sgdt %0" : "=m" (dtr));
dump_table("GDT", dtr.base, (dtr.limit + 1) >> 3);
close(kmem);
}
------
Chuck

2003-05-03 15:13:17

by Chuck Ebbert

[permalink] [raw]
Subject: Re:[PATCH][Experimental] Debugging i386 using hardware task switching

Buoyed by the success of silly usermode testing, I got brave and stuck
an int3 instruction right in the middle of arch/i386/kernel/irq.c:do_IRQ()
and captured this:

*1 - EIP is at do_IRQ+0x38, with interrupts disabled
*2 - 0x50 (80) bytes of kernel stack are in use
*3 - ECX: last two interrupts were 199485 clocks apart
(boot CPU speed is 199.491 MHz)
*4 - ESI: int3 has been invoked 1653673 times since boot

This is with an SMP kernel on a 1-CPU machine. The 1000 clocks of extra
overhead every interrupt don't seem to make much of a difference...


GDT entry #16: kernel TSS at c0393800, 236 bytes:

CS:0060 <GDT#12,RPL0> EIP:c010afa8 eflags:00000002 <IF:0> *1
SS0:0068 <GDT#13,RPL0> ESP0:c2966000
SS1:0060 <GDT#12,RPL0> ESP1:c0393a00
SS2:0000 <GDT#00,RPL0> ESP2:00000000
SS:0068 <GDT#13,RPL0> ESP:c2965fb0 *2
DS:007b <GDT#15,RPL3> ES:007b <GDT#15,RPL3>
FS:0000 <GDT#00,RPL0> GS:0000 <GDT#00,RPL0>
EAX:c10a2020 EBX:00000000 ECX:00cb5f80 EDX:00000000
ESI:c0393c00 EDI:00000000 EBP:bffffb78 BITMAP:8000
LDT:0088 <GDT#17,RPL0> CR3:035d1000 LINK:0000 <GDT#00,RPL0>


GDT entry #30: debugger TSS at c0395a00, 236 bytes:

CS:0060 <GDT#12,RPL0> EIP:c0110173 eflags:00000006 <IF:0>
SS0:0068 <GDT#13,RPL0> ESP0:c03f27a0
SS1:0000 <GDT#00,RPL0> ESP1:00000000
SS2:0000 <GDT#00,RPL0> ESP2:00000000
SS:0068 <GDT#13,RPL0> ESP:c03f27a0
DS:007b <GDT#15,RPL3> ES:007b <GDT#15,RPL3>
FS:0000 <GDT#00,RPL0> GS:0000 <GDT#00,RPL0>
EAX:aa72bbf2 EBX:c0341a40 ECX:00030b3d EDX:0000004e *3
ESI:00193ba9 EDI:00000010 EBP:c03f03a0 BITMAP:8000
LDT:0088 <GDT#17,RPL0> CR3:00101000 LINK:0080 <GDT#16,RPL0> *4