2002-08-02 02:11:42

by Rusty Russell

[permalink] [raw]
Subject: [PATCH] kprobes for 2.5.30

Hi Linus,

Vamsi's kernel probes again, this time with EXPORT_SYMBOL_GPL
so people don't think this is blanket permission to hook into
arbitrary parts of the kernel (as separate from debugging, testing,
diagnostics, etc).

For a change, this one's cool and *doesn't* break anything 8)

Please apply,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Kprobes for i386
Author: Vamsi Krishna S
Status: Tested on 2.5.26 SMP

D: This patch allows trapping at almost any kernel address, useful for
D: various kernel-hacking tasks, and building on for more
D: infrastructure. This patch is x86 only.

diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/Config.help working-2.5.26-kprobes/arch/i386/Config.help
--- linux-2.5.26/arch/i386/Config.help Mon Jun 17 23:19:15 2002
+++ working-2.5.26-kprobes/arch/i386/Config.help Fri Jul 19 11:17:46 2002
@@ -967,3 +967,9 @@ CONFIG_SOFTWARE_SUSPEND
absence of features.

For more information take a look at Documentation/swsusp.txt.
+
+CONFIG_KPROBES
+ Kprobes allows you to trap at almost any kernel address, using
+ register_kprobe(), and providing a callback function. This is useful
+ for kernel debugging, non-intrusive instrumentation and testing. If
+ in doubt, say "N".
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/config.in working-2.5.26-kprobes/arch/i386/config.in
--- linux-2.5.26/arch/i386/config.in Wed Jul 17 10:25:46 2002
+++ working-2.5.26-kprobes/arch/i386/config.in Fri Jul 19 11:17:46 2002
@@ -419,6 +419,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; th
if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool ' Highmem debugging' CONFIG_DEBUG_HIGHMEM
fi
+ bool ' Probes' CONFIG_KPROBES
fi

endmenu
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/kernel/entry.S working-2.5.26-kprobes/arch/i386/kernel/entry.S
--- linux-2.5.26/arch/i386/kernel/entry.S Mon Jun 17 23:19:16 2002
+++ working-2.5.26-kprobes/arch/i386/kernel/entry.S Fri Jul 19 11:17:46 2002
@@ -442,9 +442,24 @@ device_not_available_emulate:
jmp ret_from_exception

ENTRY(debug)
+#ifdef CONFIG_KPROBES
+ pushl %eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebx)
+ movl %esp,%edx
+ pushl $0
+ pushl %edx
+ call do_debug
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ preempt_stop
+ jmp ret_from_exception
+#else
pushl $0
pushl $do_debug
jmp error_code
+#endif

ENTRY(nmi)
pushl %eax
@@ -457,9 +472,24 @@ ENTRY(nmi)
RESTORE_ALL

ENTRY(int3)
+#ifdef CONFIG_KPROBES
+ pushl %eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebx)
+ movl %esp,%edx
+ pushl $0
+ pushl %edx
+ call do_int3
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ preempt_stop
+ jmp ret_from_exception
+#else
pushl $0
pushl $do_int3
jmp error_code
+#endif

ENTRY(overflow)
pushl $0
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/kernel/i386_ksyms.c working-2.5.26-kprobes/arch/i386/kernel/i386_ksyms.c
--- linux-2.5.26/arch/i386/kernel/i386_ksyms.c Fri Jun 21 09:41:52 2002
+++ working-2.5.26-kprobes/arch/i386/kernel/i386_ksyms.c Fri Jul 19 11:17:46 2002
@@ -29,6 +29,7 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
+#include <asm/kprobes.h>

extern void dump_thread(struct pt_regs *, struct user *);
extern spinlock_t rtc_lock;
@@ -176,6 +177,11 @@ extern int is_sony_vaio_laptop;
EXPORT_SYMBOL(is_sony_vaio_laptop);

EXPORT_SYMBOL(__PAGE_KERNEL);
+
+#ifdef CONFIG_KPROBES
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+#endif

#ifdef CONFIG_MULTIQUAD
EXPORT_SYMBOL(xquad_portio);
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/kernel/traps.c working-2.5.26-kprobes/arch/i386/kernel/traps.c
--- linux-2.5.26/arch/i386/kernel/traps.c Fri Jun 21 09:41:52 2002
+++ working-2.5.26-kprobes/arch/i386/kernel/traps.c Fri Jul 19 11:17:48 2002
@@ -5,6 +5,9 @@
*
* Pentium III FXSR, SSE support
* Gareth Hughes <[email protected]>, May 2000
+ *
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
*/

/*
@@ -50,6 +53,8 @@
#include <asm/cobalt.h>
#include <asm/lithium.h>
#endif
+#include <asm/kprobes.h>
+#include <linux/hash.h>

#include <linux/irq.h>
#include <linux/module.h>
@@ -297,6 +302,222 @@ static inline void die_if_kernel(const c
die(str, regs, err);
}

+#ifdef CONFIG_KPROBES
+static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+unsigned int kprobe_cpu = NR_CPUS;
+static struct kprobe *kprobe_running;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+static struct list_head kprobe_table[KPROBE_TABLE_SIZE];
+
+/* You have to be holding the kprobe_lock */
+static struct kprobe *get_kprobe(void *addr)
+{
+ struct list_head *head, *tmp;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ list_for_each(tmp, head) {
+ struct kprobe *p = list_entry(tmp, struct kprobe, list);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+/*
+ * We changed trap3/1 to an intr gate. So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+ if (regs->eflags & EF_IE)
+ __asm__ __volatile__ ("sti");
+}
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+ switch(opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->eip = (unsigned long)p->addr;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kprobe_handler(struct pt_regs * regs)
+{
+ struct kprobe *p;
+ u8 *addr = (u8 *)(regs->eip-1);
+
+ /* Recursion check, so we don't deadlock. */
+ if (kprobe_cpu == smp_processor_id()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ /* Not ours? Can't be delete race, since we hold lock. */
+ if (!p)
+ return 0;
+ disarm_kprobe(p, regs);
+ return 1;
+ }
+
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+ p = get_kprobe(addr);
+ if (!p) {
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+ /* Unregistered (on another cpu) after this hit? Ignore */
+ if (*addr != BREAKPOINT_INSTRUCTION)
+ return 1;
+ /* Not one of ours: let kernel handle it */
+ restore_interrupts(regs);
+ return 0;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ kprobe_running = p;
+ kprobe_saved_eflags = kprobe_old_eflags = regs->eflags & (EF_TF|EF_IE);
+ if (is_IF_modifier(p->opcode))
+ kprobe_saved_eflags &= ~EF_IE;
+
+ p->pre_handler(p, regs);
+
+ regs->eflags |= EF_TF;
+ regs->eflags &= ~EF_IE;
+
+ /* We hold lock, now we remove breakpoint and single step. */
+ disarm_kprobe(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags &= ~EF_TF;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function. And we hold kprobe_lock.
+ */
+static int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (kprobe_running->post_handler)
+ kprobe_running->post_handler(kprobe_running, regs, 0);
+
+ /*
+ * We singlestepped with interrupts disabled. So, the result on
+ * the stack would be incorrect for "pushfl" instruction.
+ */
+ if (kprobe_running->opcode == 0x9c) { /* pushfl */
+ regs->esp &= ~(EF_TF | EF_IE);
+ regs->esp |= kprobe_old_eflags;
+ }
+
+ rearm_kprobe(kprobe_running, regs);
+ regs->eflags |= kprobe_saved_eflags;
+
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & EF_TF) {
+ restore_interrupts(regs);
+ return 0;
+ }
+ return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (kprobe_running->fault_handler
+ && kprobe_running->fault_handler(kprobe_running, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ rearm_kprobe(kprobe_running, regs);
+ regs->eflags |= kprobe_old_eflags;
+
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+ }
+ return 0;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+
+ spin_lock_irq(&kprobe_lock);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->status = 0UL;
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ /* This is a noop on Intel, but good form nonetheless */
+ flush_icache_range(p->addr, p->addr + 4);
+ out:
+ spin_unlock_irq(&kprobe_lock);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ spin_lock_irq(&kprobe_lock);
+ *p->addr = p->opcode;
+ list_del(&p->list);
+ /* This is a noop on Intel, but good form nonetheless */
+ flush_icache_range(p->addr, p->addr + 4);
+ spin_unlock_irq(&kprobe_lock);
+}
+
+static int __init init_kprobes(void)
+{
+ int i;
+
+ /* FIXME allocate the probe table, currently defined statically */
+
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kprobe_table[i]);
+
+ return 0;
+}
+__initcall(init_kprobes);
+#endif /* CONFIG_KPROBES */
+
static inline unsigned long get_cr2(void)
{
unsigned long address;
@@ -326,6 +547,8 @@ static void inline do_trap(int trapnr, i
panic("do_trap: can't hit this");
}
#endif
+ if (kprobe_fault(regs, trapnr))
+ return;

if (!(regs->xcs & 3))
goto kernel_trap;
@@ -392,7 +615,9 @@ asmlinkage void do_##name(struct pt_regs
}

DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
+#ifndef CONFIG_KPROBES
DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+#endif
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
@@ -408,6 +633,9 @@ asmlinkage void do_general_protection(st
{
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
+
+ if (kprobe_fault(regs, 13))
+ return;

if (!(regs->xcs & 3))
goto gp_in_kernel;
@@ -508,6 +736,16 @@ asmlinkage void do_nmi(struct pt_regs *
inb(0x71); /* dummy */
}

+#ifdef CONFIG_KPROBES
+asmlinkage int do_int3(struct pt_regs * regs, long error_code)
+{
+ if (kprobe_handler(regs))
+ return 1;
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ return 0;
+}
+#endif
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -530,7 +768,7 @@ asmlinkage void do_nmi(struct pt_regs *
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-asmlinkage void do_debug(struct pt_regs * regs, long error_code)
+asmlinkage int do_debug(struct pt_regs * regs, long error_code)
{
unsigned int condition;
struct task_struct *tsk = current;
@@ -552,6 +790,11 @@ asmlinkage void do_debug(struct pt_regs

/* Mask out spurious TF errors due to lazy TF clearing */
if (condition & DR_STEP) {
+#ifdef CONFIG_KPROBES
+ if (kprobe_cpu == smp_processor_id()
+ && post_kprobe_handler(regs))
+ return 1;
+#endif
/*
* The TF error should be masked out only if the current
* process is not traced and if the TRAP flag has been set
@@ -588,15 +831,15 @@ clear_dr7:
__asm__("movl %0,%%db7"
: /* no output */
: "r" (0));
- return;
+ return 0;

debug_vm86:
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- return;
+ return 0;

clear_TF:
regs->eflags &= ~TF_MASK;
- return;
+ return 0;
}

/*
@@ -760,6 +1003,8 @@ asmlinkage void math_state_restore(struc
struct task_struct *tsk = current;
clts(); /* Allow maths ops (or we recurse) */

+ if (kprobe_fault(&regs, 7))
+ return;
if (!tsk->used_math)
init_fpu(tsk);
restore_fpu(tsk);
@@ -975,9 +1220,17 @@ void __init trap_init(void)
#endif

set_trap_gate(0,&divide_error);
+#ifndef CONFIG_KPROBES
set_trap_gate(1,&debug);
+#else
+ _set_gate(idt_table+1,14,3,&debug);
+#endif
set_intr_gate(2,&nmi);
+#ifndef CONFIG_KPROBES
set_system_gate(3,&int3); /* int3-5 can be called from all */
+#else
+ _set_gate(idt_table+3,14,3,&int3);
+#endif
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/arch/i386/mm/fault.c working-2.5.26-kprobes/arch/i386/mm/fault.c
--- linux-2.5.26/arch/i386/mm/fault.c Sun Jul 7 02:12:18 2002
+++ working-2.5.26-kprobes/arch/i386/mm/fault.c Fri Jul 19 11:17:46 2002
@@ -20,6 +20,7 @@
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */

+#include <asm/kprobes.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
@@ -156,6 +157,9 @@ asmlinkage void do_page_fault(struct pt_

/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));
+
+ if (kprobe_fault(regs, 14))
+ return;

/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
diff -urpN -I \$.*\$ --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.26/include/asm-i386/kprobes.h working-2.5.26-kprobes/include/asm-i386/kprobes.h
--- linux-2.5.26/include/asm-i386/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.26-kprobes/include/asm-i386/kprobes.h Fri Jul 19 11:17:48 2002
@@ -0,0 +1,63 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
+ * Mailing list: [email protected]
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/ptrace.h>
+
+struct kprobe;
+
+typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *);
+typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *,
+ unsigned long flags);
+typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *,
+ int trapnr);
+
+struct kprobe {
+ u8 * addr; /* location of the probe point */
+ struct list_head list;
+ unsigned long status;
+ /* Called before addr is executed. */
+ kprobe_pre_handler_t pre_handler;
+ /* Called after addr is executed, unless... */
+ kprobe_post_handler_t post_handler;
+ /* ... called if executing addr causes a fault (eg. page fault).
+ * Return 1 if it handled fault, otherwise kernel will see it. */
+ kprobe_fault_handler_t fault_handler;
+ u8 opcode;
+};
+
+/* Set to cpu currently running a probe hit */
+extern unsigned int kprobe_cpu;
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define EF_TF 0x00000100
+#define EF_IE 0x00000200
+
+#ifdef CONFIG_KPROBES
+extern int register_kprobe(struct kprobe *p);
+extern void unregister_kprobe(struct kprobe *p);
+
+extern int kprobe_fault_handler(struct pt_regs * regs, int trapnr);
+
+static inline int kprobe_fault(struct pt_regs *regs, int trapnr)
+{
+ if (kprobe_cpu == smp_processor_id()
+ && kprobe_fault_handler(regs, trapnr))
+ return 1;
+ return 0;
+}
+#else /* ! CONFIG_KPROBES */
+static inline int register_kprobe(struct probe_struct *p) { return -ENOSYS; }
+static inline void unregister_kprobe(struct probe_struct *p) { }
+static inline int kprobe_fault(struct pt_regs *regs, int trapnr) { return 0; }
+#endif
+
+#endif /* _ASM_KPROBES_H */


2002-08-02 02:23:18

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

From: Rusty Russell <[email protected]>
Date: Fri, 02 Aug 2002 12:11:47 +1000

Vamsi's kernel probes again, this time with EXPORT_SYMBOL_GPL
so people don't think this is blanket permission to hook into
arbitrary parts of the kernel (as separate from debugging, testing,
diagnostics, etc).

A nice enhancement would be to move the kprobe table and
other generic bits into a common area so that it did not
need to be duplicated as other arches add kprobe support.

2002-08-02 13:47:00

by Vamsi Krishna S .

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

On Thu, Aug 01, 2002 at 07:14:49PM -0700, David S. Miller wrote:
> From: Rusty Russell <[email protected]>
> Date: Fri, 02 Aug 2002 12:11:47 +1000
>
> Vamsi's kernel probes again, this time with EXPORT_SYMBOL_GPL
> so people don't think this is blanket permission to hook into
> arbitrary parts of the kernel (as separate from debugging, testing,
> diagnostics, etc).
>
> A nice enhancement would be to move the kprobe table and
> other generic bits into a common area so that it did not
> need to be duplicated as other arches add kprobe support.

Yes. We didn't do it in the first version of this patch to avoid
touching too many files.

We do have the full version of dprobes which has generic bits
and arch-specific ones cleanly seperated. In fact, dprobes ports
are available for ia32, s390, s390x, ppc with ppc64 and ia64 ports
in early stages. Please check out:
http://www-124.ibm.com/linux/projects/dprobes/

Thanks,
--Vamsi

--
Vamsi Krishna S.
Linux Technology Center,
IBM Software Lab, Bangalore.
Ph: +91 80 5044959
Internet: [email protected]

2002-08-05 04:13:53

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

In message <[email protected]> you write:
> From: Rusty Russell <[email protected]>
> Date: Fri, 02 Aug 2002 12:11:47 +1000
>
> Vamsi's kernel probes again, this time with EXPORT_SYMBOL_GPL
> so people don't think this is blanket permission to hook into
> arbitrary parts of the kernel (as separate from debugging, testing,
> diagnostics, etc).
>
> A nice enhancement would be to move the kprobe table and
> other generic bits into a common area so that it did not
> need to be duplicated as other arches add kprobe support.

Done. Look better?

Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Kprobes for i386
Author: Vamsi Krishna S
Status: Experimental

D: This patch allows trapping at almost any kernel address, useful for
D: various kernel-hacking tasks, and building on for more
D: infrastructure. This patch is x86 only, but other archs can add
D: support as required.

diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/Config.help working-2.5.30-kprobes/arch/i386/Config.help
--- linux-2.5.30/arch/i386/Config.help Mon Jun 17 23:19:15 2002
+++ working-2.5.30-kprobes/arch/i386/Config.help Fri Aug 2 15:36:03 2002
@@ -967,3 +967,9 @@ CONFIG_SOFTWARE_SUSPEND
absence of features.

For more information take a look at Documentation/swsusp.txt.
+
+CONFIG_KPROBES
+ Kprobes allows you to trap at almost any kernel address, using
+ register_kprobe(), and providing a callback function. This is useful
+ for kernel debugging, non-intrusive instrumentation and testing. If
+ in doubt, say "N".
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/linux/kprobes.h working-2.5.30-kprobes/include/linux/kprobes.h
--- linux-2.5.30/include/linux/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/include/linux/kprobes.h Mon Aug 5 10:56:44 2002
@@ -0,0 +1,54 @@
+#ifndef _LINUX_KPROBES_H
+#define _LINUX_KPROBES_H
+#include <linux/config.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+
+struct kprobe;
+struct pt_regs;
+
+typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *);
+typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *,
+ unsigned long flags);
+typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *,
+ int trapnr);
+
+struct kprobe {
+ struct list_head list;
+
+ /* location of the probe point */
+ kprobe_opcode_t *addr;
+
+ /* Called before addr is executed. */
+ kprobe_pre_handler_t pre_handler;
+
+ /* Called after addr is executed, unless... */
+ kprobe_post_handler_t post_handler;
+
+ /* ... called if executing addr causes a fault (eg. page fault).
+ * Return 1 if it handled fault, otherwise kernel will see it. */
+ kprobe_fault_handler_t fault_handler;
+
+ /* Saved opcode (which has been replaced with breakpoint) */
+ kprobe_opcode_t opcode;
+};
+
+#ifdef CONFIG_KPROBES
+/* Locks kprobe: irq must be disabled */
+void lock_kprobes(void);
+void unlock_kprobes(void);
+
+/* kprobe running now on this CPU? */
+int kprobe_running(void);
+
+/* Get the kprobe at this addr (if any). Must have called lock_kprobes */
+struct kprobe *get_kprobe(void *addr);
+
+int register_kprobe(struct kprobe *p);
+void unregister_kprobe(struct kprobe *p);
+#else
+static inline int kprobe_running(void) { return 0; }
+static inline int register_kprobe(struct probe_struct *p) { return -ENOSYS; }
+static inline void unregister_kprobe(struct probe_struct *p) { }
+#endif
+#endif /* _LINUX_KPROBES_H */
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/Makefile working-2.5.30-kprobes/kernel/Makefile
--- linux-2.5.30/kernel/Makefile Sat Jul 27 15:24:39 2002
+++ working-2.5.30-kprobes/kernel/Makefile Fri Aug 2 15:36:03 2002
@@ -10,7 +10,7 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o platform.o suspend.o
+ printk.o platform.o suspend.o kprobes.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
@@ -23,6 +23,7 @@ obj-$(CONFIG_MODULES) += ksyms.o
obj-$(CONFIG_PM) += pm.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KPROBES) += kprobes.o

ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/kprobes.c working-2.5.30-kprobes/kernel/kprobes.c
--- linux-2.5.30/kernel/kprobes.c Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/kernel/kprobes.c Fri Aug 2 15:45:13 2002
@@ -0,0 +1,94 @@
+/* Support for kernel probes.
+ (C) 2002 Vamsi Krishna S <[email protected]>.
+*/
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct list_head kprobe_table[KPROBE_TABLE_SIZE];
+
+static unsigned int kprobe_cpu = NR_CPUS;
+static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+
+int kprobe_running(void)
+{
+ return kprobe_cpu == smp_processor_id();
+}
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct list_head *head, *tmp;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ list_for_each(tmp, head) {
+ struct kprobe *p = list_entry(tmp, struct kprobe, list);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+
+ spin_lock_irq(&kprobe_lock);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ out:
+ spin_unlock_irq(&kprobe_lock);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ spin_lock_irq(&kprobe_lock);
+ *p->addr = p->opcode;
+ list_del(&p->list);
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ spin_unlock_irq(&kprobe_lock);
+}
+
+static int __init init_kprobes(void)
+{
+ int i;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kprobe_table[i]);
+
+ return 0;
+}
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/config.in working-2.5.30-kprobes/arch/i386/config.in
--- linux-2.5.30/arch/i386/config.in Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/config.in Fri Aug 2 15:36:03 2002
@@ -415,6 +415,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; th
if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool ' Highmem debugging' CONFIG_DEBUG_HIGHMEM
fi
+ bool ' Probes' CONFIG_KPROBES
fi

endmenu
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/entry.S working-2.5.30-kprobes/arch/i386/kernel/entry.S
--- linux-2.5.30/arch/i386/kernel/entry.S Fri Aug 2 11:15:05 2002
+++ working-2.5.30-kprobes/arch/i386/kernel/entry.S Fri Aug 2 15:36:03 2002
@@ -430,9 +430,22 @@ device_not_available_emulate:
jmp ret_from_exception

ENTRY(debug)
+#ifdef CONFIG_KPROBES
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
+ pushl $0
+ pushl %edx
+ call do_debug
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ jmp ret_from_exception
+#else
pushl $0
pushl $do_debug
jmp error_code
+#endif

ENTRY(nmi)
pushl %eax
@@ -445,9 +460,22 @@ ENTRY(nmi)
RESTORE_ALL

ENTRY(int3)
+#ifdef CONFIG_KPROBES
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
+ pushl $0
+ pushl %edx
+ call do_int3
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ jmp ret_from_exception
+#else
pushl $0
pushl $do_int3
jmp error_code
+#endif

ENTRY(overflow)
pushl $0
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/traps.c working-2.5.30-kprobes/arch/i386/kernel/traps.c
--- linux-2.5.30/arch/i386/kernel/traps.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/kernel/traps.c Fri Aug 2 17:38:56 2002
@@ -5,6 +5,9 @@
*
* Pentium III FXSR, SSE support
* Gareth Hughes <[email protected]>, May 2000
+ *
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
*/

/*
@@ -24,6 +27,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>

#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -50,6 +54,7 @@
#include <asm/cobalt.h>
#include <asm/lithium.h>
#endif
+#include <linux/hash.h>

#include <linux/irq.h>
#include <linux/module.h>
@@ -297,6 +302,158 @@ static inline void die_if_kernel(const c
die(str, regs, err);
}

+#ifdef CONFIG_KPROBES
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+#define EF_TF 0x00000100
+#define EF_IE 0x00000200
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/*
+ * We changed trap3/1 to an intr gate. So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+ if (regs->eflags & EF_IE)
+ __asm__ __volatile__ ("sti");
+}
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+ switch(opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->eip = (unsigned long)p->addr;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kprobe_handler(struct pt_regs * regs)
+{
+ struct kprobe *p;
+ u8 *addr = (u8 *)(regs->eip-1);
+
+ /* Lock, check we're not actually recursing */
+ if (kprobe_running()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ /* Not ours? Can't be delete race, since we hold lock. */
+ if (!p)
+ return 0;
+ disarm_kprobe(p, regs);
+ return 1;
+ }
+
+ lock_kprobes();
+ p = get_kprobe(addr);
+ if (!p) {
+ unlock_kprobes();
+ /* Unregistered (on another cpu) after this hit? Ignore */
+ if (*addr != BREAKPOINT_INSTRUCTION)
+ return 1;
+ /* Not one of ours: let kernel handle it */
+ restore_interrupts(regs);
+ return 0;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ current_kprobe = p;
+ kprobe_saved_eflags = kprobe_old_eflags = regs->eflags & (EF_TF|EF_IE);
+ if (is_IF_modifier(p->opcode))
+ kprobe_saved_eflags &= ~EF_IE;
+
+ p->pre_handler(p, regs);
+
+ regs->eflags |= EF_TF;
+ regs->eflags &= ~EF_IE;
+
+ /* We hold lock, now we remove breakpoint and single step. */
+ disarm_kprobe(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags &= ~EF_TF;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function. And we hold kprobe lock.
+ */
+static int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (current_kprobe->post_handler)
+ current_kprobe->post_handler(current_kprobe, regs, 0);
+
+ /*
+ * We singlestepped with interrupts disabled. So, the result on
+ * the stack would be incorrect for "pushfl" instruction.
+ */
+ if (current_kprobe->opcode == 0x9c) { /* pushfl */
+ regs->esp &= ~(EF_TF | EF_IE);
+ regs->esp |= kprobe_old_eflags;
+ }
+
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_saved_eflags;
+
+ unlock_kprobes();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & EF_TF) {
+ restore_interrupts(regs);
+ return 0;
+ }
+ return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (current_kprobe->fault_handler
+ && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_old_eflags;
+
+ unlock_kprobes();
+ }
+ return 0;
+}
+#else
+static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_KPROBES */
+
static inline unsigned long get_cr2(void)
{
unsigned long address;
@@ -326,6 +483,8 @@ static void inline do_trap(int trapnr, i
panic("do_trap: can't hit this");
}
#endif
+ if (kprobe_fault(regs, trapnr))
+ return;

if (!(regs->xcs & 3))
goto kernel_trap;
@@ -392,7 +551,9 @@ asmlinkage void do_##name(struct pt_regs
}

DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
+#ifndef CONFIG_KPROBES
DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+#endif
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
@@ -408,6 +569,9 @@ asmlinkage void do_general_protection(st
{
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
+
+ if (kprobe_fault(regs, 13))
+ return;

if (!(regs->xcs & 3))
goto gp_in_kernel;
@@ -508,6 +672,16 @@ asmlinkage void do_nmi(struct pt_regs *
inb(0x71); /* dummy */
}

+#ifdef CONFIG_KPROBES
+asmlinkage int do_int3(struct pt_regs * regs, long error_code)
+{
+ if (kprobe_handler(regs))
+ return 1;
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ return 0;
+}
+#endif
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -530,7 +704,7 @@ asmlinkage void do_nmi(struct pt_regs *
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-asmlinkage void do_debug(struct pt_regs * regs, long error_code)
+asmlinkage int do_debug(struct pt_regs * regs, long error_code)
{
unsigned int condition;
struct task_struct *tsk = current;
@@ -552,6 +726,8 @@ asmlinkage void do_debug(struct pt_regs

/* Mask out spurious TF errors due to lazy TF clearing */
if (condition & DR_STEP) {
+ if (kprobe_running() && post_kprobe_handler(regs))
+ return 1;
/*
* The TF error should be masked out only if the current
* process is not traced and if the TRAP flag has been set
@@ -588,15 +764,15 @@ clear_dr7:
__asm__("movl %0,%%db7"
: /* no output */
: "r" (0));
- return;
+ return 0;

debug_vm86:
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- return;
+ return 0;

clear_TF:
regs->eflags &= ~TF_MASK;
- return;
+ return 0;
}

/*
@@ -760,6 +936,8 @@ asmlinkage void math_state_restore(struc
struct task_struct *tsk = current;
clts(); /* Allow maths ops (or we recurse) */

+ if (kprobe_fault(&regs, 7))
+ return;
if (!tsk->used_math)
init_fpu(tsk);
restore_fpu(tsk);
@@ -943,9 +1121,17 @@ void __init trap_init(void)
#endif

set_trap_gate(0,&divide_error);
+#ifndef CONFIG_KPROBES
set_trap_gate(1,&debug);
+#else
+ _set_gate(idt_table+1,14,3,&debug);
+#endif
set_intr_gate(2,&nmi);
+#ifndef CONFIG_KPROBES
set_system_gate(3,&int3); /* int3-5 can be called from all */
+#else
+ _set_gate(idt_table+3,14,3,&int3);
+#endif
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/mm/fault.c working-2.5.30-kprobes/arch/i386/mm/fault.c
--- linux-2.5.30/arch/i386/mm/fault.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/mm/fault.c Mon Aug 5 10:57:18 2002
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -154,6 +155,9 @@ asmlinkage void do_page_fault(struct pt_

/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));
+
+ if (kprobe_fault(regs, 14))
+ return;

/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/fs/partitions/check.c working-2.5.30-kprobes/fs/partitions/check.c
--- linux-2.5.30/fs/partitions/check.c Fri Aug 2 11:15:09 2002
+++ working-2.5.30-kprobes/fs/partitions/check.c Fri Aug 2 17:30:34 2002
@@ -467,7 +467,7 @@ void devfs_register_partitions (struct g
for (part = 1; part < max_p; part++) {
if ( unregister || (p[part].nr_sects < 1) ) {
devfs_unregister(p[part].de);
- dev->part[p].de = NULL;
+ dev->part[part].de = NULL;
continue;
}
devfs_register_partition (dev, minor, part);
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/asm-i386/kprobes.h working-2.5.30-kprobes/include/asm-i386/kprobes.h
--- linux-2.5.30/include/asm-i386/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/include/asm-i386/kprobes.h Fri Aug 2 15:40:33 2002
@@ -0,0 +1,28 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
+ * Mailing list: [email protected]
+ */
+#include <linux/smp.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+#ifdef CONFIG_KPROBES
+typedef u8 kprobe_opcode_t;
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+
+/* kprobe_running() not defined yet, so this is a macro. */
+#define kprobe_fault(regs, trapnr) \
+ (kprobe_running() && kprobe_fault_handler(regs, trapnr))
+
+#else /* !CONFIG_KPROBES */
+static inline int kprobe_fault(struct pt_regs *regs, int trapnr) { return 0; }
+#endif
+
+#endif /* _ASM_KPROBES_H */

2002-08-05 04:21:04

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30


On Mon, 5 Aug 2002, Rusty Russell wrote:
>
> Done. Look better?

How about one more cleanup: make the x86 do_int3()/do_debug() calling
convention be independent of CONFIG_KPROBE?

Btw, the way to test against zero in x86 asm is not

cmpl $0,%eax

but rather the shorter

testl %eax,%eax

which just shows that the person who wrote the asm probably was used to
saner CPUs ;)

Linus

2002-08-05 05:43:38

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

From: Rusty Russell <[email protected]>
Date: Mon, 05 Aug 2002 14:14:12 +1000

In message <[email protected]> you write:
> A nice enhancement would be to move the kprobe table and
> other generic bits into a common area so that it did not
> need to be duplicated as other arches add kprobe support.

Done. Look better?

That's exactly how I wanted the generic stuff split out,
it's perfect.

2002-08-05 07:21:24

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

In message <[email protected]> you wri
te:
>
> On Mon, 5 Aug 2002, Rusty Russell wrote:
> >
> > Done. Look better?
>
> How about one more cleanup: make the x86 do_int3()/do_debug() calling
> convention be independent of CONFIG_KPROBE?

Thanks, fixed (and, because my x86 asm is lousy, actually tested).

In testing, I came up against the "spin_unlock() causes schedule()
inside interrupt" problem. Fix is counterintuitive, IMHO (search for
"Linus"). If you really want us not to assume that preemption is
disabled in interrupt handlers, then more code needs to change.

Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Kprobes for i386
Author: Vamsi Krishna S
Status: Experimental

D: This patch allows trapping at almost any kernel address, useful for
D: various kernel-hacking tasks, and building on for more
D: infrastructure. This patch is x86 only, but other archs can add
D: support as required.

diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/Config.help working-2.5.30-kprobe/arch/i386/Config.help
--- linux-2.5.30/arch/i386/Config.help Mon Jun 17 23:19:15 2002
+++ working-2.5.30-kprobe/arch/i386/Config.help Mon Aug 5 15:05:46 2002
@@ -967,3 +967,9 @@ CONFIG_SOFTWARE_SUSPEND
absence of features.

For more information take a look at Documentation/swsusp.txt.
+
+CONFIG_KPROBES
+ Kprobes allows you to trap at almost any kernel address, using
+ register_kprobe(), and providing a callback function. This is useful
+ for kernel debugging, non-intrusive instrumentation and testing. If
+ in doubt, say "N".
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/config.in working-2.5.30-kprobe/arch/i386/config.in
--- linux-2.5.30/arch/i386/config.in Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobe/arch/i386/config.in Mon Aug 5 15:05:46 2002
@@ -415,6 +415,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; th
if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool ' Highmem debugging' CONFIG_DEBUG_HIGHMEM
fi
+ bool ' Probes' CONFIG_KPROBES
fi

endmenu
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/entry.S working-2.5.30-kprobe/arch/i386/kernel/entry.S
--- linux-2.5.30/arch/i386/kernel/entry.S Fri Aug 2 11:15:05 2002
+++ working-2.5.30-kprobe/arch/i386/kernel/entry.S Mon Aug 5 15:05:46 2002
@@ -430,9 +430,16 @@ device_not_available_emulate:
jmp ret_from_exception

ENTRY(debug)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_debug
- jmp error_code
+ pushl %edx
+ call do_debug
+ addl $8,%esp
+ testl %eax,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(nmi)
pushl %eax
@@ -445,9 +452,16 @@ ENTRY(nmi)
RESTORE_ALL

ENTRY(int3)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_int3
- jmp error_code
+ pushl %edx
+ call do_int3
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(overflow)
pushl $0
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/traps.c working-2.5.30-kprobe/arch/i386/kernel/traps.c
--- linux-2.5.30/arch/i386/kernel/traps.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobe/arch/i386/kernel/traps.c Mon Aug 5 16:25:50 2002
@@ -5,6 +5,9 @@
*
* Pentium III FXSR, SSE support
* Gareth Hughes <[email protected]>, May 2000
+ *
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
*/

/*
@@ -24,6 +27,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>

#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -50,6 +54,7 @@
#include <asm/cobalt.h>
#include <asm/lithium.h>
#endif
+#include <linux/hash.h>

#include <linux/irq.h>
#include <linux/module.h>
@@ -297,6 +302,159 @@ static inline void die_if_kernel(const c
die(str, regs, err);
}

+#ifdef CONFIG_KPROBES
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+#define EF_TF 0x00000100
+#define EF_IE 0x00000200
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/*
+ * We changed trap3/1 to an intr gate. So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+ if (regs->eflags & EF_IE)
+ __asm__ __volatile__ ("sti");
+}
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+ switch(opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->eip = (unsigned long)p->addr;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p;
+ u8 *addr = (u8 *)(regs->eip-1);
+
+ /* Lock, check we're not actually recursing */
+ if (kprobe_running()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ /* Not ours? Can't be delete race, since we hold lock. */
+ if (!p)
+ return 0;
+ disarm_kprobe(p, regs);
+ return 1;
+ }
+
+ lock_kprobes();
+ p = get_kprobe(addr);
+ if (!p) {
+ unlock_kprobes();
+ /* Unregistered (on another cpu) after this hit? Ignore */
+ if (*addr != BREAKPOINT_INSTRUCTION)
+ return 1;
+ /* Not one of ours: let kernel handle it */
+ restore_interrupts(regs);
+ return 0;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ current_kprobe = p;
+ kprobe_saved_eflags = kprobe_old_eflags = regs->eflags & (EF_TF|EF_IE);
+ if (is_IF_modifier(p->opcode))
+ kprobe_saved_eflags &= ~EF_IE;
+
+ p->pre_handler(p, regs);
+
+ regs->eflags |= EF_TF;
+ regs->eflags &= ~EF_IE;
+
+ /* We hold lock, now we remove breakpoint and single step. */
+ disarm_kprobe(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags &= ~EF_TF;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function. And we hold kprobe lock.
+ */
+static int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (current_kprobe->post_handler)
+ current_kprobe->post_handler(current_kprobe, regs, 0);
+
+ /*
+ * We singlestepped with interrupts disabled. So, the result on
+ * the stack would be incorrect for "pushfl" instruction.
+ */
+ if (current_kprobe->opcode == 0x9c) { /* pushfl */
+ regs->esp &= ~(EF_TF | EF_IE);
+ regs->esp |= kprobe_old_eflags;
+ }
+
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_saved_eflags;
+
+ unlock_kprobes();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & EF_TF) {
+ restore_interrupts(regs);
+ return 0;
+ }
+ return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (current_kprobe->fault_handler
+ && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_old_eflags;
+
+ unlock_kprobes();
+ }
+ return 0;
+}
+#else
+static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; }
+static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_KPROBES */
+
static inline unsigned long get_cr2(void)
{
unsigned long address;
@@ -326,6 +484,8 @@ static void inline do_trap(int trapnr, i
panic("do_trap: can't hit this");
}
#endif
+ if (kprobe_fault(regs, trapnr))
+ return;

if (!(regs->xcs & 3))
goto kernel_trap;
@@ -392,7 +552,6 @@ asmlinkage void do_##name(struct pt_regs
}

DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
@@ -408,6 +567,9 @@ asmlinkage void do_general_protection(st
{
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
+
+ if (kprobe_fault(regs, 13))
+ return;

if (!(regs->xcs & 3))
goto gp_in_kernel;
@@ -508,6 +670,14 @@ asmlinkage void do_nmi(struct pt_regs *
inb(0x71); /* dummy */
}

+asmlinkage int do_int3(struct pt_regs *regs, long error_code)
+{
+ if (kprobe_handler(regs))
+ return 1;
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ return 0;
+}
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -530,7 +700,7 @@ asmlinkage void do_nmi(struct pt_regs *
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-asmlinkage void do_debug(struct pt_regs * regs, long error_code)
+asmlinkage int do_debug(struct pt_regs * regs, long error_code)
{
unsigned int condition;
struct task_struct *tsk = current;
@@ -552,6 +722,8 @@ asmlinkage void do_debug(struct pt_regs

/* Mask out spurious TF errors due to lazy TF clearing */
if (condition & DR_STEP) {
+ if (kprobe_running() && post_kprobe_handler(regs))
+ return 1;
/*
* The TF error should be masked out only if the current
* process is not traced and if the TRAP flag has been set
@@ -588,15 +760,15 @@ clear_dr7:
__asm__("movl %0,%%db7"
: /* no output */
: "r" (0));
- return;
+ return 0;

debug_vm86:
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- return;
+ return 0;

clear_TF:
regs->eflags &= ~TF_MASK;
- return;
+ return 0;
}

/*
@@ -760,6 +932,8 @@ asmlinkage void math_state_restore(struc
struct task_struct *tsk = current;
clts(); /* Allow maths ops (or we recurse) */

+ if (kprobe_fault(&regs, 7))
+ return;
if (!tsk->used_math)
init_fpu(tsk);
restore_fpu(tsk);
@@ -943,9 +1117,9 @@ void __init trap_init(void)
#endif

set_trap_gate(0,&divide_error);
- set_trap_gate(1,&debug);
+ _set_gate(idt_table+1,14,3,&debug);
set_intr_gate(2,&nmi);
- set_system_gate(3,&int3); /* int3-5 can be called from all */
+ _set_gate(idt_table+3,14,3,&int3);
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/mm/fault.c working-2.5.30-kprobe/arch/i386/mm/fault.c
--- linux-2.5.30/arch/i386/mm/fault.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobe/arch/i386/mm/fault.c Mon Aug 5 15:05:46 2002
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -154,6 +155,9 @@ asmlinkage void do_page_fault(struct pt_

/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));
+
+ if (kprobe_fault(regs, 14))
+ return;

/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/asm-i386/kprobes.h working-2.5.30-kprobe/include/asm-i386/kprobes.h
--- linux-2.5.30/include/asm-i386/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobe/include/asm-i386/kprobes.h Mon Aug 5 15:15:45 2002
@@ -0,0 +1,29 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
+ * Mailing list: [email protected]
+ */
+#include <linux/smp.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+typedef u8 kprobe_opcode_t;
+
+#ifdef CONFIG_KPROBES
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+
+/* kprobe_running() not defined yet, so this is a macro. */
+#define kprobe_fault(regs, trapnr) \
+ (kprobe_running() && kprobe_fault_handler(regs, trapnr))
+
+#else /* !CONFIG_KPROBES */
+static inline int kprobe_fault(struct pt_regs *regs, int trapnr) { return 0; }
+#endif
+
+#endif /* _ASM_KPROBES_H */
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/linux/kprobes.h working-2.5.30-kprobe/include/linux/kprobes.h
--- linux-2.5.30/include/linux/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobe/include/linux/kprobes.h Mon Aug 5 15:16:05 2002
@@ -0,0 +1,54 @@
+#ifndef _LINUX_KPROBES_H
+#define _LINUX_KPROBES_H
+#include <linux/config.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+
+struct kprobe;
+struct pt_regs;
+
+typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *);
+typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *,
+ unsigned long flags);
+typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *,
+ int trapnr);
+
+struct kprobe {
+ struct list_head list;
+
+ /* location of the probe point */
+ kprobe_opcode_t *addr;
+
+ /* Called before addr is executed. */
+ kprobe_pre_handler_t pre_handler;
+
+ /* Called after addr is executed, unless... */
+ kprobe_post_handler_t post_handler;
+
+ /* ... called if executing addr causes a fault (eg. page fault).
+ * Return 1 if it handled fault, otherwise kernel will see it. */
+ kprobe_fault_handler_t fault_handler;
+
+ /* Saved opcode (which has been replaced with breakpoint) */
+ kprobe_opcode_t opcode;
+};
+
+#ifdef CONFIG_KPROBES
+/* Locks kprobe: irq must be disabled */
+void lock_kprobes(void);
+void unlock_kprobes(void);
+
+/* kprobe running now on this CPU? */
+int kprobe_running(void);
+
+/* Get the kprobe at this addr (if any). Must have called lock_kprobes */
+struct kprobe *get_kprobe(void *addr);
+
+int register_kprobe(struct kprobe *p);
+void unregister_kprobe(struct kprobe *p);
+#else
+static inline int kprobe_running(void) { return 0; }
+static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; }
+static inline void unregister_kprobe(struct kprobe *p) { }
+#endif
+#endif /* _LINUX_KPROBES_H */
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/Makefile working-2.5.30-kprobe/kernel/Makefile
--- linux-2.5.30/kernel/Makefile Sat Jul 27 15:24:39 2002
+++ working-2.5.30-kprobe/kernel/Makefile Mon Aug 5 16:01:58 2002
@@ -10,7 +10,7 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o platform.o suspend.o
+ printk.o platform.o suspend.o kprobes.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
@@ -23,6 +23,7 @@ obj-$(CONFIG_MODULES) += ksyms.o
obj-$(CONFIG_PM) += pm.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KPROBES) += kprobes.o futex.o

ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff -urNp -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/kprobes.c working-2.5.30-kprobe/kernel/kprobes.c
--- linux-2.5.30/kernel/kprobes.c Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobe/kernel/kprobes.c Mon Aug 5 16:25:58 2002
@@ -0,0 +1,97 @@
+/* Support for kernel probes.
+ (C) 2002 Vamsi Krishna S <[email protected]>.
+*/
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct list_head kprobe_table[KPROBE_TABLE_SIZE];
+
+static unsigned int kprobe_cpu = NR_CPUS;
+static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+
+int kprobe_running(void)
+{
+ return kprobe_cpu == smp_processor_id();
+}
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+ /* This is Linus' fault. We're in an interrupt, but ... */
+ preempt_disable();
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+ preempt_enable_no_resched();
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct list_head *head, *tmp;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ list_for_each(tmp, head) {
+ struct kprobe *p = list_entry(tmp, struct kprobe, list);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+
+ spin_lock_irq(&kprobe_lock);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ out:
+ spin_unlock_irq(&kprobe_lock);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ spin_lock_irq(&kprobe_lock);
+ *p->addr = p->opcode;
+ list_del(&p->list);
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ spin_unlock_irq(&kprobe_lock);
+}
+
+static int __init init_kprobes(void)
+{
+ int i;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kprobe_table[i]);
+
+ return 0;
+}
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);

2002-08-05 16:06:33

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30


On Mon, 5 Aug 2002, Rusty Russell wrote:
>
> In testing, I came up against the "spin_unlock() causes schedule()
> inside interrupt" problem.

It shouldn't cause a schedule, it should cause a big warning (with
complete trace) to be printed out. Or did you mean something else?

Maybe the warning should be changed to

Warning, kernel is mixing metaphors. "It's not rocket surgery".

to make it clear why it's a bad idea.

Linus

2002-08-06 07:32:58

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

In message <[email protected]> you wri
te:
>
> On Tue, 6 Aug 2002, Rusty Russell wrote:
> >
> > I am reading from this that we *should* be explicitly disabling
> > preemption in interrupt handlers if we rely on the cpu number not
> > changing underneath us, even if it's (a) currently unneccessary, and
> > (b) arch-specific code.
>
> But do_irq() already does that.

Right, that's what I wanted to check.

> You mean _exception_ handlers. It's definitely not unnecessary. Exceptions
> can very much be preempted.

The patch changes traps 1 and 3 (debug & int3) to interrupt gates
though.

In fact, the removal of the #ifdef CONFIG_KPROBES around that change
introduced a bug: we didn't reenable interrupts like the older code
expects.

Vamsi, what do you think of this patch? Is it neccessary to restore
interrupts before handle_vm86_trap (the original patch didn't do this
either, not sure if it's required).

Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Kprobes for i386
Author: Vamsi Krishna S
Status: Experimental

D: This patch allows trapping at almost any kernel address, useful for
D: various kernel-hacking tasks, and building on for more
D: infrastructure. This patch is x86 only, but other archs can add
D: support as required.

diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/Config.help working-2.5.30-kprobes/arch/i386/Config.help
--- linux-2.5.30/arch/i386/Config.help Mon Jun 17 23:19:15 2002
+++ working-2.5.30-kprobes/arch/i386/Config.help Tue Aug 6 16:52:59 2002
@@ -967,3 +967,9 @@ CONFIG_SOFTWARE_SUSPEND
absence of features.

For more information take a look at Documentation/swsusp.txt.
+
+CONFIG_KPROBES
+ Kprobes allows you to trap at almost any kernel address, using
+ register_kprobe(), and providing a callback function. This is useful
+ for kernel debugging, non-intrusive instrumentation and testing. If
+ in doubt, say "N".
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/config.in working-2.5.30-kprobes/arch/i386/config.in
--- linux-2.5.30/arch/i386/config.in Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/config.in Tue Aug 6 16:52:59 2002
@@ -415,6 +415,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; th
if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool ' Highmem debugging' CONFIG_DEBUG_HIGHMEM
fi
+ bool ' Probes' CONFIG_KPROBES
fi

endmenu
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/entry.S working-2.5.30-kprobes/arch/i386/kernel/entry.S
--- linux-2.5.30/arch/i386/kernel/entry.S Fri Aug 2 11:15:05 2002
+++ working-2.5.30-kprobes/arch/i386/kernel/entry.S Tue Aug 6 16:52:59 2002
@@ -430,9 +430,16 @@ device_not_available_emulate:
jmp ret_from_exception

ENTRY(debug)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_debug
- jmp error_code
+ pushl %edx
+ call do_debug
+ addl $8,%esp
+ testl %eax,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(nmi)
pushl %eax
@@ -445,9 +452,16 @@ ENTRY(nmi)
RESTORE_ALL

ENTRY(int3)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_int3
- jmp error_code
+ pushl %edx
+ call do_int3
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(overflow)
pushl $0
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/kernel/traps.c working-2.5.30-kprobes/arch/i386/kernel/traps.c
--- linux-2.5.30/arch/i386/kernel/traps.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/kernel/traps.c Tue Aug 6 17:04:15 2002
@@ -5,6 +5,9 @@
*
* Pentium III FXSR, SSE support
* Gareth Hughes <[email protected]>, May 2000
+ *
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
*/

/*
@@ -24,6 +27,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>

#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -50,6 +54,7 @@
#include <asm/cobalt.h>
#include <asm/lithium.h>
#endif
+#include <linux/hash.h>

#include <linux/irq.h>
#include <linux/module.h>
@@ -297,6 +302,162 @@ static inline void die_if_kernel(const c
die(str, regs, err);
}

+/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+ if (regs->eflags & IF_MASK)
+ __asm__ __volatile__ ("sti");
+}
+
+#ifdef CONFIG_KPROBES
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+ switch(opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->eip = (unsigned long)p->addr;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p;
+ int ret = 0;
+ u8 *addr = (u8 *)(regs->eip-1);
+
+ /* We're in an interrupt, but this is clear and BUG()-safe. */
+ preempt_disable();
+
+ /* Check we're not actually recursing */
+ if (kprobe_running()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ /* If it's not ours, can't be delete race, (we hold lock). */
+ if (p) {
+ disarm_kprobe(p, regs);
+ ret = 1;
+ }
+ goto out;
+ }
+
+ lock_kprobes();
+ p = get_kprobe(addr);
+ if (!p) {
+ unlock_kprobes();
+ /* Unregistered (on another cpu) after this hit? Ignore */
+ if (*addr != BREAKPOINT_INSTRUCTION)
+ ret = 1;
+ /* Not one of ours: let kernel handle it */
+ out:
+ preempt_enable_no_resched();
+ return ret;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ current_kprobe = p;
+ kprobe_saved_eflags = kprobe_old_eflags
+ = (regs->eflags & (TF_MASK|IF_MASK));
+ if (is_IF_modifier(p->opcode))
+ kprobe_saved_eflags &= ~IF_MASK;
+
+ p->pre_handler(p, regs);
+
+ regs->eflags |= TF_MASK;
+ regs->eflags &= ~IF_MASK;
+
+ /* We hold lock, now we remove breakpoint and single step. */
+ disarm_kprobe(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags &= ~TF_MASK;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function. And we hold kprobe lock.
+ */
+static int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (current_kprobe->post_handler)
+ current_kprobe->post_handler(current_kprobe, regs, 0);
+
+ /*
+ * We singlestepped with interrupts disabled. So, the result on
+ * the stack would be incorrect for "pushfl" instruction.
+ */
+ if (current_kprobe->opcode == 0x9c) { /* pushfl */
+ regs->esp &= ~(TF_MASK | IF_MASK);
+ regs->esp |= kprobe_old_eflags;
+ }
+
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_saved_eflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & TF_MASK)
+ return 0;
+ return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (current_kprobe->fault_handler
+ && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_old_eflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+ }
+ return 0;
+}
+#else
+static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; }
+static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_KPROBES */
+
static inline unsigned long get_cr2(void)
{
unsigned long address;
@@ -326,6 +487,8 @@ static void inline do_trap(int trapnr, i
panic("do_trap: can't hit this");
}
#endif
+ if (kprobe_running() && kprobe_fault_handler(regs, trapnr))
+ return;

if (!(regs->xcs & 3))
goto kernel_trap;
@@ -392,7 +555,6 @@ asmlinkage void do_##name(struct pt_regs
}

DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
@@ -408,6 +570,9 @@ asmlinkage void do_general_protection(st
{
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
+
+ if (kprobe_running() && kprobe_fault_handler(regs, 13))
+ return;

if (!(regs->xcs & 3))
goto gp_in_kernel;
@@ -508,6 +673,15 @@ asmlinkage void do_nmi(struct pt_regs *
inb(0x71); /* dummy */
}

+asmlinkage int do_int3(struct pt_regs *regs, long error_code)
+{
+ if (kprobe_handler(regs))
+ return 1;
+ restore_interrupts(regs);
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ return 0;
+}
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -530,7 +704,7 @@ asmlinkage void do_nmi(struct pt_regs *
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-asmlinkage void do_debug(struct pt_regs * regs, long error_code)
+asmlinkage int do_debug(struct pt_regs * regs, long error_code)
{
unsigned int condition;
struct task_struct *tsk = current;
@@ -552,6 +726,9 @@ asmlinkage void do_debug(struct pt_regs

/* Mask out spurious TF errors due to lazy TF clearing */
if (condition & DR_STEP) {
+ if (kprobe_running() && post_kprobe_handler(regs))
+ return 1;
+ restore_interrupts(regs);
/*
* The TF error should be masked out only if the current
* process is not traced and if the TRAP flag has been set
@@ -565,7 +742,8 @@ asmlinkage void do_debug(struct pt_regs
goto clear_TF;
if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
goto clear_TF;
- }
+ } else
+ restore_interrupts(regs);

/* Ok, finally something we can handle */
tsk->thread.trap_no = 1;
@@ -588,15 +766,16 @@ clear_dr7:
__asm__("movl %0,%%db7"
: /* no output */
: "r" (0));
- return;
+ return 0;

debug_vm86:
+ restore_interrupts(regs);
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- return;
+ return 0;

clear_TF:
regs->eflags &= ~TF_MASK;
- return;
+ return 0;
}

/*
@@ -760,6 +939,8 @@ asmlinkage void math_state_restore(struc
struct task_struct *tsk = current;
clts(); /* Allow maths ops (or we recurse) */

+ if (kprobe_running() && kprobe_fault_handler(&regs, 7))
+ return;
if (!tsk->used_math)
init_fpu(tsk);
restore_fpu(tsk);
@@ -943,9 +1124,9 @@ void __init trap_init(void)
#endif

set_trap_gate(0,&divide_error);
- set_trap_gate(1,&debug);
+ _set_gate(idt_table+1,14,3,&debug);
set_intr_gate(2,&nmi);
- set_system_gate(3,&int3); /* int3-5 can be called from all */
+ _set_gate(idt_table+3,14,3,&int3);
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/arch/i386/mm/fault.c working-2.5.30-kprobes/arch/i386/mm/fault.c
--- linux-2.5.30/arch/i386/mm/fault.c Sat Jul 27 15:24:35 2002
+++ working-2.5.30-kprobes/arch/i386/mm/fault.c Tue Aug 6 16:52:59 2002
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -154,6 +155,9 @@ asmlinkage void do_page_fault(struct pt_

/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));
+
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ return;

/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/asm-i386/kprobes.h working-2.5.30-kprobes/include/asm-i386/kprobes.h
--- linux-2.5.30/include/asm-i386/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/include/asm-i386/kprobes.h Tue Aug 6 16:52:59 2002
@@ -0,0 +1,20 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
+ * Mailing list: [email protected]
+ */
+#include <linux/smp.h>
+#include <linux/types.h>
+
+struct pt_regs;
+
+typedef u8 kprobe_opcode_t;
+
+/* Doesn't exist if !CONFIG_KPROBES, but calls optimized out. */
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+#endif /* _ASM_KPROBES_H */
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/include/linux/kprobes.h working-2.5.30-kprobes/include/linux/kprobes.h
--- linux-2.5.30/include/linux/kprobes.h Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/include/linux/kprobes.h Tue Aug 6 16:52:59 2002
@@ -0,0 +1,54 @@
+#ifndef _LINUX_KPROBES_H
+#define _LINUX_KPROBES_H
+#include <linux/config.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+
+struct kprobe;
+struct pt_regs;
+
+typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *);
+typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *,
+ unsigned long flags);
+typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *,
+ int trapnr);
+
+struct kprobe {
+ struct list_head list;
+
+ /* location of the probe point */
+ kprobe_opcode_t *addr;
+
+ /* Called before addr is executed. */
+ kprobe_pre_handler_t pre_handler;
+
+ /* Called after addr is executed, unless... */
+ kprobe_post_handler_t post_handler;
+
+ /* ... called if executing addr causes a fault (eg. page fault).
+ * Return 1 if it handled fault, otherwise kernel will see it. */
+ kprobe_fault_handler_t fault_handler;
+
+ /* Saved opcode (which has been replaced with breakpoint) */
+ kprobe_opcode_t opcode;
+};
+
+#ifdef CONFIG_KPROBES
+/* Locks kprobe: irq must be disabled */
+void lock_kprobes(void);
+void unlock_kprobes(void);
+
+/* kprobe running now on this CPU? */
+int kprobe_running(void);
+
+/* Get the kprobe at this addr (if any). Must have called lock_kprobes */
+struct kprobe *get_kprobe(void *addr);
+
+int register_kprobe(struct kprobe *p);
+void unregister_kprobe(struct kprobe *p);
+#else
+static inline int kprobe_running(void) { return 0; }
+static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; }
+static inline void unregister_kprobe(struct kprobe *p) { }
+#endif
+#endif /* _LINUX_KPROBES_H */
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/Makefile working-2.5.30-kprobes/kernel/Makefile
--- linux-2.5.30/kernel/Makefile Sat Jul 27 15:24:39 2002
+++ working-2.5.30-kprobes/kernel/Makefile Tue Aug 6 16:52:59 2002
@@ -10,7 +10,7 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o platform.o suspend.o
+ printk.o platform.o suspend.o kprobes.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
@@ -23,6 +23,7 @@ obj-$(CONFIG_MODULES) += ksyms.o
obj-$(CONFIG_PM) += pm.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KPROBES) += kprobes.o

ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.30/kernel/kprobes.c working-2.5.30-kprobes/kernel/kprobes.c
--- linux-2.5.30/kernel/kprobes.c Thu Jan 1 10:00:00 1970
+++ working-2.5.30-kprobes/kernel/kprobes.c Tue Aug 6 16:52:59 2002
@@ -0,0 +1,94 @@
+/* Support for kernel probes.
+ (C) 2002 Vamsi Krishna S <[email protected]>.
+*/
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct list_head kprobe_table[KPROBE_TABLE_SIZE];
+
+static unsigned int kprobe_cpu = NR_CPUS;
+static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+
+int kprobe_running(void)
+{
+ return kprobe_cpu == smp_processor_id();
+}
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct list_head *head, *tmp;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ list_for_each(tmp, head) {
+ struct kprobe *p = list_entry(tmp, struct kprobe, list);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+
+ spin_lock_irq(&kprobe_lock);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ out:
+ spin_unlock_irq(&kprobe_lock);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ spin_lock_irq(&kprobe_lock);
+ *p->addr = p->opcode;
+ list_del(&p->list);
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ spin_unlock_irq(&kprobe_lock);
+}
+
+static int __init init_kprobes(void)
+{
+ int i;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kprobe_table[i]);
+
+ return 0;
+}
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);

2002-08-06 05:45:26

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30


On Tue, 6 Aug 2002, Rusty Russell wrote:
>
> I am reading from this that we *should* be explicitly disabling
> preemption in interrupt handlers if we rely on the cpu number not
> changing underneath us, even if it's (a) currently unneccessary, and
> (b) arch-specific code.

But do_irq() already does that.

You mean _exception_ handlers. It's definitely not unnecessary. Exceptions
can very much be preempted.

Linus

2002-08-06 03:52:41

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

In message <[email protected]> you wri
te:
>
> On Mon, 5 Aug 2002, Rusty Russell wrote:
> >
> > In testing, I came up against the "spin_unlock() causes schedule()
> > inside interrupt" problem.
>
> It shouldn't cause a schedule, it should cause a big warning (with
> complete trace) to be printed out. Or did you mean something else?

Yes, that's what I meant.

> Maybe the warning should be changed to
>
> Warning, kernel is mixing metaphors. "It's not rocket surgery".
>
> to make it clear why it's a bad idea.

Oh yes, that's *much* clearer!

I am reading from this that we *should* be explicitly disabling
preemption in interrupt handlers if we rely on the cpu number not
changing underneath us, even if it's (a) currently unneccessary, and
(b) arch-specific code.

Yes?
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-08-06 07:55:43

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

On Tue, Aug 06, 2002 at 05:22:15PM +1000, Rusty Russell wrote:
> Vamsi, what do you think of this patch? Is it neccessary to restore
> interrupts before handle_vm86_trap (the original patch didn't do this
> either, not sure if it's required).

Any chance you could split the i386-specific kprobes code into
arch/i386/kernel/kprobes.c instead of bloating traps.c?

2002-08-06 10:46:22

by Vamsi Krishna S .

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

On Tue, Aug 06, 2002 at 08:59:18AM +0100, Christoph Hellwig wrote:
> On Tue, Aug 06, 2002 at 05:22:15PM +1000, Rusty Russell wrote:
> > Vamsi, what do you think of this patch? Is it neccessary to restore
> > interrupts before handle_vm86_trap (the original patch didn't do this
> > either, not sure if it's required).
>
> Any chance you could split the i386-specific kprobes code into
> arch/i386/kernel/kprobes.c instead of bloating traps.c?

Agreed. Please see the latest patch.

--
Vamsi Krishna S.
Linux Technology Center,
IBM Software Lab, Bangalore.
Ph: +91 80 5044959
Internet: [email protected]

2002-08-06 10:58:56

by Vamsi Krishna S .

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

Hi Rusty,

On Tue, Aug 06, 2002 at 05:22:15PM +1000, Rusty Russell wrote:
>
> > You mean _exception_ handlers. It's definitely not unnecessary. Exceptions
> > can very much be preempted.
>
> The patch changes traps 1 and 3 (debug & int3) to interrupt gates
> though.
>
> In fact, the removal of the #ifdef CONFIG_KPROBES around that change
> introduced a bug: we didn't reenable interrupts like the older code
> expects.
>
> Vamsi, what do you think of this patch? Is it neccessary to restore
> interrupts before handle_vm86_trap (the original patch didn't do this
> either, not sure if it's required).
>
Yes. It is necessary to restore interrupts before any of the normal
do_int3/do_debug code is executed. However, it could be done in a less
intrusive way. Please see the patch below, which does:

- move kprobes code from traps.c to kprobes.c in arch/i386/kernel
(per Christoph Hellwig)

- restore interrupts in all cases before returning to execute
do_int3/do_debug from kprobe_handler and post_kprobe_handler. This
brings down the size of the diff, keeps do_int3 and do_debug in
traps.c clean.

- move trap1 and trap3 interrupt gates only under CONFIG_KPROBES. Please
note that if we don't do this, we need to call restore_interrupts()
from the dummy (post_)kprobe_handler() in include/asm-i386/kprobes.h
when CONFIG_KPROBES is off. I didn't like this subtle side effect. hence
the #ifdef CONFIG_KPROBES around _set_trap_gate. Still, the calling
conventions of do_debug and do_int3 remain independent of CONFIG_KPROBES.

Comments?

Thanks,
--Vamsi
--
Vamsi Krishna S.
Linux Technology Center,
IBM Software Lab, Bangalore.
Ph: +91 80 5044959
Internet: [email protected]
--
>
> Name: Kprobes for i386
> Author: Vamsi Krishna S
> Status: Experimental
>
> D: This patch allows trapping at almost any kernel address, useful for
> D: various kernel-hacking tasks, and building on for more
> D: infrastructure. This patch is x86 only, but other archs can add
> D: support as required.
>

diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/Config.help 30-dp/arch/i386/Config.help
--- /usr/src/30-pure/arch/i386/Config.help 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/arch/i386/Config.help 2002-08-06 12:23:43.000000000 +0530
@@ -967,3 +967,9 @@
absence of features.

For more information take a look at Documentation/swsusp.txt.
+
+CONFIG_KPROBES
+ Kprobes allows you to trap at almost any kernel address, using
+ register_kprobe(), and providing a callback function. This is useful
+ for kernel debugging, non-intrusive instrumentation and testing. If
+ in doubt, say "N".
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/config.in 30-dp/arch/i386/config.in
--- /usr/src/30-pure/arch/i386/config.in 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/arch/i386/config.in 2002-08-06 12:23:43.000000000 +0530
@@ -415,6 +415,7 @@
if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool ' Highmem debugging' CONFIG_DEBUG_HIGHMEM
fi
+ bool ' Probes' CONFIG_KPROBES
fi

endmenu
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/kernel/entry.S 30-dp/arch/i386/kernel/entry.S
--- /usr/src/30-pure/arch/i386/kernel/entry.S 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/arch/i386/kernel/entry.S 2002-08-06 12:23:43.000000000 +0530
@@ -430,9 +430,16 @@
jmp ret_from_exception

ENTRY(debug)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_debug
- jmp error_code
+ pushl %edx
+ call do_debug
+ addl $8,%esp
+ testl %eax,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(nmi)
pushl %eax
@@ -445,9 +452,16 @@
RESTORE_ALL

ENTRY(int3)
+ pushl %eax
+ SAVE_ALL
+ movl %esp,%edx
pushl $0
- pushl $do_int3
- jmp error_code
+ pushl %edx
+ call do_int3
+ addl $8,%esp
+ cmpl $0,%eax
+ jnz restore_all
+ jmp ret_from_exception

ENTRY(overflow)
pushl $0
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/kernel/kprobes.c 30-dp/arch/i386/kernel/kprobes.c
--- /usr/src/30-pure/arch/i386/kernel/kprobes.c 1970-01-01 05:30:00.000000000 +0530
+++ 30-dp/arch/i386/kernel/kprobes.c 2002-08-06 13:34:59.000000000 +0530
@@ -0,0 +1,172 @@
+/*
+ * Support for kernel probes.
+ * (C) 2002 Vamsi Krishna S <[email protected]>.
+ */
+
+#include <linux/config.h>
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/preempt.h>
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags;
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(u8 opcode)
+{
+ switch(opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+ return 0;
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->eip = (unsigned long)p->addr;
+}
+
+/* trap3/1 are intr gates for kprobes. So, restore the status of IF,
+ * if necessary, before executing the original int3/1 (trap) handler.
+ */
+static inline void restore_interrupts(struct pt_regs *regs)
+{
+ if (regs->eflags & IF_MASK)
+ __asm__ __volatile__ ("sti");
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+int kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p;
+ int ret = 0;
+ u8 *addr = (u8 *)(regs->eip-1);
+
+ /* We're in an interrupt, but this is clear and BUG()-safe. */
+ preempt_disable();
+
+ /* Check we're not actually recursing */
+ if (kprobe_running()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ if (p) {
+ disarm_kprobe(p, regs);
+ ret = 1;
+ }
+ /* If it's not ours, can't be delete race, (we hold lock). */
+ goto no_kprobe;
+ }
+
+ lock_kprobes();
+ p = get_kprobe(addr);
+ if (!p) {
+ unlock_kprobes();
+ /* Unregistered (on another cpu) after this hit? Ignore */
+ if (*addr != BREAKPOINT_INSTRUCTION)
+ ret = 1;
+ /* Not one of ours: let kernel handle it */
+ goto no_kprobe;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ current_kprobe = p;
+ kprobe_saved_eflags = kprobe_old_eflags
+ = (regs->eflags & (TF_MASK|IF_MASK));
+ if (is_IF_modifier(p->opcode))
+ kprobe_saved_eflags &= ~IF_MASK;
+
+ p->pre_handler(p, regs);
+
+ regs->eflags |= TF_MASK;
+ regs->eflags &= ~IF_MASK;
+
+ /* We hold lock, now we remove breakpoint and single step. */
+ disarm_kprobe(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+
+no_kprobe:
+ preempt_enable_no_resched();
+ restore_interrupts(regs);
+ return ret;
+}
+
+static void rearm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags &= ~TF_MASK;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function. And we hold kprobe lock.
+ */
+int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (!kprobe_running())
+ goto no_kprobe;
+
+ if (current_kprobe->post_handler)
+ current_kprobe->post_handler(current_kprobe, regs, 0);
+
+ /*
+ * We singlestepped with interrupts disabled. So, the result on
+ * the stack would be incorrect for "pushfl" instruction.
+ */
+ if (current_kprobe->opcode == 0x9c) { /* pushfl */
+ regs->esp &= ~(TF_MASK | IF_MASK);
+ regs->esp |= kprobe_old_eflags;
+ }
+
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_saved_eflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & TF_MASK)
+ goto no_kprobe;
+
+ return 1;
+
+no_kprobe:
+ restore_interrupts(regs);
+ return 0;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (current_kprobe->fault_handler
+ && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ rearm_kprobe(current_kprobe, regs);
+ regs->eflags |= kprobe_old_eflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+ }
+ return 0;
+}
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/kernel/Makefile 30-dp/arch/i386/kernel/Makefile
--- /usr/src/30-pure/arch/i386/kernel/Makefile 2002-08-02 14:02:39.000000000 +0530
+++ 30-dp/arch/i386/kernel/Makefile 2002-08-06 13:19:12.000000000 +0530
@@ -26,6 +26,7 @@
obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KPROBES) += kprobes.o
ifdef CONFIG_VISWS
obj-y += setup-visws.o
obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/kernel/traps.c 30-dp/arch/i386/kernel/traps.c
--- /usr/src/30-pure/arch/i386/kernel/traps.c 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/arch/i386/kernel/traps.c 2002-08-06 15:01:18.000000000 +0530
@@ -24,6 +24,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>

#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -326,6 +327,8 @@
panic("do_trap: can't hit this");
}
#endif
+ if (kprobe_running() && kprobe_fault_handler(regs, trapnr))
+ return;

if (!(regs->xcs & 3))
goto kernel_trap;
@@ -392,7 +395,6 @@
}

DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
@@ -408,6 +410,9 @@
{
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
+
+ if (kprobe_running() && kprobe_fault_handler(regs, 13))
+ return;

if (!(regs->xcs & 3))
goto gp_in_kernel;
@@ -508,6 +513,14 @@
inb(0x71); /* dummy */
}

+asmlinkage int do_int3(struct pt_regs *regs, long error_code)
+{
+ if (kprobe_handler(regs))
+ return 1;
+ do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+ return 0;
+}
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -530,7 +543,7 @@
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-asmlinkage void do_debug(struct pt_regs * regs, long error_code)
+asmlinkage int do_debug(struct pt_regs * regs, long error_code)
{
unsigned int condition;
struct task_struct *tsk = current;
@@ -538,6 +551,9 @@

__asm__ __volatile__("movl %%db6,%0" : "=r" (condition));

+ if (post_kprobe_handler(regs))
+ return 1;
+
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
if (!tsk->thread.debugreg[7])
@@ -588,15 +604,15 @@
__asm__("movl %0,%%db7"
: /* no output */
: "r" (0));
- return;
+ return 0;

debug_vm86:
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
- return;
+ return 0;

clear_TF:
regs->eflags &= ~TF_MASK;
- return;
+ return 0;
}

/*
@@ -760,6 +776,8 @@
struct task_struct *tsk = current;
clts(); /* Allow maths ops (or we recurse) */

+ if (kprobe_running() && kprobe_fault_handler(&regs, 7))
+ return;
if (!tsk->used_math)
init_fpu(tsk);
restore_fpu(tsk);
@@ -943,9 +961,14 @@
#endif

set_trap_gate(0,&divide_error);
- set_trap_gate(1,&debug);
set_intr_gate(2,&nmi);
- set_system_gate(3,&int3); /* int3-5 can be called from all */
+#ifdef CONFIG_KPROBES
+ _set_gate(idt_table+1,14,3,&debug);
+ _set_gate(idt_table+3,14,3,&int3);
+#else
+ set_trap_gate(1,&debug);
+ set_system_gate(3,&int3); /* int3-5 can be called from all */
+#endif
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/arch/i386/mm/fault.c 30-dp/arch/i386/mm/fault.c
--- /usr/src/30-pure/arch/i386/mm/fault.c 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/arch/i386/mm/fault.c 2002-08-06 12:23:43.000000000 +0530
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
@@ -155,6 +156,9 @@
/* get the address */
__asm__("movl %%cr2,%0":"=r" (address));

+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ return;
+
/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
local_irq_enable();
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/include/asm-i386/kprobes.h 30-dp/include/asm-i386/kprobes.h
--- /usr/src/30-pure/include/asm-i386/kprobes.h 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/include/asm-i386/kprobes.h 2002-08-06 13:43:09.000000000 +0530
@@ -0,0 +1,24 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ * Dynamic Probes (kprobes) support
+ * Vamsi Krishna S <[email protected]>, July, 2002
+ * Mailing list: [email protected]
+ */
+#include <linux/types.h>
+
+struct pt_regs;
+
+typedef u8 kprobe_opcode_t;
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+#ifdef CONFIG_KPROBES
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int post_kprobe_handler(struct pt_regs *regs);
+extern int kprobe_handler(struct pt_regs *regs);
+#else /* !CONFIG_KPROBES */
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; }
+static inline int post_kprobe_handler(struct pt_regs *regs) { return 0; }
+static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
+#endif
+#endif /* _ASM_KPROBES_H */
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/include/linux/kprobes.h 30-dp/include/linux/kprobes.h
--- /usr/src/30-pure/include/linux/kprobes.h 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/include/linux/kprobes.h 2002-08-06 12:23:43.000000000 +0530
@@ -0,0 +1,54 @@
+#ifndef _LINUX_KPROBES_H
+#define _LINUX_KPROBES_H
+#include <linux/config.h>
+#include <linux/list.h>
+#include <asm/kprobes.h>
+
+struct kprobe;
+struct pt_regs;
+
+typedef void (*kprobe_pre_handler_t)(struct kprobe *, struct pt_regs *);
+typedef void (*kprobe_post_handler_t)(struct kprobe *, struct pt_regs *,
+ unsigned long flags);
+typedef int (*kprobe_fault_handler_t)(struct kprobe *, struct pt_regs *,
+ int trapnr);
+
+struct kprobe {
+ struct list_head list;
+
+ /* location of the probe point */
+ kprobe_opcode_t *addr;
+
+ /* Called before addr is executed. */
+ kprobe_pre_handler_t pre_handler;
+
+ /* Called after addr is executed, unless... */
+ kprobe_post_handler_t post_handler;
+
+ /* ... called if executing addr causes a fault (eg. page fault).
+ * Return 1 if it handled fault, otherwise kernel will see it. */
+ kprobe_fault_handler_t fault_handler;
+
+ /* Saved opcode (which has been replaced with breakpoint) */
+ kprobe_opcode_t opcode;
+};
+
+#ifdef CONFIG_KPROBES
+/* Locks kprobe: irq must be disabled */
+void lock_kprobes(void);
+void unlock_kprobes(void);
+
+/* kprobe running now on this CPU? */
+int kprobe_running(void);
+
+/* Get the kprobe at this addr (if any). Must have called lock_kprobes */
+struct kprobe *get_kprobe(void *addr);
+
+int register_kprobe(struct kprobe *p);
+void unregister_kprobe(struct kprobe *p);
+#else
+static inline int kprobe_running(void) { return 0; }
+static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; }
+static inline void unregister_kprobe(struct kprobe *p) { }
+#endif
+#endif /* _LINUX_KPROBES_H */
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/kernel/kprobes.c 30-dp/kernel/kprobes.c
--- /usr/src/30-pure/kernel/kprobes.c 1970-01-01 05:30:00.000000000 +0530
+++ 30-dp/kernel/kprobes.c 2002-08-06 12:23:43.000000000 +0530
@@ -0,0 +1,94 @@
+/* Support for kernel probes.
+ (C) 2002 Vamsi Krishna S <[email protected]>.
+*/
+#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+static struct list_head kprobe_table[KPROBE_TABLE_SIZE];
+
+static unsigned int kprobe_cpu = NR_CPUS;
+static spinlock_t kprobe_lock = SPIN_LOCK_UNLOCKED;
+
+int kprobe_running(void)
+{
+ return kprobe_cpu == smp_processor_id();
+}
+
+/* Locks kprobe: irqs must be disabled */
+void lock_kprobes(void)
+{
+ spin_lock(&kprobe_lock);
+ kprobe_cpu = smp_processor_id();
+}
+
+void unlock_kprobes(void)
+{
+ kprobe_cpu = NR_CPUS;
+ spin_unlock(&kprobe_lock);
+}
+
+/* You have to be holding the kprobe_lock */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct list_head *head, *tmp;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ list_for_each(tmp, head) {
+ struct kprobe *p = list_entry(tmp, struct kprobe, list);
+ if (p->addr == addr)
+ return p;
+ }
+ return NULL;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret = 0;
+
+ spin_lock_irq(&kprobe_lock);
+ if (get_kprobe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ list_add(&p->list, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ p->opcode = *p->addr;
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ out:
+ spin_unlock_irq(&kprobe_lock);
+ return ret;
+}
+
+void unregister_kprobe(struct kprobe *p)
+{
+ spin_lock_irq(&kprobe_lock);
+ *p->addr = p->opcode;
+ list_del(&p->list);
+ flush_icache_range(p->addr, p->addr + sizeof(kprobe_opcode_t));
+ spin_unlock_irq(&kprobe_lock);
+}
+
+static int __init init_kprobes(void)
+{
+ int i;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kprobe_table[i]);
+
+ return 0;
+}
+__initcall(init_kprobes);
+
+EXPORT_SYMBOL_GPL(register_kprobe);
+EXPORT_SYMBOL_GPL(unregister_kprobe);
diff -urN -X /home/vamsi/.dontdiff /usr/src/30-pure/kernel/Makefile 30-dp/kernel/Makefile
--- /usr/src/30-pure/kernel/Makefile 2002-08-05 11:33:52.000000000 +0530
+++ 30-dp/kernel/Makefile 2002-08-06 12:23:43.000000000 +0530
@@ -10,7 +10,7 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o platform.o suspend.o
+ printk.o platform.o suspend.o kprobes.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
@@ -23,6 +23,7 @@
obj-$(CONFIG_PM) += pm.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KPROBES) += kprobes.o

ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is

2002-08-06 16:31:39

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30


On Tue, 6 Aug 2002, Rusty Russell wrote:
>
> > You mean _exception_ handlers. It's definitely not unnecessary. Exceptions
> > can very much be preempted.
>
> The patch changes traps 1 and 3 (debug & int3) to interrupt gates
> though.

Yes, but then it enables interrupts at one point.

And I'm not saying that is wrong - I'm saying that the warning is really
because you didn't tell the kernel that it was _not_ wrong. The warning is
a "I got called with interrupts disabled, not nobody actually told me that
I shouldn't reschedule. I will refuse to reschedule (exactly because
interrupts weren't enabled), but I don't like the fact that somebody
apparently did things behind my back".

Think of the kernel as a grumpy girlfriend that you just stood up, and
bring flowers next time.

Linus

2002-08-07 01:58:18

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

In message <[email protected]> you write:
> - move trap1 and trap3 interrupt gates only under CONFIG_KPROBES. Please
> note that if we don't do this, we need to call restore_interrupts()
> from the dummy (post_)kprobe_handler() in include/asm-i386/kprobes.h
> when CONFIG_KPROBES is off. I didn't like this subtle side effect. hence
> the #ifdef CONFIG_KPROBES around _set_trap_gate. Still, the calling
> conventions of do_debug and do_int3 remain independent of CONFIG_KPROBES.

Hmm, I thought about this but then decided against it. Your way is
pretty subtle too: I think I prefer the restore_interrupts()
explicitly after the (failed) call to kprobe_handler, ie (on top of
your patch, which looks excellent):

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.30-kprobes-vamsi/arch/i386/kernel/traps.c working-2.5.30-kprobes/arch/i386/kernel/traps.c
--- working-2.5.30-kprobes-vamsi/arch/i386/kernel/traps.c 2002-08-07 10:45:26.000000000 +1000
+++ working-2.5.30-kprobes/arch/i386/kernel/traps.c 2002-08-07 10:51:28.000000000 +1000
@@ -517,6 +517,9 @@ asmlinkage int do_int3(struct pt_regs *r
{
if (kprobe_handler(regs))
return 1;
+ /* This is an interrupt gate, because kprobes wants interrupts
+ disabled. Normal trap handlers don't. */
+ restore_interrupts(regs);
do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
return 0;
}
@@ -554,6 +557,9 @@ asmlinkage int do_debug(struct pt_regs *
if (post_kprobe_handler(regs))
return 1;

+ /* Interrupts not disabled for normal trap handling. */
+ restore_interrupts(regs);
+
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
if (!tsk->thread.debugreg[7])
@@ -961,14 +967,9 @@ void __init trap_init(void)
#endif

set_trap_gate(0,&divide_error);
+ _set_gate(idt_table+1,14,3,&debug); /* debug trap for kprobes */
set_intr_gate(2,&nmi);
-#ifdef CONFIG_KPROBES
- _set_gate(idt_table+1,14,3,&debug);
- _set_gate(idt_table+3,14,3,&int3);
-#else
- set_trap_gate(1,&debug);
- set_system_gate(3,&int3); /* int3-5 can be called from all */
-#endif
+ _set_gate(idt_table+3,14,3,&int3); /* int3-5 can be called from all */
set_system_gate(4,&overflow);
set_system_gate(5,&bounds);
set_trap_gate(6,&invalid_op);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.5.30-kprobes-vamsi/arch/i386/kernel/kprobes.c working-2.5.30-kprobes/arch/i386/kernel/kprobes.c
--- working-2.5.30-kprobes-vamsi/arch/i386/kernel/kprobes.c 2002-08-07 10:45:25.000000000 +1000
+++ working-2.5.30-kprobes/arch/i386/kernel/kprobes.c 2002-08-07 10:52:15.000000000 +1000
@@ -102,7 +102,6 @@ int kprobe_handler(struct pt_regs *regs)

no_kprobe:
preempt_enable_no_resched();
- restore_interrupts(regs);
return ret;
}

@@ -119,7 +118,7 @@ static void rearm_kprobe(struct kprobe *
int post_kprobe_handler(struct pt_regs *regs)
{
if (!kprobe_running())
- goto no_kprobe;
+ return 0;

if (current_kprobe->post_handler)
current_kprobe->post_handler(current_kprobe, regs, 0);
@@ -145,13 +144,9 @@ int post_kprobe_handler(struct pt_regs *
* of do_debug, as if this is not a probe hit.
*/
if (regs->eflags & TF_MASK)
- goto no_kprobe;
+ return 0;

return 1;
-
-no_kprobe:
- restore_interrupts(regs);
- return 0;
}

/* Interrupts disabled, kprobe_lock held. */
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

2002-08-07 04:45:34

by Vamsi Krishna S .

[permalink] [raw]
Subject: Re: [PATCH] kprobes for 2.5.30

Hi Rusty,

On Wed, Aug 07, 2002 at 10:55:04AM +1000, Rusty Russell wrote:
> In message <[email protected]> you write:
> > - move trap1 and trap3 interrupt gates only under CONFIG_KPROBES. Please
> > note that if we don't do this, we need to call restore_interrupts()
> > from the dummy (post_)kprobe_handler() in include/asm-i386/kprobes.h
> > when CONFIG_KPROBES is off. I didn't like this subtle side effect. hence
> > the #ifdef CONFIG_KPROBES around _set_trap_gate. Still, the calling
> > conventions of do_debug and do_int3 remain independent of CONFIG_KPROBES.
>
> Hmm, I thought about this but then decided against it. Your way is
> pretty subtle too: I think I prefer the restore_interrupts()
> explicitly after the (failed) call to kprobe_handler, ie (on top of
> your patch, which looks excellent):

I agree this one is even better.

Thanks,
Vamsi.

> <snip patch>

--
Vamsi Krishna S.
Linux Technology Center,
IBM Software Lab, Bangalore.
Ph: +91 80 5044959
Internet: [email protected]