2013-07-30 22:53:34

by Seiji Aguchi

[permalink] [raw]
Subject: [RFC][PATCH] Introduce page fault tracepoint

This patch introduces page fault tracepoints to the x86 architecture
by switching the IDT.

[Use case of page fault events]

Two events, one for user space and one for kernel space, are introduced at
the beginning of the page fault handler.

- User space event
A page fault event for user space has been requested, as shown below.

http://marc.info/?l=linux-mm&m=136807959830182&w=2
http://marc.info/?l=linux-mm&m=136807959130175&w=2

- Kernel space event:
Enabling this event makes the overhead in kernel space measurable.

[Creating IDT]

The IDT is created as follows.

- Introduce set_intr_gate_raw() to register only the non-trace handler in the IDT.
This is used at boot time, when tracing is disabled.
- Make set_intr_gate() a macro so that it registers the trace handler in the
trace IDT and the non-trace handler in the normal IDT.

Signed-off-by: Seiji Aguchi <[email protected]>
---
arch/x86/include/asm/desc.h | 33 +++++++++++++++++---
arch/x86/include/asm/hw_irq.h | 14 ++++++++-
arch/x86/include/asm/trace/exceptions.h | 51 +++++++++++++++++++++++++++++++
arch/x86/include/asm/traps.h | 22 +++++++++++++
arch/x86/kernel/entry_32.S | 10 ++++++
arch/x86/kernel/entry_64.S | 13 +++++++-
arch/x86/kernel/head64.c | 2 +-
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/kvm.c | 2 +-
arch/x86/kernel/traps.c | 28 ++++++++--------
arch/x86/mm/Makefile | 2 +
arch/x86/mm/fault.c | 22 +++++++++++++
12 files changed, 177 insertions(+), 24 deletions(-)
create mode 100644 arch/x86/include/asm/trace/exceptions.h

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index b90e5df..c04302b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -327,10 +327,28 @@ static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
{
write_idt_entry(trace_idt_table, entry, gate);
}
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_trace_idt_entry(gate, &s);
+}
#else
static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
{
}
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+}
#endif

static inline void _set_gate(int gate, unsigned type, void *addr,
@@ -353,12 +371,20 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
* Pentium F0 0F bugfix can have resulted in the mapped
* IDT being write-protected.
*/
-static inline void set_intr_gate(unsigned int n, void *addr)
+static inline void set_intr_gate_raw(unsigned int n, void *addr)
{
BUG_ON((unsigned)n > 0xFF);
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}

+#define set_intr_gate(n, addr) \
+ do { \
+ BUG_ON((unsigned)n > 0xFF); \
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); \
+ _trace_set_gate(n, GATE_INTERRUPT, trace_##addr, 0, 0, \
+ __KERNEL_CS); \
+ } while (0)
+
extern int first_system_vector;
/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
extern unsigned long used_vectors[];
@@ -395,10 +421,7 @@ static inline void trace_set_intr_gate(unsigned int gate, void *addr)
#define __trace_alloc_intr_gate(n, addr)
#endif

-static inline void __alloc_intr_gate(unsigned int n, void *addr)
-{
- set_intr_gate(n, addr);
-}
+#define __alloc_intr_gate(n, addr) set_intr_gate(n, addr)

#define alloc_intr_gate(n, addr) \
do { \
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e4ac559..fbd73b7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -89,10 +89,22 @@ extern void trace_reschedule_interrupt(void);
extern void trace_threshold_interrupt(void);
extern void trace_call_function_interrupt(void);
extern void trace_call_function_single_interrupt(void);
+#else /* CONFIG_TRACING */
+#define trace_apic_timer_interrupt apic_timer_interrupt
+#define trace_x86_platform_ipi x86_platform_ipi
+#define trace_error_interrupt error_interrupt
+#define trace_irq_work_interrupt irq_work_interrupt
+#define trace_spurious_interrupt spurious_interrupt
+#define trace_thermal_interrupt thermal_interrupt
+#define trace_reschedule_interrupt reschedule_interrupt
+#define trace_threshold_interrupt threshold_interrupt
+#define trace_call_function_interrupt call_function_interrupt
+#define trace_call_function_single_interrupt call_function_single_interrupt
+#endif
+
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
-#endif /* CONFIG_TRACING */

/* IOAPIC */
#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
new file mode 100644
index 0000000..660fcf1
--- /dev/null
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -0,0 +1,51 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM exceptions
+
+#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_FAULT_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+DECLARE_EVENT_CLASS(x86_exceptions,
+
+ TP_PROTO(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code),
+
+ TP_ARGS(address, regs, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, address )
+ __field( struct pt_regs *, regs )
+ __field( unsigned long, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->regs = regs;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address=0x%lx regs=0x%p error_code=0x%lx",
+ __entry->address, __entry->regs, __entry->error_code) );
+
+#define DEFINE_PAGE_FAULT_EVENT(name) \
+DEFINE_EVENT_FN(x86_exceptions, name, \
+ TP_PROTO(unsigned long address, struct pt_regs *regs, \
+ unsigned long error_code), \
+ TP_ARGS(address, regs, error_code), \
+ trace_irq_vector_regfunc, \
+ trace_irq_vector_unregfunc);
+
+DEFINE_PAGE_FAULT_EVENT(user_page_fault);
+DEFINE_PAGE_FAULT_EVENT(kernel_page_fault);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE exceptions
+#endif /* _TRACE_PAGE_FAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 88eae2a..adf9258 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,6 +41,25 @@ asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);

+#ifdef CONFIG_TRACING
+asmlinkage void trace_page_fault(void);
+#else
+#define trace_page_fault page_fault
+#endif
+#define trace_divide_error divide_error
+#define trace_bounds bounds
+#define trace_invalid_op invalid_op
+#define trace_device_not_available device_not_available
+#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
+#define trace_invalid_TSS invalid_TSS
+#define trace_segment_not_present segment_not_present
+#define trace_general_protection general_protection
+#define trace_spurious_interrupt_bug spurious_interrupt_bug
+#define trace_coprocessor_error coprocessor_error
+#define trace_alignment_check alignment_check
+#define trace_simd_coprocessor_error simd_coprocessor_error
+#define trace_async_page_fault async_page_fault
+
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
dotraplinkage void do_nmi(struct pt_regs *, long);
@@ -59,6 +78,9 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+#ifdef CONFIG_TRACING
+dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#endif
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a..c9eb4e2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1244,6 +1244,16 @@ return_to_handler:
*/
.pushsection .kprobes.text, "ax"

+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ RING0_EC_FRAME
+ ASM_CLAC
+ pushl_cfi $trace_do_page_fault
+ jmp error_code
+ CFI_ENDPROC
+END(trace_page_fault)
+#endif
+
ENTRY(page_fault)
RING0_EC_FRAME
ASM_CLAC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..5136404 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1295,6 +1295,17 @@ ENTRY(\sym)
END(\sym)
.endm

+#ifdef CONFIG_TRACING
+.macro trace_errorentry sym do_sym
+errorentry trace(\sym) trace(\do_sym)
+errorentry \sym \do_sym
+.endm
+#else
+.macro trace_errorentry sym do_sym
+errorentry \sym \do_sym
+.endm
+#endif
+
/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
@@ -1497,7 +1508,7 @@ zeroentry xen_int3 do_int3
errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
+trace_errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 55b6761..67a0649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -162,7 +162,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();

for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, &early_idt_handlers[i]);
+ set_intr_gate_raw(i, &early_idt_handlers[i]);
load_idt((const struct desc_ptr *)&idt_descr);

copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc..2ca2354 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -206,7 +206,7 @@ void __init native_init_IRQ(void)
i = FIRST_EXTERNAL_VECTOR;
for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate_raw(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}

if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a96d32c..12b384e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -462,7 +462,7 @@ static struct notifier_block kvm_cpu_notifier = {

static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, &async_page_fault);
+ set_intr_gate(14, async_page_fault);
}

void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1b23a1c..eadd251 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -709,7 +709,7 @@ void __init early_trap_init(void)
/* int3 can be called from all */
set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
load_idt(&idt_descr);
}
@@ -717,7 +717,7 @@ void __init early_trap_init(void)
void __init early_trap_pf_init(void)
{
#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
}

@@ -733,30 +733,30 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif

- set_intr_gate(X86_TRAP_DE, &divide_error);
+ set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, &bounds);
- set_intr_gate(X86_TRAP_UD, &invalid_op);
- set_intr_gate(X86_TRAP_NM, &device_not_available);
+ set_intr_gate(X86_TRAP_BR, bounds);
+ set_intr_gate(X86_TRAP_UD, invalid_op);
+ set_intr_gate(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
- set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, &invalid_TSS);
- set_intr_gate(X86_TRAP_NP, &segment_not_present);
+ set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+ set_intr_gate(X86_TRAP_TS, invalid_TSS);
+ set_intr_gate(X86_TRAP_NP, segment_not_present);
set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(X86_TRAP_GP, &general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, &coprocessor_error);
- set_intr_gate(X86_TRAP_AC, &alignment_check);
+ set_intr_gate(X86_TRAP_GP, general_protection);
+ set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+ set_intr_gate(X86_TRAP_MF, coprocessor_error);
+ set_intr_gate(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
- set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
+ set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);

/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5f..6a19ad9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)

+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4a..f515154 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,9 @@
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_START */

+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
/*
* Page fault error code bits:
*
@@ -1230,3 +1233,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
}
+
+static void trace_page_fault_entries(struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_user_page_fault(read_cr2(), regs, error_code);
+ else
+ trace_kernel_page_fault(read_cr2(), regs, error_code);
+}
+
+dotraplinkage void __kprobes
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ enum ctx_state prev_state;
+ prev_state = exception_enter();
+ trace_page_fault_entries(regs, error_code);
+ __do_page_fault(regs, error_code);
+ exception_exit(prev_state);
+}
--
1.7.1


2013-08-09 19:33:58

by Seiji Aguchi

[permalink] [raw]
Subject: RE: [RFC][PATCH] Introduce page fault tracepoint

Any comment?

> -----Original Message-----
> From: Seiji Aguchi [mailto:[email protected]]
> Sent: Tuesday, July 30, 2013 6:53 PM
> To: [email protected]; [email protected]
> Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; Tomoki Sekiyama
> Subject: [RFC][PATCH] Introduce page fault tracepoint
>
> This patch introduces page fault tracepoints to x86 architecture
> by switching IDT.
>
> [Use case of page fault events]
>
> Two events, for user and kernel spaces, are introduced at the beginning of
> page fault handler.
>
> - User space event
> There is a request of page fault event for user space as below.
>
> http://marc.info/?l=linux-mm&m=136807959830182&w=2
> http://marc.info/?l=linux-mm&m=136807959130175&w=2
>
> - Kernel space event:
> Overhead in kernel space is measurable by enabling it.
>
> [Creating IDT]
>
> A way to create IDT is as below.
>
> - Introduce set_intr_gate_raw() to register just non-trace handler to IDT.
> This is used at boot time which tracing is disabled.
> - Make set_intr_gate() macro so that it can register trace handler to
> trace IDT and non-trace handler to normal IDT.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
> ---
> arch/x86/include/asm/desc.h | 33 +++++++++++++++++---
> arch/x86/include/asm/hw_irq.h | 14 ++++++++-
> arch/x86/include/asm/trace/exceptions.h | 51 +++++++++++++++++++++++++++++++
> arch/x86/include/asm/traps.h | 22 +++++++++++++
> arch/x86/kernel/entry_32.S | 10 ++++++
> arch/x86/kernel/entry_64.S | 13 +++++++-
> arch/x86/kernel/head64.c | 2 +-
> arch/x86/kernel/irqinit.c | 2 +-
> arch/x86/kernel/kvm.c | 2 +-
> arch/x86/kernel/traps.c | 28 ++++++++--------
> arch/x86/mm/Makefile | 2 +
> arch/x86/mm/fault.c | 22 +++++++++++++
> 12 files changed, 177 insertions(+), 24 deletions(-)
> create mode 100644 arch/x86/include/asm/trace/exceptions.h
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index b90e5df..c04302b 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -327,10 +327,28 @@ static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
> {
> write_idt_entry(trace_idt_table, entry, gate);
> }
> +
> +static inline void _trace_set_gate(int gate, unsigned type, void *addr,
> + unsigned dpl, unsigned ist, unsigned seg)
> +{
> + gate_desc s;
> +
> + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
> + /*
> + * does not need to be atomic because it is only done once at
> + * setup time
> + */
> + write_trace_idt_entry(gate, &s);
> +}
> #else
> static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
> {
> }
> +
> +static inline void _trace_set_gate(int gate, unsigned type, void *addr,
> + unsigned dpl, unsigned ist, unsigned seg)
> +{
> +}
> #endif
>
> static inline void _set_gate(int gate, unsigned type, void *addr,
> @@ -353,12 +371,20 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
> * Pentium F0 0F bugfix can have resulted in the mapped
> * IDT being write-protected.
> */
> -static inline void set_intr_gate(unsigned int n, void *addr)
> +static inline void set_intr_gate_raw(unsigned int n, void *addr)
> {
> BUG_ON((unsigned)n > 0xFF);
> _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
> }
>
> +#define set_intr_gate(n, addr) \
> + do { \
> + BUG_ON((unsigned)n > 0xFF); \
> + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); \
> + _trace_set_gate(n, GATE_INTERRUPT, trace_##addr, 0, 0, \
> + __KERNEL_CS); \
> + } while (0)
> +
> extern int first_system_vector;
> /* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
> extern unsigned long used_vectors[];
> @@ -395,10 +421,7 @@ static inline void trace_set_intr_gate(unsigned int gate, void *addr)
> #define __trace_alloc_intr_gate(n, addr)
> #endif
>
> -static inline void __alloc_intr_gate(unsigned int n, void *addr)
> -{
> - set_intr_gate(n, addr);
> -}
> +#define __alloc_intr_gate(n, addr) set_intr_gate(n, addr)
>
> #define alloc_intr_gate(n, addr) \
> do { \
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index e4ac559..fbd73b7 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -89,10 +89,22 @@ extern void trace_reschedule_interrupt(void);
> extern void trace_threshold_interrupt(void);
> extern void trace_call_function_interrupt(void);
> extern void trace_call_function_single_interrupt(void);
> +#else /* CONFIG_TRACING */
> +#define trace_apic_timer_interrupt apic_timer_interrupt
> +#define trace_x86_platform_ipi x86_platform_ipi
> +#define trace_error_interrupt error_interrupt
> +#define trace_irq_work_interrupt irq_work_interrupt
> +#define trace_spurious_interrupt spurious_interrupt
> +#define trace_thermal_interrupt thermal_interrupt
> +#define trace_reschedule_interrupt reschedule_interrupt
> +#define trace_threshold_interrupt threshold_interrupt
> +#define trace_call_function_interrupt call_function_interrupt
> +#define trace_call_function_single_interrupt call_function_single_interrupt
> +#endif
> +
> #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
> #define trace_reboot_interrupt reboot_interrupt
> #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
> -#endif /* CONFIG_TRACING */
>
> /* IOAPIC */
> #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
> diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
> new file mode 100644
> index 0000000..660fcf1
> --- /dev/null
> +++ b/arch/x86/include/asm/trace/exceptions.h
> @@ -0,0 +1,51 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM exceptions
> +
> +#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_PAGE_FAULT_H
> +
> +#include <linux/tracepoint.h>
> +
> +extern void trace_irq_vector_regfunc(void);
> +extern void trace_irq_vector_unregfunc(void);
> +
> +DECLARE_EVENT_CLASS(x86_exceptions,
> +
> + TP_PROTO(unsigned long address, struct pt_regs *regs,
> + unsigned long error_code),
> +
> + TP_ARGS(address, regs, error_code),
> +
> + TP_STRUCT__entry(
> + __field( unsigned long, address )
> + __field( struct pt_regs *, regs )
> + __field( unsigned long, error_code )
> + ),
> +
> + TP_fast_assign(
> + __entry->address = address;
> + __entry->regs = regs;
> + __entry->error_code = error_code;
> + ),
> +
> + TP_printk("address=0x%lx regs=0x%p error_code=0x%lx",
> + __entry->address, __entry->regs, __entry->error_code) );
> +
> +#define DEFINE_PAGE_FAULT_EVENT(name) \
> +DEFINE_EVENT_FN(x86_exceptions, name, \
> + TP_PROTO(unsigned long address, struct pt_regs *regs, \
> + unsigned long error_code), \
> + TP_ARGS(address, regs, error_code), \
> + trace_irq_vector_regfunc, \
> + trace_irq_vector_unregfunc);
> +
> +DEFINE_PAGE_FAULT_EVENT(user_page_fault);
> +DEFINE_PAGE_FAULT_EVENT(kernel_page_fault);
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#define TRACE_INCLUDE_FILE exceptions
> +#endif /* _TRACE_PAGE_FAULT_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 88eae2a..adf9258 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -41,6 +41,25 @@ asmlinkage void machine_check(void);
> #endif /* CONFIG_X86_MCE */
> asmlinkage void simd_coprocessor_error(void);
>
> +#ifdef CONFIG_TRACING
> +asmlinkage void trace_page_fault(void);
> +#else
> +#define trace_page_fault page_fault
> +#endif
> +#define trace_divide_error divide_error
> +#define trace_bounds bounds
> +#define trace_invalid_op invalid_op
> +#define trace_device_not_available device_not_available
> +#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
> +#define trace_invalid_TSS invalid_TSS
> +#define trace_segment_not_present segment_not_present
> +#define trace_general_protection general_protection
> +#define trace_spurious_interrupt_bug spurious_interrupt_bug
> +#define trace_coprocessor_error coprocessor_error
> +#define trace_alignment_check alignment_check
> +#define trace_simd_coprocessor_error simd_coprocessor_error
> +#define trace_async_page_fault async_page_fault
> +
> dotraplinkage void do_divide_error(struct pt_regs *, long);
> dotraplinkage void do_debug(struct pt_regs *, long);
> dotraplinkage void do_nmi(struct pt_regs *, long);
> @@ -59,6 +78,9 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
> #endif
> dotraplinkage void do_general_protection(struct pt_regs *, long);
> dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
> +#ifdef CONFIG_TRACING
> +dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
> +#endif
> dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
> dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
> dotraplinkage void do_alignment_check(struct pt_regs *, long);
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index 2cfbc3a..c9eb4e2 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -1244,6 +1244,16 @@ return_to_handler:
> */
> .pushsection .kprobes.text, "ax"
>
> +#ifdef CONFIG_TRACING
> +ENTRY(trace_page_fault)
> + RING0_EC_FRAME
> + ASM_CLAC
> + pushl_cfi $trace_do_page_fault
> + jmp error_code
> + CFI_ENDPROC
> +END(trace_page_fault)
> +#endif
> +
> ENTRY(page_fault)
> RING0_EC_FRAME
> ASM_CLAC
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 1b69951..5136404 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1295,6 +1295,17 @@ ENTRY(\sym)
> END(\sym)
> .endm
>
> +#ifdef CONFIG_TRACING
> +.macro trace_errorentry sym do_sym
> +errorentry trace(\sym) trace(\do_sym)
> +errorentry \sym \do_sym
> +.endm
> +#else
> +.macro trace_errorentry sym do_sym
> +errorentry \sym \do_sym
> +.endm
> +#endif
> +
> /* error code is on the stack already */
> .macro paranoiderrorentry sym do_sym
> ENTRY(\sym)
> @@ -1497,7 +1508,7 @@ zeroentry xen_int3 do_int3
> errorentry xen_stack_segment do_stack_segment
> #endif
> errorentry general_protection do_general_protection
> -errorentry page_fault do_page_fault
> +trace_errorentry page_fault do_page_fault
> #ifdef CONFIG_KVM_GUEST
> errorentry async_page_fault do_async_page_fault
> #endif
> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> index 55b6761..67a0649 100644
> --- a/arch/x86/kernel/head64.c
> +++ b/arch/x86/kernel/head64.c
> @@ -162,7 +162,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
> clear_bss();
>
> for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
> - set_intr_gate(i, &early_idt_handlers[i]);
> + set_intr_gate_raw(i, &early_idt_handlers[i]);
> load_idt((const struct desc_ptr *)&idt_descr);
>
> copy_bootdata(__va(real_mode_data));
> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
> index a2a1fbc..2ca2354 100644
> --- a/arch/x86/kernel/irqinit.c
> +++ b/arch/x86/kernel/irqinit.c
> @@ -206,7 +206,7 @@ void __init native_init_IRQ(void)
> i = FIRST_EXTERNAL_VECTOR;
> for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
> /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
> - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
> + set_intr_gate_raw(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
> }
>
> if (!acpi_ioapic && !of_ioapic)
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index a96d32c..12b384e 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -462,7 +462,7 @@ static struct notifier_block kvm_cpu_notifier = {
>
> static void __init kvm_apf_trap_init(void)
> {
> - set_intr_gate(14, &async_page_fault);
> + set_intr_gate(14, async_page_fault);
> }
>
> void __init kvm_guest_init(void)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 1b23a1c..eadd251 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -709,7 +709,7 @@ void __init early_trap_init(void)
> /* int3 can be called from all */
> set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
> #ifdef CONFIG_X86_32
> - set_intr_gate(X86_TRAP_PF, &page_fault);
> + set_intr_gate(X86_TRAP_PF, page_fault);
> #endif
> load_idt(&idt_descr);
> }
> @@ -717,7 +717,7 @@ void __init early_trap_init(void)
> void __init early_trap_pf_init(void)
> {
> #ifdef CONFIG_X86_64
> - set_intr_gate(X86_TRAP_PF, &page_fault);
> + set_intr_gate(X86_TRAP_PF, page_fault);
> #endif
> }
>
> @@ -733,30 +733,30 @@ void __init trap_init(void)
> early_iounmap(p, 4);
> #endif
>
> - set_intr_gate(X86_TRAP_DE, &divide_error);
> + set_intr_gate(X86_TRAP_DE, divide_error);
> set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
> /* int4 can be called from all */
> set_system_intr_gate(X86_TRAP_OF, &overflow);
> - set_intr_gate(X86_TRAP_BR, &bounds);
> - set_intr_gate(X86_TRAP_UD, &invalid_op);
> - set_intr_gate(X86_TRAP_NM, &device_not_available);
> + set_intr_gate(X86_TRAP_BR, bounds);
> + set_intr_gate(X86_TRAP_UD, invalid_op);
> + set_intr_gate(X86_TRAP_NM, device_not_available);
> #ifdef CONFIG_X86_32
> set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
> #else
> set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
> #endif
> - set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
> - set_intr_gate(X86_TRAP_TS, &invalid_TSS);
> - set_intr_gate(X86_TRAP_NP, &segment_not_present);
> + set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
> + set_intr_gate(X86_TRAP_TS, invalid_TSS);
> + set_intr_gate(X86_TRAP_NP, segment_not_present);
> set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
> - set_intr_gate(X86_TRAP_GP, &general_protection);
> - set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
> - set_intr_gate(X86_TRAP_MF, &coprocessor_error);
> - set_intr_gate(X86_TRAP_AC, &alignment_check);
> + set_intr_gate(X86_TRAP_GP, general_protection);
> + set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
> + set_intr_gate(X86_TRAP_MF, coprocessor_error);
> + set_intr_gate(X86_TRAP_AC, alignment_check);
> #ifdef CONFIG_X86_MCE
> set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
> #endif
> - set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
> + set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
>
> /* Reserve all the builtin and the syscall vector: */
> for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 23d8e5f..6a19ad9 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
> CFLAGS_physaddr.o := $(nostackp)
> CFLAGS_setup_nx.o := $(nostackp)
>
> +CFLAGS_fault.o := -I$(src)/../include/asm/trace
> +
> obj-$(CONFIG_X86_PAT) += pat_rbtree.o
> obj-$(CONFIG_SMP) += tlb.o
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 654be4a..f515154 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -20,6 +20,9 @@
> #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
> #include <asm/fixmap.h> /* VSYSCALL_START */
>
> +#define CREATE_TRACE_POINTS
> +#include <asm/trace/exceptions.h>
> +
> /*
> * Page fault error code bits:
> *
> @@ -1230,3 +1233,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
> __do_page_fault(regs, error_code);
> exception_exit(prev_state);
> }
> +
> +static void trace_page_fault_entries(struct pt_regs *regs,
> + unsigned long error_code)
> +{
> + if (user_mode(regs))
> + trace_user_page_fault(read_cr2(), regs, error_code);
> + else
> + trace_kernel_page_fault(read_cr2(), regs, error_code);
> +}
> +
> +dotraplinkage void __kprobes
> +trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
> +{
> + enum ctx_state prev_state;
> + prev_state = exception_enter();
> + trace_page_fault_entries(regs, error_code);
> + __do_page_fault(regs, error_code);
> + exception_exit(prev_state);
> +}
> --
> 1.7.1

[unreadable mailing-list footer: mis-encoded in the archive]

2013-08-22 14:16:03

by Steven Rostedt

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce page fault tracepoint

On Tue, 30 Jul 2013 18:52:33 -0400
Seiji Aguchi <[email protected]> wrote:

> /* IOAPIC */
> #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
> diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
> new file mode 100644
> index 0000000..660fcf1
> --- /dev/null
> +++ b/arch/x86/include/asm/trace/exceptions.h
> @@ -0,0 +1,51 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM exceptions
> +
> +#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_PAGE_FAULT_H
> +
> +#include <linux/tracepoint.h>
> +
> +extern void trace_irq_vector_regfunc(void);
> +extern void trace_irq_vector_unregfunc(void);
> +
> +DECLARE_EVENT_CLASS(x86_exceptions,
> +
> + TP_PROTO(unsigned long address, struct pt_regs *regs,
> + unsigned long error_code),
> +
> + TP_ARGS(address, regs, error_code),
> +
> + TP_STRUCT__entry(
> + __field( unsigned long, address )
> + __field( struct pt_regs *, regs )
> + __field( unsigned long, error_code )
> + ),
> +
> + TP_fast_assign(
> + __entry->address = address;
> + __entry->regs = regs;
> + __entry->error_code = error_code;
> + ),
> +
> + TP_printk("address=0x%lx regs=0x%p error_code=0x%lx",
> + __entry->address, __entry->regs, __entry->error_code) );

Printing the regs pointer is rather useless. This is specific for x86,
why not print the ip of where it happened and the faulting address
itself? Note, you only need to change the TP_printk() to do that. For
efficiency reasons, only pass in regs.

-- Steve

2013-08-22 14:45:53

by Seiji Aguchi

[permalink] [raw]
Subject: RE: [RFC][PATCH] Introduce page fault tracepoint

> Printing the regs pointer is rather useless. This is specific for x86,
> why not print the ip of where it happened and the faulting address
> itself?

Thank you for reviewing.
I will change the regs pointer to ip.

> Note, you only need to change the TP_printk() to do that. For
> efficiency reasons, only pass in regs.

OK. Will change the TP_printk().

Seiji