Changes from v2:
- Print entry->ip instead of entry->regs->ip to avoid kernel crash.
- Use %pf instead of 0x%lx to print address and ip.
This patch introduces page fault tracepoints to the x86 architecture
by switching the IDT.
[Use case of page fault events]
Two events, one for user space and one for kernel space, are introduced at
the beginning of the page fault handler.
- User space event
A page fault event for user space has been requested, as shown below.
http://marc.info/?l=linux-mm&m=136807959830182&w=2
http://marc.info/?l=linux-mm&m=136807959130175&w=2
- Kernel space event:
Page fault overhead in kernel space can be measured by enabling this event.
[Creating IDT]
The IDT entries are created as follows.
- Introduce set_intr_gate_raw() to register only the non-trace handler in the IDT.
This is used at boot time, when tracing is disabled.
- Make set_intr_gate() a macro so that it can register the trace handler in the
trace IDT and the non-trace handler in the normal IDT.
Signed-off-by: Seiji Aguchi <[email protected]>
---
arch/x86/include/asm/desc.h | 33 +++++++++++++++++----
arch/x86/include/asm/hw_irq.h | 14 ++++++++-
arch/x86/include/asm/trace/exceptions.h | 52 +++++++++++++++++++++++++++++++++
arch/x86/include/asm/traps.h | 22 ++++++++++++++
arch/x86/kernel/entry_32.S | 10 +++++++
arch/x86/kernel/entry_64.S | 13 ++++++++-
arch/x86/kernel/head64.c | 2 +-
arch/x86/kernel/irqinit.c | 2 +-
arch/x86/kernel/kvm.c | 2 +-
arch/x86/kernel/traps.c | 28 +++++++++---------
arch/x86/mm/Makefile | 2 ++
arch/x86/mm/fault.c | 22 ++++++++++++++
12 files changed, 178 insertions(+), 24 deletions(-)
create mode 100644 arch/x86/include/asm/trace/exceptions.h
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index b90e5df..c04302b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -327,10 +327,28 @@ static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
{
write_idt_entry(trace_idt_table, entry, gate);
}
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate_desc s;
+
+ pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+ /*
+ * does not need to be atomic because it is only done once at
+ * setup time
+ */
+ write_trace_idt_entry(gate, &s);
+}
#else
static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
{
}
+
+static inline void _trace_set_gate(int gate, unsigned type, void *addr,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+}
#endif
static inline void _set_gate(int gate, unsigned type, void *addr,
@@ -353,12 +371,20 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
* Pentium F0 0F bugfix can have resulted in the mapped
* IDT being write-protected.
*/
-static inline void set_intr_gate(unsigned int n, void *addr)
+static inline void set_intr_gate_raw(unsigned int n, void *addr)
{
BUG_ON((unsigned)n > 0xFF);
_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}
+#define set_intr_gate(n, addr) \
+ do { \
+ BUG_ON((unsigned)n > 0xFF); \
+ _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); \
+ _trace_set_gate(n, GATE_INTERRUPT, trace_##addr, 0, 0, \
+ __KERNEL_CS); \
+ } while (0)
+
extern int first_system_vector;
/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
extern unsigned long used_vectors[];
@@ -395,10 +421,7 @@ static inline void trace_set_intr_gate(unsigned int gate, void *addr)
#define __trace_alloc_intr_gate(n, addr)
#endif
-static inline void __alloc_intr_gate(unsigned int n, void *addr)
-{
- set_intr_gate(n, addr);
-}
+#define __alloc_intr_gate(n, addr) set_intr_gate(n, addr)
#define alloc_intr_gate(n, addr) \
do { \
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 92b3bae..c856e69 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -89,10 +89,22 @@ extern void trace_reschedule_interrupt(void);
extern void trace_threshold_interrupt(void);
extern void trace_call_function_interrupt(void);
extern void trace_call_function_single_interrupt(void);
+#else /* CONFIG_TRACING */
+#define trace_apic_timer_interrupt apic_timer_interrupt
+#define trace_x86_platform_ipi x86_platform_ipi
+#define trace_error_interrupt error_interrupt
+#define trace_irq_work_interrupt irq_work_interrupt
+#define trace_spurious_interrupt spurious_interrupt
+#define trace_thermal_interrupt thermal_interrupt
+#define trace_reschedule_interrupt reschedule_interrupt
+#define trace_threshold_interrupt threshold_interrupt
+#define trace_call_function_interrupt call_function_interrupt
+#define trace_call_function_single_interrupt call_function_single_interrupt
+#endif
+
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
-#endif /* CONFIG_TRACING */
/* IOAPIC */
#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
new file mode 100644
index 0000000..86540c0
--- /dev/null
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -0,0 +1,52 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM exceptions
+
+#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_FAULT_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+DECLARE_EVENT_CLASS(x86_exceptions,
+
+ TP_PROTO(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code),
+
+ TP_ARGS(address, regs, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, address )
+ __field( unsigned long, ip )
+ __field( unsigned long, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->ip = regs->ip;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address=%pf ip=%pf error_code=0x%lx",
+ (void *)__entry->address, (void *)__entry->ip,
+ __entry->error_code) );
+
+#define DEFINE_PAGE_FAULT_EVENT(name) \
+DEFINE_EVENT_FN(x86_exceptions, name, \
+ TP_PROTO(unsigned long address, struct pt_regs *regs, \
+ unsigned long error_code), \
+ TP_ARGS(address, regs, error_code), \
+ trace_irq_vector_regfunc, \
+ trace_irq_vector_unregfunc);
+
+DEFINE_PAGE_FAULT_EVENT(user_page_fault);
+DEFINE_PAGE_FAULT_EVENT(kernel_page_fault);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE exceptions
+#endif /* _TRACE_PAGE_FAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 7036cb6..a400a22 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -37,6 +37,25 @@ asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);
+#ifdef CONFIG_TRACING
+asmlinkage void trace_page_fault(void);
+#else
+#define trace_page_fault page_fault
+#endif
+#define trace_divide_error divide_error
+#define trace_bounds bounds
+#define trace_invalid_op invalid_op
+#define trace_device_not_available device_not_available
+#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
+#define trace_invalid_TSS invalid_TSS
+#define trace_segment_not_present segment_not_present
+#define trace_general_protection general_protection
+#define trace_spurious_interrupt_bug spurious_interrupt_bug
+#define trace_coprocessor_error coprocessor_error
+#define trace_alignment_check alignment_check
+#define trace_simd_coprocessor_error simd_coprocessor_error
+#define trace_async_page_fault async_page_fault
+
dotraplinkage void do_divide_error(struct pt_regs *, long);
dotraplinkage void do_debug(struct pt_regs *, long);
dotraplinkage void do_nmi(struct pt_regs *, long);
@@ -55,6 +74,9 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+#ifdef CONFIG_TRACING
+dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#endif
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2cfbc3a..c9eb4e2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1244,6 +1244,16 @@ return_to_handler:
*/
.pushsection .kprobes.text, "ax"
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ RING0_EC_FRAME
+ ASM_CLAC
+ pushl_cfi $trace_do_page_fault
+ jmp error_code
+ CFI_ENDPROC
+END(trace_page_fault)
+#endif
+
ENTRY(page_fault)
RING0_EC_FRAME
ASM_CLAC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..5136404 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1295,6 +1295,17 @@ ENTRY(\sym)
END(\sym)
.endm
+#ifdef CONFIG_TRACING
+.macro trace_errorentry sym do_sym
+errorentry trace(\sym) trace(\do_sym)
+errorentry \sym \do_sym
+.endm
+#else
+.macro trace_errorentry sym do_sym
+errorentry \sym \do_sym
+.endm
+#endif
+
/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
@@ -1497,7 +1508,7 @@ zeroentry xen_int3 do_int3
errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
+trace_errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1be8e43..aebb2bf 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -162,7 +162,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, &early_idt_handlers[i]);
+ set_intr_gate_raw(i, &early_idt_handlers[i]);
load_idt((const struct desc_ptr *)&idt_descr);
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc..2ca2354 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -206,7 +206,7 @@ void __init native_init_IRQ(void)
i = FIRST_EXTERNAL_VECTOR;
for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate_raw(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
if (!acpi_ioapic && !of_ioapic)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 697b93a..ba202ee 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -464,7 +464,7 @@ static struct notifier_block kvm_cpu_notifier = {
static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, &async_page_fault);
+ set_intr_gate(14, async_page_fault);
}
void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b..1c9d0ad 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -713,7 +713,7 @@ void __init early_trap_init(void)
/* int3 can be called from all */
set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
load_idt(&idt_descr);
}
@@ -721,7 +721,7 @@ void __init early_trap_init(void)
void __init early_trap_pf_init(void)
{
#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
}
@@ -737,30 +737,30 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
- set_intr_gate(X86_TRAP_DE, &divide_error);
+ set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, &bounds);
- set_intr_gate(X86_TRAP_UD, &invalid_op);
- set_intr_gate(X86_TRAP_NM, &device_not_available);
+ set_intr_gate(X86_TRAP_BR, bounds);
+ set_intr_gate(X86_TRAP_UD, invalid_op);
+ set_intr_gate(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
- set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, &invalid_TSS);
- set_intr_gate(X86_TRAP_NP, &segment_not_present);
+ set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+ set_intr_gate(X86_TRAP_TS, invalid_TSS);
+ set_intr_gate(X86_TRAP_NP, segment_not_present);
set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(X86_TRAP_GP, &general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, &coprocessor_error);
- set_intr_gate(X86_TRAP_AC, &alignment_check);
+ set_intr_gate(X86_TRAP_GP, general_protection);
+ set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+ set_intr_gate(X86_TRAP_MF, coprocessor_error);
+ set_intr_gate(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
- set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
+ set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 23d8e5f..6a19ad9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 654be4a..f515154 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,9 @@
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_START */
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
/*
* Page fault error code bits:
*
@@ -1230,3 +1233,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
__do_page_fault(regs, error_code);
exception_exit(prev_state);
}
+
+static void trace_page_fault_entries(struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_user_page_fault(read_cr2(), regs, error_code);
+ else
+ trace_kernel_page_fault(read_cr2(), regs, error_code);
+}
+
+dotraplinkage void __kprobes
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ enum ctx_state prev_state;
+ prev_state = exception_enter();
+ trace_page_fault_entries(regs, error_code);
+ __do_page_fault(regs, error_code);
+ exception_exit(prev_state);
+}
--
1.8.2.1
Peter,
Any comment?
Seiji
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf Of Seiji Aguchi
> Sent: Monday, September 09, 2013 5:56 PM
> To: [email protected]; [email protected]
> Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; Tomoki Sekiyama
> Subject: [PATCH v3] Introduce page fault tracepoint
>
> Change from v2
> - Print entry->ip instead of entry->regs->ip to avoid kernel crash.
> - Use %pf instead of 0x%lx to print address and ip.
>
> This patch introduces page fault tracepoints to x86 architecture
> by switching IDT.
>
> [Use case of page fault events]
>
> Two events, for user and kernel spaces, are introduced at the beginning of
> page fault handler.
>
> - User space event
> There is a request of page fault event for user space as below.
>
> http://marc.info/?l=linux-mm&m=136807959830182&w=2
> http://marc.info/?l=linux-mm&m=136807959130175&w=2
>
> - Kernel space event:
> Overhead in kernel space is measurable by enabling it.
>
> [Creating IDT]
>
> A way to create IDT is as below.
>
> - Introduce set_intr_gate_raw() to register just non-trace handler to IDT.
> This is used at boot time which tracing is disabled.
> - Make set_intr_gate() macro so that it can register trace handler to
> trace IDT and non-trace handler to normal IDT.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
> ---
> arch/x86/include/asm/desc.h | 33 +++++++++++++++++----
> arch/x86/include/asm/hw_irq.h | 14 ++++++++-
> arch/x86/include/asm/trace/exceptions.h | 52 +++++++++++++++++++++++++++++++++
> arch/x86/include/asm/traps.h | 22 ++++++++++++++
> arch/x86/kernel/entry_32.S | 10 +++++++
> arch/x86/kernel/entry_64.S | 13 ++++++++-
> arch/x86/kernel/head64.c | 2 +-
> arch/x86/kernel/irqinit.c | 2 +-
> arch/x86/kernel/kvm.c | 2 +-
> arch/x86/kernel/traps.c | 28 +++++++++---------
> arch/x86/mm/Makefile | 2 ++
> arch/x86/mm/fault.c | 22 ++++++++++++++
> 12 files changed, 178 insertions(+), 24 deletions(-)
> create mode 100644 arch/x86/include/asm/trace/exceptions.h
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index b90e5df..c04302b 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -327,10 +327,28 @@ static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
> {
> write_idt_entry(trace_idt_table, entry, gate);
> }
> +
> +static inline void _trace_set_gate(int gate, unsigned type, void *addr,
> + unsigned dpl, unsigned ist, unsigned seg)
> +{
> + gate_desc s;
> +
> + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
> + /*
> + * does not need to be atomic because it is only done once at
> + * setup time
> + */
> + write_trace_idt_entry(gate, &s);
> +}
> #else
> static inline void write_trace_idt_entry(int entry, const gate_desc *gate)
> {
> }
> +
> +static inline void _trace_set_gate(int gate, unsigned type, void *addr,
> + unsigned dpl, unsigned ist, unsigned seg)
> +{
> +}
> #endif
>
> static inline void _set_gate(int gate, unsigned type, void *addr,
> @@ -353,12 +371,20 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
> * Pentium F0 0F bugfix can have resulted in the mapped
> * IDT being write-protected.
> */
> -static inline void set_intr_gate(unsigned int n, void *addr)
> +static inline void set_intr_gate_raw(unsigned int n, void *addr)
> {
> BUG_ON((unsigned)n > 0xFF);
> _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
> }
>
> +#define set_intr_gate(n, addr) \
> + do { \
> + BUG_ON((unsigned)n > 0xFF); \
> + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); \
> + _trace_set_gate(n, GATE_INTERRUPT, trace_##addr, 0, 0, \
> + __KERNEL_CS); \
> + } while (0)
> +
> extern int first_system_vector;
> /* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
> extern unsigned long used_vectors[];
> @@ -395,10 +421,7 @@ static inline void trace_set_intr_gate(unsigned int gate, void *addr)
> #define __trace_alloc_intr_gate(n, addr)
> #endif
>
> -static inline void __alloc_intr_gate(unsigned int n, void *addr)
> -{
> - set_intr_gate(n, addr);
> -}
> +#define __alloc_intr_gate(n, addr) set_intr_gate(n, addr)
>
> #define alloc_intr_gate(n, addr) \
> do { \
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index 92b3bae..c856e69 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -89,10 +89,22 @@ extern void trace_reschedule_interrupt(void);
> extern void trace_threshold_interrupt(void);
> extern void trace_call_function_interrupt(void);
> extern void trace_call_function_single_interrupt(void);
> +#else /* CONFIG_TRACING */
> +#define trace_apic_timer_interrupt apic_timer_interrupt
> +#define trace_x86_platform_ipi x86_platform_ipi
> +#define trace_error_interrupt error_interrupt
> +#define trace_irq_work_interrupt irq_work_interrupt
> +#define trace_spurious_interrupt spurious_interrupt
> +#define trace_thermal_interrupt thermal_interrupt
> +#define trace_reschedule_interrupt reschedule_interrupt
> +#define trace_threshold_interrupt threshold_interrupt
> +#define trace_call_function_interrupt call_function_interrupt
> +#define trace_call_function_single_interrupt call_function_single_interrupt
> +#endif
> +
> #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
> #define trace_reboot_interrupt reboot_interrupt
> #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
> -#endif /* CONFIG_TRACING */
>
> /* IOAPIC */
> #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
> diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
> new file mode 100644
> index 0000000..86540c0
> --- /dev/null
> +++ b/arch/x86/include/asm/trace/exceptions.h
> @@ -0,0 +1,52 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM exceptions
> +
> +#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_PAGE_FAULT_H
> +
> +#include <linux/tracepoint.h>
> +
> +extern void trace_irq_vector_regfunc(void);
> +extern void trace_irq_vector_unregfunc(void);
> +
> +DECLARE_EVENT_CLASS(x86_exceptions,
> +
> + TP_PROTO(unsigned long address, struct pt_regs *regs,
> + unsigned long error_code),
> +
> + TP_ARGS(address, regs, error_code),
> +
> + TP_STRUCT__entry(
> + __field( unsigned long, address )
> + __field( unsigned long, ip )
> + __field( unsigned long, error_code )
> + ),
> +
> + TP_fast_assign(
> + __entry->address = address;
> + __entry->ip = regs->ip;
> + __entry->error_code = error_code;
> + ),
> +
> + TP_printk("address=%pf ip=%pf error_code=0x%lx",
> + (void *)__entry->address, (void *)__entry->ip,
> + __entry->error_code) );
> +
> +#define DEFINE_PAGE_FAULT_EVENT(name) \
> +DEFINE_EVENT_FN(x86_exceptions, name, \
> + TP_PROTO(unsigned long address, struct pt_regs *regs, \
> + unsigned long error_code), \
> + TP_ARGS(address, regs, error_code), \
> + trace_irq_vector_regfunc, \
> + trace_irq_vector_unregfunc);
> +
> +DEFINE_PAGE_FAULT_EVENT(user_page_fault);
> +DEFINE_PAGE_FAULT_EVENT(kernel_page_fault);
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#define TRACE_INCLUDE_FILE exceptions
> +#endif /* _TRACE_PAGE_FAULT_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 7036cb6..a400a22 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -37,6 +37,25 @@ asmlinkage void machine_check(void);
> #endif /* CONFIG_X86_MCE */
> asmlinkage void simd_coprocessor_error(void);
>
> +#ifdef CONFIG_TRACING
> +asmlinkage void trace_page_fault(void);
> +#else
> +#define trace_page_fault page_fault
> +#endif
> +#define trace_divide_error divide_error
> +#define trace_bounds bounds
> +#define trace_invalid_op invalid_op
> +#define trace_device_not_available device_not_available
> +#define trace_coprocessor_segment_overrun coprocessor_segment_overrun
> +#define trace_invalid_TSS invalid_TSS
> +#define trace_segment_not_present segment_not_present
> +#define trace_general_protection general_protection
> +#define trace_spurious_interrupt_bug spurious_interrupt_bug
> +#define trace_coprocessor_error coprocessor_error
> +#define trace_alignment_check alignment_check
> +#define trace_simd_coprocessor_error simd_coprocessor_error
> +#define trace_async_page_fault async_page_fault
> +
> dotraplinkage void do_divide_error(struct pt_regs *, long);
> dotraplinkage void do_debug(struct pt_regs *, long);
> dotraplinkage void do_nmi(struct pt_regs *, long);
> @@ -55,6 +74,9 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *);
> #endif
> dotraplinkage void do_general_protection(struct pt_regs *, long);
> dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
> +#ifdef CONFIG_TRACING
> +dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
> +#endif
> dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
> dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
> dotraplinkage void do_alignment_check(struct pt_regs *, long);
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index 2cfbc3a..c9eb4e2 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -1244,6 +1244,16 @@ return_to_handler:
> */
> .pushsection .kprobes.text, "ax"
>
> +#ifdef CONFIG_TRACING
> +ENTRY(trace_page_fault)
> + RING0_EC_FRAME
> + ASM_CLAC
> + pushl_cfi $trace_do_page_fault
> + jmp error_code
> + CFI_ENDPROC
> +END(trace_page_fault)
> +#endif
> +
> ENTRY(page_fault)
> RING0_EC_FRAME
> ASM_CLAC
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 1b69951..5136404 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1295,6 +1295,17 @@ ENTRY(\sym)
> END(\sym)
> .endm
>
> +#ifdef CONFIG_TRACING
> +.macro trace_errorentry sym do_sym
> +errorentry trace(\sym) trace(\do_sym)
> +errorentry \sym \do_sym
> +.endm
> +#else
> +.macro trace_errorentry sym do_sym
> +errorentry \sym \do_sym
> +.endm
> +#endif
> +
> /* error code is on the stack already */
> .macro paranoiderrorentry sym do_sym
> ENTRY(\sym)
> @@ -1497,7 +1508,7 @@ zeroentry xen_int3 do_int3
> errorentry xen_stack_segment do_stack_segment
> #endif
> errorentry general_protection do_general_protection
> -errorentry page_fault do_page_fault
> +trace_errorentry page_fault do_page_fault
> #ifdef CONFIG_KVM_GUEST
> errorentry async_page_fault do_async_page_fault
> #endif
> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
> index 1be8e43..aebb2bf 100644
> --- a/arch/x86/kernel/head64.c
> +++ b/arch/x86/kernel/head64.c
> @@ -162,7 +162,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
> clear_bss();
>
> for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
> - set_intr_gate(i, &early_idt_handlers[i]);
> + set_intr_gate_raw(i, &early_idt_handlers[i]);
> load_idt((const struct desc_ptr *)&idt_descr);
>
> copy_bootdata(__va(real_mode_data));
> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
> index a2a1fbc..2ca2354 100644
> --- a/arch/x86/kernel/irqinit.c
> +++ b/arch/x86/kernel/irqinit.c
> @@ -206,7 +206,7 @@ void __init native_init_IRQ(void)
> i = FIRST_EXTERNAL_VECTOR;
> for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
> /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
> - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
> + set_intr_gate_raw(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
> }
>
> if (!acpi_ioapic && !of_ioapic)
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 697b93a..ba202ee 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -464,7 +464,7 @@ static struct notifier_block kvm_cpu_notifier = {
>
> static void __init kvm_apf_trap_init(void)
> {
> - set_intr_gate(14, &async_page_fault);
> + set_intr_gate(14, async_page_fault);
> }
>
> void __init kvm_guest_init(void)
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 8c8093b..1c9d0ad 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -713,7 +713,7 @@ void __init early_trap_init(void)
> /* int3 can be called from all */
> set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
> #ifdef CONFIG_X86_32
> - set_intr_gate(X86_TRAP_PF, &page_fault);
> + set_intr_gate(X86_TRAP_PF, page_fault);
> #endif
> load_idt(&idt_descr);
> }
> @@ -721,7 +721,7 @@ void __init early_trap_init(void)
> void __init early_trap_pf_init(void)
> {
> #ifdef CONFIG_X86_64
> - set_intr_gate(X86_TRAP_PF, &page_fault);
> + set_intr_gate(X86_TRAP_PF, page_fault);
> #endif
> }
>
> @@ -737,30 +737,30 @@ void __init trap_init(void)
> early_iounmap(p, 4);
> #endif
>
> - set_intr_gate(X86_TRAP_DE, &divide_error);
> + set_intr_gate(X86_TRAP_DE, divide_error);
> set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
> /* int4 can be called from all */
> set_system_intr_gate(X86_TRAP_OF, &overflow);
> - set_intr_gate(X86_TRAP_BR, &bounds);
> - set_intr_gate(X86_TRAP_UD, &invalid_op);
> - set_intr_gate(X86_TRAP_NM, &device_not_available);
> + set_intr_gate(X86_TRAP_BR, bounds);
> + set_intr_gate(X86_TRAP_UD, invalid_op);
> + set_intr_gate(X86_TRAP_NM, device_not_available);
> #ifdef CONFIG_X86_32
> set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
> #else
> set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
> #endif
> - set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
> - set_intr_gate(X86_TRAP_TS, &invalid_TSS);
> - set_intr_gate(X86_TRAP_NP, &segment_not_present);
> + set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
> + set_intr_gate(X86_TRAP_TS, invalid_TSS);
> + set_intr_gate(X86_TRAP_NP, segment_not_present);
> set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
> - set_intr_gate(X86_TRAP_GP, &general_protection);
> - set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
> - set_intr_gate(X86_TRAP_MF, &coprocessor_error);
> - set_intr_gate(X86_TRAP_AC, &alignment_check);
> + set_intr_gate(X86_TRAP_GP, general_protection);
> + set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
> + set_intr_gate(X86_TRAP_MF, coprocessor_error);
> + set_intr_gate(X86_TRAP_AC, alignment_check);
> #ifdef CONFIG_X86_MCE
> set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
> #endif
> - set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
> + set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
>
> /* Reserve all the builtin and the syscall vector: */
> for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 23d8e5f..6a19ad9 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
> CFLAGS_physaddr.o := $(nostackp)
> CFLAGS_setup_nx.o := $(nostackp)
>
> +CFLAGS_fault.o := -I$(src)/../include/asm/trace
> +
> obj-$(CONFIG_X86_PAT) += pat_rbtree.o
> obj-$(CONFIG_SMP) += tlb.o
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 654be4a..f515154 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -20,6 +20,9 @@
> #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
> #include <asm/fixmap.h> /* VSYSCALL_START */
>
> +#define CREATE_TRACE_POINTS
> +#include <asm/trace/exceptions.h>
> +
> /*
> * Page fault error code bits:
> *
> @@ -1230,3 +1233,22 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
> __do_page_fault(regs, error_code);
> exception_exit(prev_state);
> }
> +
> +static void trace_page_fault_entries(struct pt_regs *regs,
> + unsigned long error_code)
> +{
> + if (user_mode(regs))
> + trace_user_page_fault(read_cr2(), regs, error_code);
> + else
> + trace_kernel_page_fault(read_cr2(), regs, error_code);
> +}
> +
> +dotraplinkage void __kprobes
> +trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
> +{
> + enum ctx_state prev_state;
> + prev_state = exception_enter();
> + trace_page_fault_entries(regs, error_code);
> + __do_page_fault(regs, error_code);
> + exception_exit(prev_state);
> +}
> --
> 1.8.2.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
On 09/09/2013 02:55 PM, Seiji Aguchi wrote:
> Change from v2
> - Print entry->ip instead of entry->regs->ip to avoid kernel crash.
> - Use %pf instead of 0x%lx to print address and ip.
>
> This patch introduces page fault tracepoints to x86 architecture
> by switching IDT.
>
> [Use case of page fault events]
>
> Two events, for user and kernel spaces, are introduced at the beginning of
> page fault handler.
>
> - User space event
> There is a request of page fault event for user space as below.
>
> http://marc.info/?l=linux-mm&m=136807959830182&w=2
> http://marc.info/?l=linux-mm&m=136807959130175&w=2
>
For permanence, please use links of the form:
http://lkml.kernel.org/r/message-id
(Yes, they currently point to marc.info, but can be redirected to point
to any archive.)
> - Kernel space event:
> Overhead in kernel space is measurable by enabling it.
>
> [Creating IDT]
>
> A way to create IDT is as below.
>
> - Introduce set_intr_gate_raw() to register just non-trace handler to IDT.
> This is used at boot time which tracing is disabled.
> - Make set_intr_gate() macro so that it can register trace handler to
> trace IDT and non-trace handler to normal IDT.
>
This is needlessly confusing, which is part of why reviewing this patch
took a lot more time than it should have.
Please break this patch into two: one which sets up the tracing IDT and
one to create the #PF tracepoint. I am assuming that there will be
more.
-hpa
Thank you for reviewing.
> > http://marc.info/?l=linux-mm&m=136807959830182&w=2
> > http://marc.info/?l=linux-mm&m=136807959130175&w=2
> >
>
> For permanence, please use links of the form:
>
> http://lkml.kernel.org/r/message-id
>
> (Yes, they currently point to marc.info, but can be redirected to point
> to any archive.)
I will fix it.
> This is needlessly confusing, which is apart of why reviewing this patch
> took a lot more time than it should.
>
> Please break this patch into two: one which sets up the tracing IDT and
> one to create the #PF tracepoint. The assumption is, I am assuming,
> there will be more.
OK. I will divide the patch into two or more parts to make the review easier.
Seiji
>
> -hpa
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?