Despite the current efforts to read CR2 before tracing happens there
still exist a number of possible holes:
idtentry page_fault do_page_fault has_error_code=1
call error_entry
TRACE_IRQS_OFF
call trace_hardirqs_off*
#PF // modifies CR2
CALL_enter_from_user_mode
__context_tracking_exit()
trace_user_exit(0)
#PF // modifies CR2
call do_page_fault
address = read_cr2(); /* whoopsie */
And similar for i386.
Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.
Reported-by: He Zhe <[email protected]>
Reported-by: Eiichi Tsukata <[email protected]>
Debugged-by: Steven Rostedt <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/entry/entry_32.S | 25 ++++++++++++++++++++++---
arch/x86/entry/entry_64.S | 35 ++++++++++++++++++-----------------
arch/x86/include/asm/kvm_para.h | 2 +-
arch/x86/include/asm/traps.h | 4 ++--
arch/x86/kernel/kvm.c | 8 ++++----
arch/x86/kernel/traps.c | 6 +-----
arch/x86/mm/fault.c | 28 ++++++++++------------------
7 files changed, 58 insertions(+), 50 deletions(-)
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
ENTRY(page_fault)
ASM_CLAC
- pushl $do_page_fault
- ALIGN
- jmp common_exception
+ pushl $0; /* %gs's slot on the stack */
+
+ SAVE_ALL switch_stacks=1 skip_gs=1
+
+ ENCODE_FRAME_POINTER
+ UNWIND_ESPFIX_STACK
+
+ /* fixup %gs */
+ GS_TO_REG %ecx
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+
+ GET_CR2_INTO(%ecx) # might clobber %eax
+
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+
+ TRACE_IRQS_OFF
+ movl %esp, %eax # pt_regs pointer
+ call do_page_fault
+ jmp ret_from_exception
END(page_fault)
common_exception:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -865,7 +865,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
.if \paranoid
call paranoid_entry
@@ -875,12 +875,21 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
.endif
UNWIND_HINT_REGS
- .if \paranoid
+ .if \read_cr2
+ GET_CR2_INTO(%rdx); /* can clobber %rax */
+ .endif
+
.if \shift_ist != -1
TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
.else
TRACE_IRQS_OFF
.endif
+
+ .if \paranoid == 0
+ testb $3, CS(%rsp)
+ jz .Lfrom_kernel_no_context_tracking_\@
+ CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
.endif
movq %rsp, %rdi /* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
* fresh stack. (This is for #DB, which has a nasty habit
* of recursing.)
* @create_gap: create a 6-word stack gap when coming from kernel mode.
+ * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
*
* idtentry generates an IDT stub that sets up a usable kernel context,
* creates struct pt_regs, and calls @do_sym. The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
* @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -985,7 +995,7 @@ ENTRY(\sym)
.Lfrom_usermode_no_gap_\@:
.endif
- idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
+ idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
.if \paranoid == 1
/*
@@ -994,7 +1004,7 @@ ENTRY(\sym)
* run in real process context if user_mode(regs).
*/
.Lfrom_usermode_switch_stack_\@:
- idtentry_part \do_sym, \has_error_code, 0
+ idtentry_part \do_sym, \has_error_code, \read_cr2, 0
.endif
_ASM_NOKPROBE(\sym)
@@ -1006,7 +1016,7 @@ idtentry overflow do_overflow has_er
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1179,10 +1189,10 @@ idtentry xenint3 do_int3 has_error_co
#endif
idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1
+idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
#endif
#ifdef CONFIG_X86_MCE
@@ -1337,18 +1347,9 @@ ENTRY(error_entry)
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
-
- /*
- * We need to tell lockdep that IRQs are off. We can't do this until
- * we fix gsbase, and we should do it before enter_from_user_mode
- * (which can take locks).
- */
- TRACE_IRQS_OFF
- CALL_enter_from_user_mode
ret
.Lerror_entry_done:
- TRACE_IRQS_OFF
ret
/*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, i
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct
dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
#ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code);
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address);
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
void __init trap_init(void);
#endif
dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
enum ctx_state prev_state;
switch (kvm_read_and_reset_pf_reason()) {
default:
- do_page_fault(regs, error_code);
+ do_page_fault(regs, error_code, address);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
prev_state = exception_enter();
- kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+ kvm_async_pf_task_wait((u32)address, !user_mode(regs));
exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
- kvm_async_pf_task_wake((u32)read_cr2());
+ kvm_async_pf_task_wake((u32)address);
rcu_irq_exit();
break;
}
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_o
#ifdef CONFIG_X86_64
/* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
-#ifdef CONFIG_VMAP_STACK
- unsigned long cr2;
-#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struc
* stack even if the actual trigger for the double fault was
* something else.
*/
- cr2 = read_cr2();
if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1513,7 +1513,7 @@ NOKPROBE_SYMBOL(do_user_addr_fault);
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
-static noinline void
+static __always_inline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
@@ -1528,35 +1528,27 @@ __do_page_fault(struct pt_regs *regs, un
else
do_user_addr_fault(regs, hw_error_code, address);
}
-NOKPROBE_SYMBOL(__do_page_fault);
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
+ if (!trace_pagefault_enabled())
+ return;
+
if (user_mode(regs))
trace_page_fault_user(address, regs, error_code);
else
trace_page_fault_kernel(address, regs, error_code);
}
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- unsigned long address = read_cr2(); /* Get the faulting address */
enum ctx_state prev_state;
prev_state = exception_enter();
- if (trace_pagefault_enabled())
- trace_page_fault_entries(address, regs, error_code);
-
+ trace_page_fault_entries(regs, error_code, address);
__do_page_fault(regs, error_code, address);
exception_exit(prev_state);
}
On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <[email protected]> wrote:
>
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
So this whole series disturbs me for the simple reason that I thought
tracing was supposed to save/restore cr2 and make it unnecessary to
worry about this in non-tracing code.
That is very much what the NMI code explicitly does. Why shouldn't all
the other tracing code do the same thing in case they can take page
faults?
So I don't think the patches are wrong per se, but this seems to solve
it at the wrong level.
Linus
> On Jul 4, 2019, at 7:18 PM, Linus Torvalds <[email protected]> wrote:
>
>> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <[email protected]> wrote:
>>
>> Despire the current efforts to read CR2 before tracing happens there
>> still exist a number of possible holes:
>
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
>
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
>
If nothing else, MOV to CR2 is architecturally serializing, so, unless there’s some fancy unwinding involved, this will be quite slow.
On Fri, Jul 5, 2019 at 12:16 PM Andy Lutomirski <[email protected]> wrote:
>
> If nothing else, MOV to CR2 is architecturally serializing, so, unless there’s some fancy unwinding involved, this will be quite slow.
That's why the NMI code does this:
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
so that it normally only does a read. Only if you actually took a page
fault will it restore cr2 to the old value (and if you took a page
fault the performance issues will be _there_, not in the "restore cr2"
part)
Linus
On Fri, Jul 05, 2019 at 11:18:51AM +0900, Linus Torvalds wrote:
> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <[email protected]> wrote:
> >
> > Despire the current efforts to read CR2 before tracing happens there
> > still exist a number of possible holes:
>
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
>
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
>
> So I don't think the patches are wrong per se, but this seems to solve
> it at the wrong level.
My thinking is that that results in far too many sites which we have to
fix and a possible fragility of interface. Invariably we'll get multiple
interfaces for the same thing, one which preserves CR2 and one which
doesn't -- in the name of performance. And then someone uses the wrong
one, and we're back where we started.
Conversely, this way we get to fix it in one place.
Also; all previous attempts at fixing this have been about pushing the
read_cr2() earlier; notably:
0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")
And I'm thinking that with exception of this patch, the rest are
worthwhile cleanups regardless.
Also; while looking at this, if we do continue with the C wrappers from
the very last patch, we can do horrible things like this on top and move
the read_cr2() back into C code.
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -826,7 +826,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
.if \paranoid
call paranoid_entry
@@ -836,10 +836,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
.endif
UNWIND_HINT_REGS
- .if \read_cr2
- GET_CR2_INTO(%rdx); /* can clobber %rax */
- .endif
-
.if \has_error_code
movq ORIG_RAX(%rsp), %rsi /* get error code */
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
@@ -885,7 +881,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
* fresh stack. (This is for #DB, which has a nasty habit
* of recursing.)
* @create_gap: create a 6-word stack gap when coming from kernel mode.
- * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
*
* idtentry generates an IDT stub that sets up a usable kernel context,
* creates struct pt_regs, and calls @do_sym. The stub has the following
@@ -910,7 +905,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
* @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -948,7 +943,7 @@ ENTRY(\sym)
.Lfrom_usermode_no_gap_\@:
.endif
- idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
+ idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
.if \paranoid == 1
/*
@@ -957,7 +952,7 @@ ENTRY(\sym)
* run in real process context if user_mode(regs).
*/
.Lfrom_usermode_switch_stack_\@:
- idtentry_part \do_sym, \has_error_code, \read_cr2, 0
+ idtentry_part \do_sym, \has_error_code, paranoid=0
.endif
_ASM_NOKPROBE(\sym)
@@ -969,7 +964,7 @@ idtentry overflow do_overflow has_er
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1142,10 +1137,10 @@ idtentry xenint3 do_int3 has_error_co
#endif
idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
+idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -22,20 +22,34 @@
#define CALL_enter_from_user_mode(_regs)
#endif
+#define __IDT_NR1 1
+#define __IDT_NR2 2
+#define __IDT_NR3 2
+
+#define IDT_NR(n) __IDT_NR##n
+
+#define __IDT_TRAP1(t1,a1)
+#define __IDT_TRAP2(t1,a1,t2,a2)
+#define __IDT_TRAP3(t1,a1,t2,a2,t3,a3) t3 a3 = read_cr2()
+
+#define IDT_TRAP(n,...) __IDT_TRAP##n(__VA_ARGS__)
+
#define IDTENTRYx(n, name, ...) \
static notrace void __idt_##name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)); \
NOKPROBE_SYMBOL(__idt_##name); \
- dotraplinkage notrace void name(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+ dotraplinkage notrace void name(__IDT_MAP(__IDT_NR(n), __IDT_DECL, __VA_ARGS__)) \
{ \
__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+ __IDT_TRAP(n, __VA_ARGS__); \
trace_hardirqs_off(); \
CALL_enter_from_user_mode(regs); \
__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
} \
NOKPROBE_SYMBOL(name); \
- dotraplinkage notrace void name##_paranoid(__IDT_MAP(n, __IDT_DECL, __VA_ARGS__)) \
+ dotraplinkage notrace void name##_paranoid(__IDT_MAP(__IDT_NR(n), __IDT_DECL, __VA_ARGS__)) \
{ \
__IDT_MAP(n, __IDT_TEST, __VA_ARGS__); \
+ __IDT_TRAP(n, __VA_ARGS__); \
trace_hardirqs_off(); \
__idt_##name(__IDT_MAP(n, __IDT_ARGS, __VA_ARGS__)); \
} \
On 2019/07/05 11:18, Linus Torvalds wrote:
> On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <[email protected]> wrote:
>>
>> Despire the current efforts to read CR2 before tracing happens there
>> still exist a number of possible holes:
>
> So this whole series disturbs me for the simple reason that I thought
> tracing was supposed to save/restore cr2 and make it unnecessary to
> worry about this in non-tracing code.
>
> That is very much what the NMI code explicitly does. Why shouldn't all
> the other tracing code do the same thing in case they can take page
> faults?
>
> So I don't think the patches are wrong per se, but this seems to solve
> it at the wrong level.
>
> Linus
>
Steven previously tried to fix it by saving CR2 in TRACE_IRQS_OFF:
https://lore.kernel.org/lkml/[email protected]/
But hit the following WARNING:
https://lore.kernel.org/lkml/[email protected]/
I tried to find out the root cause of the WARNING, and found that it is
caused by touching trace point(TRACE_IRQS_OFF) before search_binary_handler()
at exeve.
To prevent userstack trace code from reading user stack before it becomes ready,
checking current->in_execve in stack_trace_save_user() can help Steven's approach,
though trace_sched_process_exec() is called before current->in_execve = 0 so it changes
current behavior.
The PoC code is as follows:
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 2abf27d7df6b..30fa6e1b7a87 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -116,10 +116,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
const struct pt_regs *regs)
{
const void __user *fp = (const void __user *)regs->bp;
+ unsigned long address;
if (!consume_entry(cookie, regs->ip, false))
return;
+ address = read_cr2();
while (1) {
struct stack_frame_user frame;
@@ -131,11 +133,14 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
break;
if (frame.ret_addr) {
if (!consume_entry(cookie, frame.ret_addr, false))
- return;
+ break;
}
if (fp == frame.next_fp)
break;
fp = frame.next_fp;
}
+
+ if (address != read_cr2())
+ write_cr2(address);
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 36139de0a3c4..489d33bb5d28 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -230,6 +230,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
/* Trace user stack if not a kernel thread */
if (!current->mm)
return 0;
+ /* current can reach some trace points before its stack is ready */
+ if (current->in_execve)
+ return 0;
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
return c.len;
On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <[email protected]> wrote:
>
> Also; all previous attempts at fixing this have been about pushing the
> read_cr2() earlier; notably:
>
> 0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
> d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")
I think both of those are because people - again - felt like tracing
can validly corrupt CPU state, and then they fix up the symptoms
rather than the cause.
Which I disagree with.
> And I'm thinking that with exception of this patch, the rest are
> worthwhile cleanups regardless.
I don't have any issues with the patches themselves, I agree that they
are probably good on their own.
I *do* have issues with the "tracing can change CPU state so we need
to deal with it" model, though. I think that mode of thinking is
wrong. I don't believe tracing should ever change core CPU state and
that be considered ok.
> Also; while looking at this, if we do continue with the C wrappers from
> the very last patch, we can do horrible things like this on top and move
> the read_cr2() back into C code.
Again, I don't think that is the problem. I think it's a much more
fundamental problem in thinking that core code should be changed
because tracing is broken garbage and didn't do things right.
I see Eiichi's patch, and it makes me go "that looks better" - simply
because it fixes the fundamental issue, rather than working around the
symptoms.
Linus
On Sat, 6 Jul 2019 14:41:22 -0700
Linus Torvalds <[email protected]> wrote:
> On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <[email protected]> wrote:
> >
> > Also; all previous attempts at fixing this have been about pushing the
> > read_cr2() earlier; notably:
> >
> > 0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
> > d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")
>
> I think both of those are because people - again - felt like tracing
> can validly corrupt CPU state, and then they fix up the symptoms
> rather than the cause.
>
> Which I disagree with.
>
> > And I'm thinking that with exception of this patch, the rest are
> > worthwhile cleanups regardless.
>
> I don't have any issues with the patches themselves, I agree that they
> are probably good on their own.
>
> I *do* have issues with the "tracing can change CPU state so we need
> to deal with it" model, though. I think that mode of thinking is
> wrong. I don't believe tracing should ever change core CPU state and
> that be considered ok.
>
> > Also; while looking at this, if we do continue with the C wrappers from
> > the very last patch, we can do horrible things like this on top and move
> > the read_cr2() back into C code.
>
> Again, I don't think that is the problem. I think it's a much more
> fundamental problem in thinking that core code should be changed
> because tracing is broken garbage and didn't do things right.
>
> I see Eiichi's patch, and it makes me go "that looks better" - simply
> because it fixes the fundamental issue, rather than working around the
> symptoms.
>
We also have to deal with reading vmalloc'd data as that can fault too.
The perf ring buffer IIUC is vmalloc, so if perf records in one of
these locations, then the reading of the vmalloc area has a potential
to fault corrupting the CR2 register as well. Or have we changed
vmalloc to no longer fault?
-- Steve
On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <[email protected]> wrote:
>
> We also have to deal with reading vmalloc'd data as that can fault too.
Ahh, that may be a better reason for PeterZ's patches and reading cr2
very early from asm code than the stack trace case. It's why the page
fault handler delayed enabling interrupts, after all (which is one big
reason NMI was special).
Linus
On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <[email protected]> wrote:
>
> On Sat, 6 Jul 2019 14:41:22 -0700
> Linus Torvalds <[email protected]> wrote:
>
> > On Fri, Jul 5, 2019 at 6:50 AM Peter Zijlstra <[email protected]> wrote:
> > >
> > > Also; all previous attempts at fixing this have been about pushing the
> > > read_cr2() earlier; notably:
> > >
> > > 0ac09f9f8cd1 ("x86, trace: Fix CR2 corruption when tracing page faults")
> > > d4078e232267 ("x86, trace: Further robustify CR2 handling vs tracing")
> >
> > I think both of those are because people - again - felt like tracing
> > can validly corrupt CPU state, and then they fix up the symptoms
> > rather than the cause.
> >
> > Which I disagree with.
> >
> > > And I'm thinking that with exception of this patch, the rest are
> > > worthwhile cleanups regardless.
> >
> > I don't have any issues with the patches themselves, I agree that they
> > are probably good on their own.
> >
> > I *do* have issues with the "tracing can change CPU state so we need
> > to deal with it" model, though. I think that mode of thinking is
> > wrong. I don't believe tracing should ever change core CPU state and
> > that be considered ok.
> >
> > > Also; while looking at this, if we do continue with the C wrappers from
> > > the very last patch, we can do horrible things like this on top and move
> > > the read_cr2() back into C code.
> >
> > Again, I don't think that is the problem. I think it's a much more
> > fundamental problem in thinking that core code should be changed
> > because tracing is broken garbage and didn't do things right.
> >
> > I see Eiichi's patch, and it makes me go "that looks better" - simply
> > because it fixes the fundamental issue, rather than working around the
> > symptoms.
> >
>
> We also have to deal with reading vmalloc'd data as that can fault too.
> The perf ring buffer IIUC is vmalloc, so if perf records in one of
> these locations, then the reading of the vmalloc area has a potential
> to fault corrupting the CR2 register as well. Or have we changed
> vmalloc to no longer fault?
>
What context is that happening in? If the tracepoint-saves-CR2 patch
is in, then it should be fine if it nests inside of tracing, right?
And NMI needs to save and restore CR2 no matter what, since it can
happen before we can possibly save CR2. For that matter, MCE should
save and restore CR2 as well.
All that being said, I do like the idea of moving all this crud (IRQ
tracing, CR2 handling, and context tracking) into C. I haven't had
time to review that part of Peter's series yet, but I should get to it
soon.
On Sat, Jul 6, 2019 at 3:41 PM Linus Torvalds
<[email protected]> wrote:
>
> On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <[email protected]> wrote:
> >
> > We also have to deal with reading vmalloc'd data as that can fault too.
>
> Ahh, that may be a better reason for PeterZ's patches and reading cr2
> very early from asm code than the stack trace case.
Hmm. Another alternative might be to simply just make our vmalloc page
fault handling more robust.
Right now, if we take a vmalloc page fault in an inconvenient spot, it
is fatal because it corrupts the cr2 in the outer context.
However, it doesn't *need* to be fatal. Who cares if the outer context
cr2 gets corrupted? We probably *shouldn't* care - it's an odd and
unusual case, and the outer context could just handle the wrong
vmalloc-address cr2 fine (it's going to be a no-op, since the inner
page fault will have handled it already), return, and then re-fault.
The only reason it's fatal right now is that we care much too deeply about
(a) the error code
(b) the pt_regs state
when we handle vmalloc faults.
So one option is that we simply handle the vmalloc faults _without_
caring about the error code and the pt_regs state, because even if the
error code or the pt_regs implies that the fault comes from user
space, the cr2 value might be due to a vmalloc fault from the inner
kernel page fault that corrupted cr2.
Right now vmalloc faults are already special and need to be handled
without holding any locks etc. We'd just make them even more special,
and say that we might have a vmalloc area fault from any context.
IOW, something like the attached patch would make nesting vmalloc
faults harmless. Sure, we'll do the "vmalloc fault" twice, and return
and re-do the original page fault, but this is a very unusual case, so
from a performance angle we don't really care.
But I guess the "read cr2 early" is fine too..
Linus
> On Jul 6, 2019, at 6:08 PM, Linus Torvalds <[email protected]> wrote:
>
> On Sat, Jul 6, 2019 at 3:41 PM Linus Torvalds
> <[email protected]> wrote:
>>
>>> On Sat, Jul 6, 2019 at 3:27 PM Steven Rostedt <[email protected]> wrote:
>>>
>>> We also have to deal with reading vmalloc'd data as that can fault too.
>>
>> Ahh, that may be a better reason for PeterZ's patches and reading cr2
>> very early from asm code than the stack trace case.
>
> Hmm. Another alternative might be to simply just make our vmalloc page
> fault handling more robust.
>
> Right now, if we take a vmalloc page fault in an inconvenient spot, it
> is fatal because it corrupts the cr2 in the outer context.
>
> However, it doesn't *need* to be fatal. Who cares if the outer context
> cr2 gets corrupted? We probably *shouldn't* care - it's an odd and
> unusual case, and the outer context could just handle the wrong
> vmalloc-address cr2 fine (it's going to be a no-op, since the inner
> page fault will have handled it already), return, and then re-fault.
>
> The only reason it's fatal right now is that we care much too deeply about
>
> (a) the error code
> (b) the pt_regs state
>
> when we handle vmalloc faults.
>
> So one option is that we simply handle the vmalloc faults _without_
> caring about the error code and the pt_regs state, because even if the
> error code or the pt_regs implies that the fault comes from user
> space, the cr2 value might be due to a vmalloc fault from the inner
> kernel page fault that corrupted cr2.
>
> Right now vmalloc faults are already special and need to be handled
> without holding any locks etc. We'd just make them even more special,
> and say that we might have a vmalloc area fault from any context.
>
> IOW, something like the attached patch would make nesting vmalloc
> faults harmless. Sure, we'll do the "vmalloc fault" twice, and return
> and re-do the original page fault, but this is a very unusual case, so
> from a performance angle we don't really care.
Eww. I would like to be able to rely on fault info being correct. Also, your patch won’t do so well if the fault is from user mode, I think.
>
> But I guess the "read cr2 early" is fine too..
>
> Linus
> <patch.diff>
On 2019/07/07 7:27, Steven Rostedt wrote:
>
> We also have to deal with reading vmalloc'd data as that can fault too.
> The perf ring buffer IIUC is vmalloc, so if perf records in one of
> these locations, then the reading of the vmalloc area has a potential
> to fault corrupting the CR2 register as well. Or have we changed
> vmalloc to no longer fault?
>
> -- Steve
>
It seems that perf ring buffer does not normally use vmalloc.
It depends on CONFIG_PERF_USE_VMALLOC introduced by the following commit:
commit 906010b2134e14a2e377decbadd357b3d0ab9c6a
Author: Peter Zijlstra <[email protected]>
Date: Mon Sep 21 16:08:49 2009 +0200
perf_event: Provide vmalloc() based mmap() backing
Some architectures such as Sparc, ARM and MIPS (basically
everything with flush_dcache_page()) need to deal with dcache
aliases by carefully placing pages in both kernel and user maps.
These architectures typically have to use vmalloc_user() for this.
However, on other architectures, vmalloc() is not needed and has
the downsides of being more restricted and slower than regular
allocations.
Signed-off-by: Peter Zijlstra <[email protected]>
Acked-by: David Miller <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Paul Mackerras <[email protected]>
LKML-Reference: <1254830228.21044.272.camel@laptop>
Signed-off-by: Ingo Molnar <[email protected]>
On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <[email protected]> wrote:
>
> Despite the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
>
> idtentry page_fault do_page_fault has_error_code=1
> call error_entry
> TRACE_IRQS_OFF
> call trace_hardirqs_off*
> #PF // modifies CR2
>
> CALL_enter_from_user_mode
> __context_tracking_exit()
> trace_user_exit(0)
> #PF // modifies CR2
>
> call do_page_fault
> address = read_cr2(); /* whoopsie */
>
> And similar for i386.
>
> Fix it by pulling the CR2 read into the entry code, before any of that
> stuff gets a chance to run and ruin things.
Reviewed-by: Andy Lutomirski <[email protected]>
Subject to the discussion as to whether this is the right approach at all.
On Sun, Jul 7, 2019 at 8:10 AM Andy Lutomirski <[email protected]> wrote:
>
> On Thu, Jul 4, 2019 at 1:03 PM Peter Zijlstra <[email protected]> wrote:
> >
> > Despire the current efforts to read CR2 before tracing happens there
> > still exist a number of possible holes:
> >
> > idtentry page_fault do_page_fault has_error_code=1
> > call error_entry
> > TRACE_IRQS_OFF
> > call trace_hardirqs_off*
> > #PF // modifies CR2
> >
> > CALL_enter_from_user_mode
> > __context_tracking_exit()
> > trace_user_exit(0)
> > #PF // modifies CR2
> >
> > call do_page_fault
> > address = read_cr2(); /* whoopsie */
> >
> > And similar for i386.
> >
> > Fix it by pulling the CR2 read into the entry code, before any of that
> > stuff gets a chance to run and ruin things.
>
> Reviewed-by: Andy Lutomirski <[email protected]>
>
> Subject to the discussion as to whether this is the right approach at all.
FWIW, I'm leaning toward suggesting that we apply the trivial tracing
fix and backport *that*. Then, in -tip, we could revert it and apply
this patch instead.
On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <[email protected]> wrote:
>
> FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> fix and backport *that*. Then, in -tip, we could revert it and apply
> this patch instead.
You don't have to have the same fix in stable as in -tip.
It's fine to send something to stable that says "Fixed differently by
commit XYZ upstream". The main thing is to make sure that stable
doesn't have fixes that then get lost upstream (which we used to have
long long ago).
Linus
On Sat, Jul 06, 2019 at 08:07:22PM +0900, Eiichi Tsukata wrote:
>
>
> On 2019/07/05 11:18, Linus Torvalds wrote:
> > On Fri, Jul 5, 2019 at 5:03 AM Peter Zijlstra <[email protected]> wrote:
> >>
> >> Despire the current efforts to read CR2 before tracing happens there
> >> still exist a number of possible holes:
> >
> > So this whole series disturbs me for the simple reason that I thought
> > tracing was supposed to save/restore cr2 and make it unnecessary to
> > worry about this in non-tracing code.
> >
> > That is very much what the NMI code explicitly does. Why shouldn't all
> > the other tracing code do the same thing in case they can take page
> > faults?
> >
> > So I don't think the patches are wrong per se, but this seems to solve
> > it at the wrong level.
This solves it all at one site. If we're going to sprinkle CR2
save/restore over 'all' sites we're bound to miss some and we'll be
playing catch-up forever :/
> Steven previously tried to fix it by saving CR2 in TRACE_IRQS_OFF:
> https://lore.kernel.org/lkml/[email protected]/
That misses the context tracking tracepoint, which is also before we
load CR2.
> To prevent userstack trace code from reading user stack before it becomes ready,
> checking current->in_execve in stack_trace_save_user() can help Steven's approach,
> though trace_sched_process_exec() is called before current->in_execve = 0 so it changes
> current behavior.
>
> The PoC code is as follows:
>
> diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
> index 2abf27d7df6b..30fa6e1b7a87 100644
> --- a/arch/x86/kernel/stacktrace.c
> +++ b/arch/x86/kernel/stacktrace.c
> @@ -116,10 +116,12 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> const struct pt_regs *regs)
> {
> const void __user *fp = (const void __user *)regs->bp;
> + unsigned long address;
>
> if (!consume_entry(cookie, regs->ip, false))
> return;
>
> + address = read_cr2();
> while (1) {
> struct stack_frame_user frame;
>
> @@ -131,11 +133,14 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
> break;
> if (frame.ret_addr) {
> if (!consume_entry(cookie, frame.ret_addr, false))
> - return;
> + break;
> }
> if (fp == frame.next_fp)
> break;
> fp = frame.next_fp;
> }
> +
> + if (address != read_cr2())
> + write_cr2(address);
> }
>
And this misses the perf unwinder, which we can reach if we attach perf
to the tracepoint. We can most likely also do user accesses from
kprobes, which we can similarly attach to tracepoints, and there's eBPF,
and who knows what else...
Do we really want to go put CR2 save/restore around every single thing
that can cause a fault (any fault) and is reachable from a tracepoint?
That's going to be a long list of things ... :/
Or are we going to put the CR2 save/restore on every single tracepoint?
But then we also need it on the mcount/fentry stubs and we again have
multiple places.
Whereas if we stick it in the entry path, like I proposed, we fix it in
one location and we're done.
On 2019/07/08 16:48, Peter Zijlstra wrote:
...
>
> Or are we going to put the CR2 save/restore on every single tracepoint?
> But then we also need it on the mcount/fentry stubs and we again have
> multiple places.
>
> Whereas if we stick it in the entry path, like I proposed, we fix it in
> one location and we're done.
>
Thanks to your detailed comments and the discussion, I've come to realize
that your solution "save CR2 early in the entry" is the simplest and reasonable.
By the way, is there a possibility of the WARNING (#GP in execve(2)) which Steven
previously hit? : https://lore.kernel.org/lkml/[email protected]/
Even if there were, it will *Not* be the bug introduced by this patch series,
but the bug revealed by this series.
Thanks,
Eiichi
On 2019/07/08 17:58, Eiichi Tsukata wrote:
>
> By the way, is there possibility that the WARNING(#GP in execve(2)) which Steven
> previously hit? : https://lore.kernel.org/lkml/[email protected]/
>
> Even if there were, it will *Not* be the bug introduced by this patch series,
> but the bug revealed by this series.
>
I reproduced with the kernel:
- https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=x86/core&id=057b48d544b384c47ed921f616128b802ab1fdc3
Reproducer:
# echo 1 > events/preemptirq/irq_disable/enable
# echo userstacktrace > trace_options
# service sshd restart
[ 20.338200] ------------[ cut here ]------------
[ 20.339471] General protection fault in user access. Non-canonical address?
[ 20.339488] WARNING: CPU: 1 PID: 1957 at arch/x86/mm/extable.c:126 ex_handler_uaccess+0x52/0x60
[ 20.342550] Modules linked in:
[ 20.343209] CPU: 1 PID: 1957 Comm: systemctl Tainted: G W 5.2.0-rc7+ #48
[ 20.344935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[ 20.346688] RIP: 0010:ex_handler_uaccess+0x52/0x60
[ 20.347667] Code: c4 08 b8 01 00 00 00 5b 5d c3 80 3d b6 8c 94 01 00 75 db 48 c7 c7 b8 e4 6f aa 48 89 75 f0 c6 05 ad
[ 20.351508] RSP: 0018:ffffb28940453688 EFLAGS: 00010082
[ 20.352707] RAX: 0000000000000000 RBX: ffffffffaa2023c0 RCX: 0000000000000001
[ 20.354478] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffaa7dab8d
[ 20.355706] RBP: ffffb28940453698 R08: 0000000000000000 R09: 0000000000000001
[ 20.356665] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb28940453728
[ 20.357594] R13: 0000000000000000 R14: 000000000000000d R15: 0000000000000000
[ 20.358876] FS: 00007fec9fa248c0(0000) GS:ffff921d3d800000(0000) knlGS:0000000000000000
[ 20.360573] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 20.361792] CR2: 0000000000000004 CR3: 0000000074d4e000 CR4: 00000000000006e0
[ 20.363300] Call Trace:
[ 20.363830] fixup_exception+0x4a/0x61
[ 20.364637] __idt_do_general_protection+0x65/0x1d0
[ 20.365684] do_general_protection+0x1e/0x30
[ 20.366654] general_protection+0x1e/0x30
[ 20.367513] RIP: 0010:arch_stack_walk_user+0x7a/0x100
[ 20.368583] Code: 00 01 1f 00 0f 85 8d 00 00 00 49 8b 87 98 16 00 00 48 83 e8 10 49 39 c6 77 32 41 83 87 d0 15 00 04
[ 20.372797] RSP: 0018:ffffb289404537d0 EFLAGS: 00010046
[ 20.374027] RAX: 0000000000000000 RBX: ffff921d383bcb00 RCX: 0074726174736572
[ 20.375674] RDX: ffff921d38393b84 RSI: 7265732e64687373 RDI: ffffb28940453818
[ 20.377179] RBP: ffffb28940453808 R08: 0000000000000001 R09: ffff921d3d169c00
[ 20.378681] R10: 0000000000000b60 R11: ffff921d38393b70 R12: ffffb28940453818
[ 20.380195] R13: ffffb28940453f58 R14: 0074726174736572 R15: ffff921d383bcb00
[ 20.381703] ? profile_setup.cold+0x9b/0x9b
[ 20.382604] stack_trace_save_user+0x60/0x7d
[ 20.383520] trace_buffer_unlock_commit_regs+0x17b/0x220
[ 20.384686] trace_event_buffer_commit+0x6b/0x200
[ 20.385741] trace_event_raw_event_preemptirq_template+0x7b/0xc0
[ 20.387067] ? get_page_from_freelist+0x10a/0x13b0
[ 20.388129] ? __alloc_pages_nodemask+0x178/0x380
[ 20.389132] trace_hardirqs_off+0xc6/0x100
[ 20.390006] get_page_from_freelist+0x10a/0x13b0
[ 20.390997] ? kvm_clock_read+0x18/0x30
[ 20.391813] ? sched_clock+0x9/0x10
[ 20.392647] __alloc_pages_nodemask+0x178/0x380
[ 20.393706] alloc_pages_current+0x87/0xe0
[ 20.394609] __page_cache_alloc+0xcd/0x110
[ 20.395491] __do_page_cache_readahead+0xa1/0x1a0
[ 20.396547] ondemand_readahead+0x220/0x540
[ 20.397486] page_cache_sync_readahead+0x35/0x50
[ 20.398511] generic_file_read_iter+0x8d8/0xbd0
[ 20.399335] ? __might_sleep+0x4b/0x80
[ 20.400110] ext4_file_read_iter+0x35/0x40
[ 20.400937] new_sync_read+0x10f/0x1a0
[ 20.401833] __vfs_read+0x29/0x40
[ 20.402608] vfs_read+0xc0/0x170
[ 20.403319] kernel_read+0x31/0x50
[ 20.404128] prepare_binprm+0x150/0x190
[ 20.404766] __do_execve_file.isra.0+0x4c0/0xaa0
[ 20.405559] __x64_sys_execve+0x39/0x50
[ 20.406173] do_syscall_64+0x55/0x1b0
[ 20.406762] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 20.407703] RIP: 0033:0x7fec9f0ee647
[ 20.408441] Code: ff 76 e0 f7 d8 64 41 89 01 eb d8 0f 1f 84 00 00 00 00 00 f7 d8 64 41 89 01 eb d7 0f 1f 84 00 00 08
[ 20.411636] RSP: 002b:00007ffe7f316228 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
[ 20.412686] RAX: ffffffffffffffda RBX: 00007fec9f8105a4 RCX: 00007fec9f0ee647
[ 20.414047] RDX: 00007ffe7f3167d8 RSI: 00007ffe7f316230 RDI: 00007fec9f73cea0
[ 20.415048] RBP: 00007ffe7f316480 R08: 0000000000000030 R09: 0000000000000001
[ 20.416076] R10: 00007ffe7f316498 R11: 0000000000000202 R12: 00007fec9f73cea0
[ 20.417556] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
[ 20.419032] irq event stamp: 832
[ 20.419493] hardirqs last enabled at (831): [<ffffffffa94b9f2e>] __find_get_block+0xde/0x360
[ 20.420649] hardirqs last disabled at (832): [<ffffffffa9305ad2>] rcu_irq_enter_irqson+0x12/0x30
[ 20.422179] softirqs last enabled at (436): [<ffffffffaa20037f>] __do_softirq+0x37f/0x472
[ 20.423286] softirqs last disabled at (291): [<ffffffffa927bce3>] irq_exit+0xb3/0xd0
[ 20.424296] ---[ end trace cdb84a67edcdc420 ]---
On 2019/07/08 18:42, Eiichi Tsukata wrote:
>
>
> On 2019/07/08 17:58, Eiichi Tsukata wrote:
>
>>
>> By the way, is there possibility that the WARNING(#GP in execve(2)) which Steven
>> previously hit? : https://lore.kernel.org/lkml/[email protected]/
>>
>> Even if there were, it will *Not* be the bug introduced by this patch series,
>> but the bug revealed by this series.
>>
>
> I reproduced with the kernel:
> - https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=x86/core&id=057b48d544b384c47ed921f616128b802ab1fdc3
>
> Reproducer:
>
> # echo 1 > events/preemptirq/irq_disable/enable
> # echo userstacktrace > trace_options
> # service sshd restart
>
> [ 20.338200] ------------[ cut here ]------------
> [ 20.339471] General protection fault in user access. Non-canonical address?
> [ 20.339488] WARNING: CPU: 1 PID: 1957 at arch/x86/mm/extable.c:126 ex_handler_uaccess+0x52/0x60
> [ 20.342550] Modules linked in:
> [ 20.343209] CPU: 1 PID: 1957 Comm: systemctl Tainted: G W 5.2.0-rc7+ #48
> [ 20.344935] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
> [ 20.346688] RIP: 0010:ex_handler_uaccess+0x52/0x60
> [ 20.347667] Code: c4 08 b8 01 00 00 00 5b 5d c3 80 3d b6 8c 94 01 00 75 db 48 c7 c7 b8 e4 6f aa 48 89 75 f0 c6 05 ad
> [ 20.351508] RSP: 0018:ffffb28940453688 EFLAGS: 00010082
> [ 20.352707] RAX: 0000000000000000 RBX: ffffffffaa2023c0 RCX: 0000000000000001
> [ 20.354478] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffaa7dab8d
> [ 20.355706] RBP: ffffb28940453698 R08: 0000000000000000 R09: 0000000000000001
> [ 20.356665] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb28940453728
> [ 20.357594] R13: 0000000000000000 R14: 000000000000000d R15: 0000000000000000
> [ 20.358876] FS: 00007fec9fa248c0(0000) GS:ffff921d3d800000(0000) knlGS:0000000000000000
> [ 20.360573] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 20.361792] CR2: 0000000000000004 CR3: 0000000074d4e000 CR4: 00000000000006e0
> [ 20.363300] Call Trace:
> [ 20.363830] fixup_exception+0x4a/0x61
> [ 20.364637] __idt_do_general_protection+0x65/0x1d0
> [ 20.365684] do_general_protection+0x1e/0x30
> [ 20.366654] general_protection+0x1e/0x30
> [ 20.367513] RIP: 0010:arch_stack_walk_user+0x7a/0x100
> [ 20.368583] Code: 00 01 1f 00 0f 85 8d 00 00 00 49 8b 87 98 16 00 00 48 83 e8 10 49 39 c6 77 32 41 83 87 d0 15 00 04
> [ 20.372797] RSP: 0018:ffffb289404537d0 EFLAGS: 00010046
> [ 20.374027] RAX: 0000000000000000 RBX: ffff921d383bcb00 RCX: 0074726174736572
> [ 20.375674] RDX: ffff921d38393b84 RSI: 7265732e64687373 RDI: ffffb28940453818
> [ 20.377179] RBP: ffffb28940453808 R08: 0000000000000001 R09: ffff921d3d169c00
> [ 20.378681] R10: 0000000000000b60 R11: ffff921d38393b70 R12: ffffb28940453818
> [ 20.380195] R13: ffffb28940453f58 R14: 0074726174736572 R15: ffff921d383bcb00
> [ 20.381703] ? profile_setup.cold+0x9b/0x9b
> [ 20.382604] stack_trace_save_user+0x60/0x7d
> [ 20.383520] trace_buffer_unlock_commit_regs+0x17b/0x220
> [ 20.384686] trace_event_buffer_commit+0x6b/0x200
> [ 20.385741] trace_event_raw_event_preemptirq_template+0x7b/0xc0
> [ 20.387067] ? get_page_from_freelist+0x10a/0x13b0
> [ 20.388129] ? __alloc_pages_nodemask+0x178/0x380
> [ 20.389132] trace_hardirqs_off+0xc6/0x100
> [ 20.390006] get_page_from_freelist+0x10a/0x13b0
> [ 20.390997] ? kvm_clock_read+0x18/0x30
> [ 20.391813] ? sched_clock+0x9/0x10
> [ 20.392647] __alloc_pages_nodemask+0x178/0x380
> [ 20.393706] alloc_pages_current+0x87/0xe0
> [ 20.394609] __page_cache_alloc+0xcd/0x110
> [ 20.395491] __do_page_cache_readahead+0xa1/0x1a0
> [ 20.396547] ondemand_readahead+0x220/0x540
> [ 20.397486] page_cache_sync_readahead+0x35/0x50
> [ 20.398511] generic_file_read_iter+0x8d8/0xbd0
> [ 20.399335] ? __might_sleep+0x4b/0x80
> [ 20.400110] ext4_file_read_iter+0x35/0x40
> [ 20.400937] new_sync_read+0x10f/0x1a0
> [ 20.401833] __vfs_read+0x29/0x40
> [ 20.402608] vfs_read+0xc0/0x170
> [ 20.403319] kernel_read+0x31/0x50
The cause was obvious as Linus already said in:
https://lore.kernel.org/lkml/CAHk-=whvxJjBBOmQSsrB-xHkc6xm8zGHsBRgpxh14UsEY_g+nw@mail.gmail.com/
stack_trace_save_user() is called after set_fs(KERNEL_DS).
I don't know why systemctl always hits this, but a user space process can make up its
stack as it likes, so we have to check it ourselves?
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
mm_segment_t old_fs;
ssize_t result;
old_fs = get_fs();
set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
result = vfs_read(file, (void __user *)buf, count, pos);
set_fs(old_fs);
return result;
}
EXPORT_SYMBOL(kernel_read);
> [ 20.404128] prepare_binprm+0x150/0x190
> [ 20.404766] __do_execve_file.isra.0+0x4c0/0xaa0
> [ 20.405559] __x64_sys_execve+0x39/0x50
> [ 20.406173] do_syscall_64+0x55/0x1b0
> [ 20.406762] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [ 20.407703] RIP: 0033:0x7fec9f0ee647
> [ 20.408441] Code: ff 76 e0 f7 d8 64 41 89 01 eb d8 0f 1f 84 00 00 00 00 00 f7 d8 64 41 89 01 eb d7 0f 1f 84 00 00 08
> [ 20.411636] RSP: 002b:00007ffe7f316228 EFLAGS: 00000202 ORIG_RAX: 000000000000003b
> [ 20.412686] RAX: ffffffffffffffda RBX: 00007fec9f8105a4 RCX: 00007fec9f0ee647
> [ 20.414047] RDX: 00007ffe7f3167d8 RSI: 00007ffe7f316230 RDI: 00007fec9f73cea0
> [ 20.415048] RBP: 00007ffe7f316480 R08: 0000000000000030 R09: 0000000000000001
> [ 20.416076] R10: 00007ffe7f316498 R11: 0000000000000202 R12: 00007fec9f73cea0
> [ 20.417556] R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
> [ 20.419032] irq event stamp: 832
> [ 20.419493] hardirqs last enabled at (831): [<ffffffffa94b9f2e>] __find_get_block+0xde/0x360
> [ 20.420649] hardirqs last disabled at (832): [<ffffffffa9305ad2>] rcu_irq_enter_irqson+0x12/0x30
> [ 20.422179] softirqs last enabled at (436): [<ffffffffaa20037f>] __do_softirq+0x37f/0x472
> [ 20.423286] softirqs last disabled at (291): [<ffffffffa927bce3>] irq_exit+0xb3/0xd0
> [ 20.424296] ---[ end trace cdb84a67edcdc420 ]---
>
[ added stable folks ]
On Sun, 7 Jul 2019 11:17:09 -0700
Linus Torvalds <[email protected]> wrote:
> On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <[email protected]> wrote:
> >
> > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> > fix and backport *that*. Then, in -tip, we could revert it and apply
> > this patch instead.
>
> You don't have to have the same fix in stable as in -tip.
>
> It's fine to send something to stable that says "Fixed differently by
> commit XYZ upstream". The main thing is to make sure that stable
> doesn't have fixes that then get lost upstream (which we used to have
> long long ago).
>
But isn't it easier for them to just pull the quick fix in, if it is in
your tree? That is, it shouldn't be too hard to make the "quick fix"
that gets backported on your tree (and probably better testing), and
then add the proper fix on top of it. The stable folks will then just
use the commit sha to know what to take, and feel more confident about
taking it.
-- Steve
On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:
>
> [ added stable folks ]
>
> On Sun, 7 Jul 2019 11:17:09 -0700
> Linus Torvalds <[email protected]> wrote:
>
> > On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <[email protected]> wrote:
> > >
> > > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
> > > fix and backport *that*. Then, in -tip, we could revert it and apply
> > > this patch instead.
> >
> > You don't have to have the same fix in stable as in -tip.
> >
> > It's fine to send something to stable that says "Fixed differently by
> > commit XYZ upstream". The main thing is to make sure that stable
> > doesn't have fixes that then get lost upstream (which we used to have
> > long long ago).
> >
>
> But isn't it easier for them to just pull the quick fix in, if it is in
> your tree? That is, it shouldn't be too hard to make the "quick fix"
> that gets backported on your tree (and probably better testing), and
> then add the proper fix on top of it. The stable folks will then just
> use the commit sha to know what to take, and feel more confident about
> taking it.
It all depends on what the "quick fix" is. The reason I want to take
the exact same patch that is in Linus's tree is that 95% of the time
that we do a "one off" patch for stable only, it's wrong. We _ALWAYS_
get it wrong somehow, it's crazy how bad we are at this. I don't know
why this is, but we have the stats to prove it.
Because of this, I now require the "one off" stable only fixes to get a
bunch of people reviewing it and write up a bunch of explaination as to
why this is the way it is and why we can't just take whatever is in
mainline.
thanks,
greg k-h
On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:
>
>[ added stable folks ]
>
>On Sun, 7 Jul 2019 11:17:09 -0700
>Linus Torvalds <[email protected]> wrote:
>
>> On Sun, Jul 7, 2019 at 8:11 AM Andy Lutomirski <[email protected]> wrote:
>> >
>> > FWIW, I'm leaning toward suggesting that we apply the trivial tracing
>> > fix and backport *that*. Then, in -tip, we could revert it and apply
>> > this patch instead.
>>
>> You don't have to have the same fix in stable as in -tip.
>>
>> It's fine to send something to stable that says "Fixed differently by
>> commit XYZ upstream". The main thing is to make sure that stable
>> doesn't have fixes that then get lost upstream (which we used to have
>> long long ago).
>>
>
>But isn't it easier for them to just pull the quick fix in, if it is in
>your tree? That is, it shouldn't be too hard to make the "quick fix"
>that gets backported on your tree (and probably better testing), and
>then add the proper fix on top of it. The stable folks will then just
>use the commit sha to know what to take, and feel more confident about
>taking it.
I'd say that if the "final" fix is significantly different than what
we'll end up with upstream then just do as Linus said and send us a
separate backport.
If we try doing the apply fix/revert etc games it'll just be more
difficult later on to parse what has happened. On the other hand, if we
have a clear explanation in the backported commit as to how it's
different from upstream and the reasons for doing so it'll make future
us happy when we try to apply fixes on top of it.
--
Thanks,
Sasha
On Wed, Jul 10, 2019 at 04:27:09PM -0400, Steven Rostedt wrote:
> But isn't it easier for them to just pull the quick fix in, if it is in
Steve, I've not yet seen a quick fix that actually fixes all the
problems.
Your initial one only fixes the IRQ tracing one, but leaves the context
tracking one wide open.