This is the x86_64 variant of the i386 fix I just submitted. I think
iret can only produce these faults when returning to user mode in a
32-bit process. The failure mode is even more mysterious on x86_64,
because it exits with -9999&0x7f instead of 11 (SIGSEGV), so it says
"Unknown signal 113 (core dumped)" when it exits without actually
trying to dump a core file.
Thanks,
Roland
Signed-off-by: Roland McGrath <[email protected]>
--- linux-2.6/arch/x86_64/kernel/entry.S
+++ linux-2.6/arch/x86_64/kernel/entry.S
@@ -471,14 +470,6 @@ iret_label:
.section __ex_table,"a"
.quad iret_label,bad_iret
.previous
- .section .fixup,"ax"
- /* force a signal here? this matches i386 behaviour */
- /* running with kernel gs */
-bad_iret:
- movq $-9999,%rdi /* better code? */
- jmp do_exit
- .previous
-
/* edi: workmask, edx: work */
retint_careful:
bt $TIF_NEED_RESCHED,%edx
@@ -522,6 +513,31 @@ retint_kernel:
#endif
CFI_ENDPROC
+ /*
+ * Traps in iret mean that userland tried to restore a bogus
+ * cs, eip, ss, esp, or eflags. Some kinds of bogosity just cause
+ * a trap after the iret returns, but some will cause a trap in
+ * iret itself. We want to treat those as if the restored user
+ * state is what caused that trap, i.e. produce the appropriate signal.
+ * Since normal .fixup code doesn't have access to the trap info,
+ * traps.c has a special case for iret. It's already generated the
+ * signal before we resume at bad_iret. Now we just need to recover
+ * the whole frame we were trying to restore, so it can be seen on
+ * our stack by the debugger.
+ */
+ .section .fixup,"ax"
+ /* running with kernel gs */
+ENTRY(bad_iret)
+ CFI_STARTPROC simple
+ CFI_DEF_CFA rsp,(SS+8-RIP)
+ CFI_REL_OFFSET rip,0
+ CFI_REL_OFFSET rsp,(RSP-RIP)
+ SAVE_ARGS 8
+ jmp exit_intr
+ CFI_ENDPROC
+ .previous
+
+
/*
* APIC interrupts.
*/
@@ -826,7 +842,10 @@ paranoid_swapgs:
swapgs
paranoid_restore:
RESTORE_ALL 8
- iretq
+1: iretq
+ .section __ex_table,"a"
+ .quad 1b,bad_iret
+ .previous
paranoid_userspace:
cli
GET_THREAD_INFO(%rcx)
--- linux-2.6/arch/x86_64/kernel/traps.c
+++ linux-2.6/arch/x86_64/kernel/traps.c
@@ -404,6 +404,44 @@ void die_nmi(char *str, struct pt_regs *
do_exit(SIGSEGV);
}
+
+/*
+ * When we get an exception in the iret instructions in entry.S, whatever
+ * fault it is really belongs to the user state we are restoring. We want
+ * to turn it into a signal. To make that signal's info exactly match what
+ * this same kind of fault in a user instruction would show, the fixup
+ * needs to know the trapno and error code. But those are lost when we get
+ * back to the fixup entrypoint. So we have a special case for the iret
+ * fixups, and generate the signal here like a normal user trap would.
+ * Then the fixup code restores the pt_regs on the base of the stack to
+ * the bogus user state it was trying to return to, before handling the signal.
+ */
+extern void bad_iret(void); /* entry.S label for code in .fixup */
+static inline int is_iret(struct pt_regs *regs)
+{
+ return (regs->rip == (unsigned long)&bad_iret);
+}
+
+/*
+ * If the iret was actually trying to return to kernel mode,
+ * that should be an oops.
+ */
+static inline int iret_to_user(struct pt_regs *regs)
+{
+ /*
+ * The frame being restored was all popped off and restored except
+ * the last five words that iret pops. Instead of popping, it
+ * pushed another trap frame, clobbering the part of the old one
+ * that we had already restored. So the restored registers are now
+ * all back in the new trap frame, but the rip et al show the
+ * in-kernel state at the iret instruction. The bad state we tried
+ * to restore with iret is still on the old stack below.
+ */
+ struct pt_regs *oregs = container_of((unsigned long *) regs->rsp,
+ struct pt_regs, rip);
+
+ return likely((oregs->cs & 3) == 3);
+}
static void do_trap(int trapnr, int signr, char *str,
struct pt_regs * regs, long error_code, siginfo_t *info)
{
@@ -423,7 +461,9 @@ static void do_trap(int trapnr, int sign
#endif
if ((regs->cs & 3) != 0) {
- struct task_struct *tsk = current;
+ struct task_struct *tsk;
+ user_trap:
+ tsk = current;
if (exception_trace && unhandled_signal(tsk, signr))
printk(KERN_INFO
@@ -447,7 +487,13 @@ static void do_trap(int trapnr, int sign
fixup = search_exception_tables(regs->rip);
if (fixup) {
regs->rip = fixup->fixup;
- } else
- die(str, regs, error_code);
+ if (!is_iret(regs))
+ return;
+ if (iret_to_user(regs)) {
+ local_irq_enable();
+ goto user_trap;
+ }
+ }
+ die(str, regs, error_code);
return;
}
@@ -523,8 +569,10 @@ asmlinkage void do_general_protection(st
}
#endif
- if ((regs->cs & 3)!=0) {
- struct task_struct *tsk = current;
+ if ((regs->cs & 3) != 0) {
+ struct task_struct *tsk;
+ user_gp:
+ tsk = current;
if (exception_trace && unhandled_signal(tsk, SIGSEGV))
printk(KERN_INFO
@@ -544,7 +592,12 @@ asmlinkage void do_general_protection(st
fixup = search_exception_tables(regs->rip);
if (fixup) {
regs->rip = fixup->fixup;
- return;
+ if (!is_iret(regs))
+ return;
+ if (iret_to_user(regs)) {
+ local_irq_enable();
+ goto user_gp;
+ }
}
if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
On Sat, Apr 23, 2005 at 05:50:37PM -0700, Roland McGrath wrote:
> This is the x86_64 variant of the i386 fix I just submitted. I think
> iret can only produce these faults when returning to user mode in a
> 32-bit process. The failure mode is even more mysterious on x86_64,
> because it exits with -9999&0x7f instead of 11 (SIGSEGV), so it says
> "Unknown signal 113 (core dumped)" when it exits without actually
> trying to dump a core file.
I agree that handling this better is a good idea.
But I really hate your is_iret hack in traps.c. Cant you just force the signal
after fixing up the stack? I dont want such a ugly complicated special case
there that only handles this extremly exotic case.
If you cant do it in a cleaner way it would be better not to fix it.
But I suppose forcing a signal directly is doable.
-Andi
On Mon, 25 Apr 2005, Andi Kleen wrote:
>
> But I really hate your is_iret hack in traps.c.
I agree.
Why don't you just do
pushl $0
pushl $do_iret_error
jmp error_code
And just have a "do_iret_error()" handler in traps.c which does the right
thing? That seems like the _sane_ thing to do, and that way there is never
any confusion about what is going on.
Linus
> Why don't you just do
>
> pushl $0
> pushl $do_iret_error
> jmp error_code
I quote from the comment in the code:
We want
* to turn it into a signal. To make that signal's info exactly match what
* this same kind of fault in a user instruction would show, the fixup
* needs to know the trapno and error code. But those are lost when we get
* back to the fixup entrypoint.
The error code is not always 0, it might be a bad segment value. I think
the kernel ought to give accurate information about the fault consistently
no matter where it occurs, so I did not want to pretend the error code is 0.
I certainly think it would be cleaner if the fixup code could access the
fault information directly. However, it's arguably not so clean to have a
do_iret_error function that replicates the work of do_trap and
do_general_protection. The iret case is really not so much a special case
for what to do, but a special case for how you determine whether the
vanilla user-mode thing is done or the vanilla kernel-mode thing is done.
Thanks,
Roland
On Mon, 25 Apr 2005, Roland McGrath wrote:
>
> We want
> * to turn it into a signal. To make that signal's info exactly match what
> * this same kind of fault in a user instruction would show, the fixup
> * needs to know the trapno and error code. But those are lost when we get
> * back to the fixup entrypoint.
Bah.
The _information_ is there, it's right on the stack-frame in the original
CS/SS etc.
I'll take reliable and obvious code _any_ day over hacky code just to get
me an "error code" for something that just isn't very interesting.
In other words, your code is not only ugly, it's also the kind of code
that makes you go "how does that work".
I think it's a lot more important that the kernel obviously work
correctly.
Linus
What would you think about a general hack to let given fixup table entries
say the code wants the trap and error info made available (pushed on the
stack or whatever)? Conversely, would there be any harm in always setting
->thread.error_code and ->thread.trap_no for a kernel trap?
Thanks,
Roland
On Mon, 25 Apr 2005, Roland McGrath wrote:
>
> What would you think about a general hack to let given fixup table entries
> say the code wants the trap and error info made available (pushed on the
> stack or whatever)?
Yes, that sounds fine, if there is just some sane way to indicate that
(normally the fixup eip is close to the faulting eip, so there should be
tons of bits available, but it would need to be something clean? For iret,
I guess we could just set fixup-eip to zero, which would mean "return to
eip+1 and set error", but that sounds pretty hacky too)
> Conversely, would there be any harm in always setting
> ->thread.error_code and ->thread.trap_no for a kernel trap?
Page-ins etc in kernel mode are normal, and they shouldn't set the thread
state. So I think the "turn it into a trap" thing is nicer.
Linus
I was never very happy with the special-case check for iret_exc either.
But for the first crack, I went for the fix that didn't touch other
infrastructure code.
The fault.c changes here are really not necessary for the bug fix at all,
it will never be used there. But to make it a clean infrastructure
upgrade, I made every caller of fixup_exception consistently pass in the
complete info it uses for signals in the user-mode case.
A user process can cause a kernel-mode fault in the iret instruction, by
setting an invalid %cs and such, via ptrace or sigreturn. The fixup code
now calls do_exit(11), so a process will die with SIGSEGV but never
generate a core dump. This is vastly more confusing in a multithreaded
program, because do_exit just kills that one thread and so it appears to
mysteriously disappear when it calls sigreturn.
This patch makes faults in iret produce the normal signals that would
result from the same errors when executing some user-mode instruction.
To accomplish this, I've extended the exception_table mechanism to support
"special fixups". Instead of a PC location to jump to, these have a
function called in the trap handler context and passed the full trap details.
Signed-off-by: Roland McGrath <[email protected]>
---
commit 66b1fbcee2c94a5fd9d199e48a25d5f6382f6f05
tree cb5e97a950137e15a400501ab3aa95fe539d0a45
parent f6bbd250c19529f6efebc6c8bd7d3495b98e5eca
author Roland McGrath <[email protected]> 1114589292 -0700
committer Roland McGrath <[email protected]> 1114589292 -0700
Index: arch/i386/kernel/entry.S
===================================================================
--- 23cf0cf93e225498eea50480a44b038be19b333d/arch/i386/kernel/entry.S (mode:100644 sha1:3c73dc865ead3ad2323098a3b78b0fc8042e7489)
+++ cb5e97a950137e15a400501ab3aa95fe539d0a45/arch/i386/kernel/entry.S (mode:100644 sha1:d8e602ca337b434e48f2014048f7ecdef221198f)
@@ -75,6 +75,9 @@
NT_MASK = 0x00004000
VM_MASK = 0x00020000
+/* This marks an __ex_table entry that has an exception_fixup_t handler. */
+#define SPECIAL_FIXUP(function) (function - __PAGE_OFFSET)
+
#ifdef CONFIG_PREEMPT
#define preempt_stop cli
#else
@@ -257,18 +260,27 @@
RESTORE_REGS
addl $4, %esp
1: iret
+ /*
+ * Traps in iret mean that userland tried to restore a bogus
+ * cs, eip, ss, esp, or eflags. Some kinds of bogosity just cause
+ * a trap after the iret returns, but some will cause a trap in
+ * iret itself. We want to treat those as if the restored user
+ * state is what caused that trap, i.e. produce the appropriate signal.
+ * Since normal .fixup code doesn't have access to the trap info,
+ * traps.c has a special case for iret. It's already generated the
+ * signal before we resume at iret_exc. Now we just need to recover
+ * the whole frame we were trying to restore, so it can be seen on
+ * our stack by the debugger.
+ */
.section .fixup,"ax"
-iret_exc:
- sti
- movl $__USER_DS, %edx
- movl %edx, %ds
- movl %edx, %es
- movl $11,%eax
- call do_exit
+ENTRY(iret_exc)
+ pushl $0 # orig_eax was lost
+ SAVE_ALL
+ jmp ret_from_exception
.previous
.section __ex_table,"a"
.align 4
- .long 1b,iret_exc
+ .long 1b,SPECIAL_FIXUP(fixup_iret)
.previous
ldt_ss:
@@ -293,7 +305,7 @@
1: iret
.section __ex_table,"a"
.align 4
- .long 1b,iret_exc
+ .long 1b,SPECIAL_FIXUP(fixup_iret)
.previous
# perform work that needs to be done immediately before resumption
@@ -589,7 +601,7 @@
1: iret
.section __ex_table,"a"
.align 4
- .long 1b,iret_exc
+ .long 1b,SPECIAL_FIXUP(fixup_iret)
.previous
ENTRY(int3)
Index: arch/i386/kernel/traps.c
===================================================================
--- 23cf0cf93e225498eea50480a44b038be19b333d/arch/i386/kernel/traps.c (mode:100644 sha1:6c0e383915b6a0d9fc516a04f328020d18b575b5)
+++ cb5e97a950137e15a400501ab3aa95fe539d0a45/arch/i386/kernel/traps.c (mode:100644 sha1:12b09606ace52f8d8525b6e4f1d2987ab8873308)
@@ -357,6 +357,60 @@
die(str, regs, err);
}
+/*
+ * When we get an exception in the iret instructions in entry.S, whatever
+ * fault it is really belongs to the user state we are restoring. We want
+ * to turn it into a signal. To make that signal's info exactly match what
+ * this same kind of fault in a user instruction would show, the fixup
+ * needs to know the trapno and error code. So, we use this special fixup
+ * hook for the iret instructions. If the iret was returning to user mode,
+ * we force a signal like the same exception in user mode would have. Then
+ * the fixup code at iret_exc (entry.S) restores the pt_regs on the base of
+ * the stack to the bogus user state it was trying to return to, before
+ * handling the signal.
+ */
+int fixup_iret(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long error_code, siginfo_t *info)
+{
+ /*
+ * The frame being restored was all popped off and restored except
+ * the last five words that iret pops. Instead of popping, it
+ * pushed another trap frame, clobbering the part of the old one
+ * that we had already restored. So the restored registers are now
+ * all back in the new trap frame, but the eip et al show the
+ * in-kernel state at the iret instruction. The bad state we tried
+ * to restore with iret is still on the stack, right below our
+ * current trap frame. The current trap frame was for an in-kernel
+ * trap, and so doesn't include the esp and ss words--so ®s->esp
+ * is where %esp was before the iret.
+ */
+ struct pt_regs *oregs = container_of(®s->esp, struct pt_regs, eip);
+ struct task_struct *tsk = current;
+
+ if (likely((oregs->xcs & 3) == 3)) {
+ /*
+ * The iret was trying to return to user mode.
+ * Turn this into a signal like do_trap would normally do.
+ */
+ extern void iret_exc(void);
+ local_irq_enable();
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ regs->eip = (unsigned long) &iret_exc;
+ return 1;
+ }
+
+ /*
+ * The iret was actually trying to return to kernel mode,
+ * so in fact we didn't handle this exception.
+ */
+ return 0;
+}
+
static void do_trap(int trapnr, int signr, char *str, int vm86,
struct pt_regs * regs, long error_code, siginfo_t *info)
{
@@ -381,7 +435,7 @@
}
kernel_trap: {
- if (!fixup_exception(regs))
+ if (!fixup_exception(regs, trapnr, signr, error_code, info))
die(str, regs, error_code);
return;
}
@@ -501,7 +555,7 @@
return;
gp_in_kernel:
- if (!fixup_exception(regs)) {
+ if (!fixup_exception(regs, 13, SIGSEGV, error_code, NULL)) {
if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
return;
Index: arch/i386/mm/extable.c
===================================================================
--- 23cf0cf93e225498eea50480a44b038be19b333d/arch/i386/mm/extable.c (mode:100644 sha1:f706449319c4577e5d944561b295d199449bcc02)
+++ cb5e97a950137e15a400501ab3aa95fe539d0a45/arch/i386/mm/extable.c (mode:100644 sha1:14eec19b079d5afcee76547b4e9a7e33ec384cff)
@@ -7,7 +7,8 @@
#include <linux/spinlock.h>
#include <asm/uaccess.h>
-int fixup_exception(struct pt_regs *regs)
+int fixup_exception(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long error_code, siginfo_t *info)
{
const struct exception_table_entry *fixup;
@@ -28,6 +29,10 @@
fixup = search_exception_tables(regs->eip);
if (fixup) {
+ if (fixup_is_special(fixup)) {
+ exception_fixup_t *hook = special_fixup_handler(fixup);
+ return (*hook) (regs, trapnr, signr, error_code, info);
+ }
regs->eip = fixup->fixup;
return 1;
}
Index: arch/i386/mm/fault.c
===================================================================
--- 23cf0cf93e225498eea50480a44b038be19b333d/arch/i386/mm/fault.c (mode:100644 sha1:a509237c4815ed6f8443e85b973dda2a0f2c0688)
+++ cb5e97a950137e15a400501ab3aa95fe539d0a45/arch/i386/mm/fault.c (mode:100644 sha1:be670e7a9593e5a349408262400d6e31296db92d)
@@ -374,6 +374,12 @@
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
+ /* Set up info passed to force_sig_info or to fixup_exception. */
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ /* info.si_code has been set above */
+ info.si_addr = (void __user *)address;
+
/* User mode accesses just cause a SIGSEGV */
if (error_code & 4) {
/*
@@ -387,10 +393,6 @@
/* Kernel addresses are always protection faults */
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void __user *)address;
force_sig_info(SIGSEGV, &info, tsk);
return;
}
@@ -413,7 +415,7 @@
no_context:
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
+ if (fixup_exception(regs, 14, info.si_signo, error_code, &info))
return;
/*
@@ -481,11 +483,20 @@
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
do_exit(SIGKILL);
+ info.si_signo = SIGKILL;
+ info.si_errno = 0;
+ info.si_addr = (void __user *)address;
goto no_context;
do_sigbus:
up_read(&mm->mmap_sem);
+ /* Set up info passed to force_sig_info or to fixup_exception. */
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+
/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
goto no_context;
@@ -497,14 +508,15 @@
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
force_sig_info(SIGBUS, &info, tsk);
return;
vmalloc_fault:
+ /* Set up info passed to fixup_exception. */
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ /* info.si_code has been set above */
+ info.si_addr = (void __user *)address;
{
/*
* Synchronize this task's top level page-table
Index: include/asm-i386/uaccess.h
===================================================================
--- 23cf0cf93e225498eea50480a44b038be19b333d/include/asm-i386/uaccess.h (mode:100644 sha1:886867aea947094185fcac17610acc67f9310caf)
+++ cb5e97a950137e15a400501ab3aa95fe539d0a45/include/asm-i386/uaccess.h (mode:100644 sha1:50cd8556e8825a5c6f42ec4dc062e952c27317d3)
@@ -124,8 +124,19 @@
{
unsigned long insn, fixup;
};
-
-extern int fixup_exception(struct pt_regs *regs);
+static inline int fixup_is_special(const struct exception_table_entry *entry)
+{
+ return unlikely(entry->fixup < PAGE_OFFSET);
+}
+struct siginfo;
+typedef int exception_fixup_t(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long err, struct siginfo *info);
+static inline exception_fixup_t *
+special_fixup_handler(const struct exception_table_entry *entry)
+{
+ return (exception_fixup_t *) (entry->fixup + PAGE_OFFSET);
+}
+extern exception_fixup_t fixup_exception;
/*
* These are the main single-value transfer routines. They automatically
Here is the x86-64 version of the new patch (I just sent the i386 one).
The same comments apply here about the fault.c changes. I made a
fixup_exception function like i386's, replacing the duplication of its
contents in several places rather than adding new code to all the duplicates.
A user process can cause a kernel-mode fault in the iret instruction, by
setting an invalid %cs and such, via ptrace or sigreturn. The fixup code
now calls do_exit(11), so a process will die with SIGSEGV but never
generate a core dump. This is vastly more confusing in a multithreaded
program, because do_exit just kills that one thread and so it appears to
mysteriously disappear when it calls sigreturn.
This patch makes faults in iret produce the normal signals that would
result from the same errors when executing some user-mode instruction.
To accomplish this, I've extended the exception_table mechanism to support
"special fixups". Instead of a PC location to jump to, these have a
function called in the trap handler context and passed the full trap details.
Signed-off-by: Roland McGrath <[email protected]>
--- linux-2.6/arch/x86_64/kernel/entry.S (mode:100644 sha1:3233a15cc4e074c00b75569f21c2844ee280b214)
+++ linux-2.6/arch/x86_64/kernel/entry.S (mode:100644 sha1:85d0b453385807901a446ed0e8f6c5b2d96e0ca5)
@@ -44,6 +44,9 @@
.code64
+/* This marks an __ex_table entry that has an exception_fixup_t handler. */
+#define SPECIAL_FIXUP(function) (function - __PAGE_OFFSET)
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -459,14 +462,7 @@
iretq
.section __ex_table,"a"
- .quad iret_label,bad_iret
- .previous
- .section .fixup,"ax"
- /* force a signal here? this matches i386 behaviour */
- /* running with kernel gs */
-bad_iret:
- movq $-9999,%rdi /* better code? */
- jmp do_exit
+ .quad iret_label,SPECIAL_FIXUP(fixup_iret)
.previous
/* edi: workmask, edx: work */
@@ -509,6 +508,31 @@
#endif
CFI_ENDPROC
+ /*
+ * Traps in iret mean that userland tried to restore a bogus
+ * cs, eip, ss, esp, or eflags. Some kinds of bogosity just cause
+ * a trap after the iret returns, but some will cause a trap in
+ * iret itself. We want to treat those as if the restored user
+ * state is what caused that trap, i.e. produce the appropriate signal.
+ * Since normal .fixup code doesn't have access to the trap info,
+ * traps.c has a special case for iret. It's already generated the
+ * signal before we resume at iret_exc. Now we just need to recover
+ * the whole frame we were trying to restore, so it can be seen on
+ * our stack by the debugger.
+ */
+ .section .fixup,"ax"
+ /* running with kernel gs */
+ENTRY(iret_exc)
+ CFI_STARTPROC simple
+ CFI_DEF_CFA rsp,(SS+8-RIP)
+ CFI_REL_OFFSET rip,0
+ CFI_REL_OFFSET rsp,(RSP-RIP)
+ SAVE_ARGS 8
+ jmp exit_intr
+ CFI_ENDPROC
+ .previous
+
+
/*
* APIC interrupts.
*/
@@ -819,7 +843,10 @@
swapgs
paranoid_restore:
RESTORE_ALL 8
- iretq
+1: iretq
+ .section __ex_table,"a"
+ .quad 1b,SPECIAL_FIXUP(fixup_iret)
+ .previous
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl threadinfo_flags(%rcx),%ebx
--- linux-2.6/arch/x86_64/kernel/traps.c (mode:100644 sha1:65a37f52c56ef2c0760f2e3db9dfec9312a74d88)
+++ linux-2.6/arch/x86_64/kernel/traps.c (mode:100644 sha1:9d199565e3a72768dfd8c352860e3d13306669f2)
@@ -417,6 +417,59 @@
do_exit(SIGSEGV);
}
+/*
+ * When we get an exception in the iret instructions in entry.S, whatever
+ * fault it is really belongs to the user state we are restoring. We want
+ * to turn it into a signal. To make that signal's info exactly match what
+ * this same kind of fault in a user instruction would show, the fixup
+ * needs to know the trapno and error code. So, we use this special fixup
+ * hook for the iret instructions. If the iret was returning to user mode,
+ * we force a signal like the same exception in user mode would have. Then
+ * the fixup code at iret_exc (entry.S) restores the pt_regs on the base of
+ * the stack to the bogus user state it was trying to return to, before
+ * handling the signal.
+ */
+int fixup_iret(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long error_code, siginfo_t *info)
+{
+ /*
+ * The frame being restored was all popped off and restored except
+ * the last five words that iret pops. Instead of popping, it
+ * pushed another trap frame, clobbering the part of the old one
+ * that we had already restored. So the restored registers are now
+ * all back in the new trap frame, but the rip et al show the
+ * in-kernel state at the iret instruction. The bad state we tried
+ * to restore with iret is still on the old stack below.
+ */
+ struct pt_regs *oregs = container_of((unsigned long *) regs->rsp,
+ struct pt_regs, rip);
+ struct task_struct *tsk = current;
+
+ if (likely((oregs->cs & 3) == 3)) {
+ /*
+ * The iret was trying to return to user mode.
+ * Turn this into a signal like do_trap would normally do.
+ */
+ extern void iret_exc(void);
+ local_irq_enable();
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ regs->rip = (unsigned long) &iret_exc;
+ return 1;
+ }
+
+ /*
+ * The iret was actually trying to return to kernel mode,
+ * so in fact we didn't handle this exception.
+ */
+ return 0;
+}
+
+
static void do_trap(int trapnr, int signr, char *str,
struct pt_regs * regs, long error_code, siginfo_t *info)
{
@@ -455,15 +508,8 @@
/* kernel trap */
- {
- const struct exception_table_entry *fixup;
- fixup = search_exception_tables(regs->rip);
- if (fixup) {
- regs->rip = fixup->fixup;
- } else
- die(str, regs, error_code);
- return;
- }
+ if (!fixup_exception(regs, trapnr, signr, error_code, info))
+ die(str, regs, error_code);
}
#define DO_ERROR(trapnr, signr, str, name) \
@@ -520,7 +566,7 @@
}
#endif
- if ((regs->cs & 3)!=0) {
+ if ((regs->cs & 3) != 0) {
struct task_struct *tsk = current;
if (exception_trace && unhandled_signal(tsk, SIGSEGV))
@@ -536,18 +582,12 @@
}
/* kernel gp */
- {
- const struct exception_table_entry *fixup;
- fixup = search_exception_tables(regs->rip);
- if (fixup) {
- regs->rip = fixup->fixup;
- return;
- }
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
- }
+ if (fixup_exception(regs, 13, SIGSEGV, error_code, NULL))
+ return;
+ if (notify_die(DIE_GPF, "general protection fault", regs,
+ error_code, 13, SIGSEGV) == NOTIFY_STOP)
+ return;
+ die("general protection fault", regs, error_code);
}
static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
@@ -729,12 +769,8 @@
static int kernel_math_error(struct pt_regs *regs, char *str)
{
- const struct exception_table_entry *fixup;
- fixup = search_exception_tables(regs->rip);
- if (fixup) {
- regs->rip = fixup->fixup;
+ if (fixup_exception(regs, 16, SIGFPE, 0, NULL))
return 1;
- }
notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE);
/* Illegal floating point operation in the kernel */
die(str, regs, 0);
--- linux-2.6/arch/x86_64/mm/extable.c (mode:100644 sha1:2d78f9fb403542b72771075502c4c662cc06ca35)
+++ linux-2.6/arch/x86_64/mm/extable.c (mode:100644 sha1:e8fd080fffb380dd4e60ea08f8b4e086f63563dd)
@@ -33,3 +33,21 @@
}
return NULL;
}
+
+int fixup_exception(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long error_code, siginfo_t *info)
+{
+ const struct exception_table_entry *fixup;
+
+ fixup = search_exception_tables(regs->rip);
+ if (fixup) {
+ if (fixup_is_special(fixup)) {
+ exception_fixup_t *hook = special_fixup_handler(fixup);
+ return (*hook) (regs, trapnr, signr, error_code, info);
+ }
+ regs->rip = fixup->fixup;
+ return 1;
+ }
+
+ return 0;
+}
--- linux-2.6/arch/x86_64/mm/fault.c (mode:100644 sha1:e3f90b6abb8d5fabfac28799e8a493f59759eed0)
+++ linux-2.6/arch/x86_64/mm/fault.c (mode:100644 sha1:92168ed10c1cdb4ca1eee76b1b4b67bb91dfa984)
@@ -298,7 +298,6 @@
struct mm_struct *mm;
struct vm_area_struct * vma;
unsigned long address;
- const struct exception_table_entry *fixup;
int write;
siginfo_t info;
@@ -456,6 +455,12 @@
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
+ /* Set up info passed to force_sig_info or to fixup_exception. */
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ /* info.si_code has been set above */
+ info.si_addr = (void __user *)address;
+
/* User mode accesses just cause a SIGSEGV */
if (error_code & 4) {
if (is_prefetch(regs, address, error_code))
@@ -483,10 +488,6 @@
/* Kernel addresses are always protection faults */
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void __user *)address;
force_sig_info(SIGSEGV, &info, tsk);
return;
}
@@ -494,11 +495,8 @@
no_context:
/* Are we prepared to handle this kernel fault? */
- fixup = search_exception_tables(regs->rip);
- if (fixup) {
- regs->rip = fixup->fixup;
+ if (fixup_exception(regs, 14, info.si_signo, error_code, &info))
return;
- }
/*
* Hall of shame of CPU/BIOS bugs.
@@ -544,11 +542,20 @@
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
do_exit(SIGKILL);
+ info.si_signo = SIGKILL;
+ info.si_errno = 0;
+ info.si_addr = (void __user *)address;
goto no_context;
do_sigbus:
up_read(&mm->mmap_sem);
+ /* Set up info passed to force_sig_info or to fixup_exception. */
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+
/* Kernel mode? Handle exceptions or die */
if (!(error_code & 4))
goto no_context;
@@ -556,10 +563,6 @@
tsk->thread.cr2 = address;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 14;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
force_sig_info(SIGBUS, &info, tsk);
return;
}
--- linux-2.6/include/asm-x86_64/uaccess.h (mode:100644 sha1:48f292752c96bac97900682b98e947651b701d6b)
+++ linux-2.6/include/asm-x86_64/uaccess.h (mode:100644 sha1:721817e4c48297af0306137b659325425d4bf82f)
@@ -73,6 +73,19 @@
{
unsigned long insn, fixup;
};
+static inline int fixup_is_special(const struct exception_table_entry *entry)
+{
+ return unlikely(entry->fixup < PAGE_OFFSET);
+}
+struct siginfo;
+typedef int exception_fixup_t(struct pt_regs *regs, int trapnr, int signr,
+ unsigned long err, struct siginfo *info);
+static inline exception_fixup_t *
+special_fixup_handler(const struct exception_table_entry *entry)
+{
+ return (exception_fixup_t *) (entry->fixup + PAGE_OFFSET);
+}
+extern exception_fixup_t fixup_exception;
#define ARCH_HAS_SEARCH_EXTABLE
> This patch makes faults in iret produce the normal signals that would
> result from the same errors when executing some user-mode instruction.
> To accomplish this, I've extended the exception_table mechanism to support
> "special fixups". Instead of a PC location to jump to, these have a
> function called in the trap handler context and passed the full trap details.
As written earlier I dont like this patch because it is far too complicated.
I would just fake a simple signal in the error handler, all the other
complicated infrastructure seems unnecessary.
Linus, Andrew, please dont apply this patch.
-Andi
On Thu, Apr 28, 2005 at 08:40:12PM -0700, Roland McGrath wrote:
> I was never very happy with the special-case check for iret_exc either.
> But for the first crack, I went for the fix that didn't touch other
> infrastructure code.
>
> The fault.c changes here are really not necessary for the bug fix at all,
> it will never be used there. But to make it a clean infrastructure
> upgrade, I made every caller of fixup_exception consistently pass in the
> complete info it uses for signals in the user-mode case.
>
>
> A user process can cause a kernel-mode fault in the iret instruction, by
> setting an invalid %cs and such, via ptrace or sigreturn. The fixup code
> now calls do_exit(11), so a process will die with SIGSEGV but never
> generate a core dump. This is vastly more confusing in a multithreaded
> program, because do_exit just kills that one thread and so it appears to
> mysteriously disappear when it calls sigreturn.
>
> This patch makes faults in iret produce the normal signals that would
> result from the same errors when executing some user-mode instruction.
> To accomplish this, I've extended the exception_table mechanism to support
> "special fixups". Instead of a PC location to jump to, these have a
> function called in the trap handler context and passed the full trap details.
>
I think this is still far too complicated. Or at least I would
not accept a similar patch for x86-64 :) The infrastructure
seems also needlessly complicated to me, because I dont know
what it should be used for except for this.
Is it really that hard to just cause a signal in the
iret exception handler in entry.S? You can get the trapno
there anyways from orig_rax (at least on x86-64)
And having all that complicated code just to get the trapno
seems quite pointless to me. Especially since nobody
uses it it will be likely bitrotten more often than working
anyways, so it is not very useful. The error code is
in the stack frame anyways and you can get it from there.
And the trapno can just be faked to something.
-Andi
On Thu, 28 Apr 2005, Roland McGrath wrote:
>
> I was never very happy with the special-case check for iret_exc either.
> But for the first crack, I went for the fix that didn't touch other
> infrastructure code.
>
> The fault.c changes here are really not necessary for the bug fix at all,
> it will never be used there. But to make it a clean infrastructure
> upgrade, I made every caller of fixup_exception consistently pass in the
> complete info it uses for signals in the user-mode case.
I really ended up deciding that we can fix it with a simple one-liner
instead, which actually simplifies and cleans up the code, instead of
adding new special cases.
I just committed the appended, which actually removes one line more than
it adds.
Linus
-----
commit a879cbbb34cbecfa9707fbb6e5a00c503ac1ecb9
tree fdf247f8dedea8f04d0989aeab6922ed073eee11
parent c06fec5022ebe014af876da2df4a0eee836e97c8
author Linus Torvalds <[email protected]> Fri, 29 Apr 2005 09:38:44 -0700
committer Linus Torvalds <[email protected]> Fri, 29 Apr 2005 09:38:44 -0700
x86: make traps on 'iret' be debuggable in user space
This makes a trap on the 'iret' that returns us to user space
cause a nice clean SIGSEGV, instead of just a hard (and silent)
exit.
That way a debugger can actually try to see what happened, and
we also properly notify everybody who might be interested about
us being gone.
This loses the error code, but tells the debugger what happened
with ILL_BADSTK in the siginfo.
--- k/arch/i386/kernel/entry.S (mode:100644)
+++ l/arch/i386/kernel/entry.S (mode:100644)
@@ -260,11 +260,9 @@ restore_nocheck:
.section .fixup,"ax"
iret_exc:
sti
- movl $__USER_DS, %edx
- movl %edx, %ds
- movl %edx, %es
- movl $11,%eax
- call do_exit
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
.previous
.section __ex_table,"a"
.align 4
--- k/arch/i386/kernel/traps.c (mode:100644)
+++ l/arch/i386/kernel/traps.c (mode:100644)
@@ -451,6 +451,7 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", inv
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
fastcall void do_general_protection(struct pt_regs * regs, long error_code)
{
> I really ended up deciding that we can fix it with a simple one-liner
> instead, which actually simplifies and cleans up the code, instead of
> adding new special cases.
>
> I just committed the appended, which actually removes one line more than
> it adds.
Thanks looks great. I will add the equivalent to x86-64.
-Andi