2006-03-22 06:48:18

by Chris Wright

Subject: [RFC PATCH 11/35] Add support for Xen to entry.S.

- change cli/sti
- change test for user mode return to work for kernel mode in ring1
- check hypervisor saved event mask on return from exception
- add entry points for the hypervisor upcall handlers
- avoid math emulation check when running on Xen
- add nmi handler for running on Xen

Signed-off-by: Ian Pratt <[email protected]>
Signed-off-by: Christian Limpach <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
---
arch/i386/kernel/entry.S | 228 +++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 211 insertions(+), 17 deletions(-)

--- xen-subarch-2.6.orig/arch/i386/kernel/entry.S
+++ xen-subarch-2.6/arch/i386/kernel/entry.S
@@ -75,8 +75,42 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000

+#ifndef CONFIG_XEN
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#else
+#include <xen/interface/xen.h>
+
+EVENT_MASK = 0x2E
+
+/* Pseudo-eflags. */
+NMI_MASK = 0x80000000
+
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define sizeof_vcpu_shift 6
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
+ shl $sizeof_vcpu_shift,%esi ; \
+ addl HYPERVISOR_shared_info,%esi
+#else
+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
+#endif
+
+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
+#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
+ __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
+ __ENABLE_INTERRUPTS
+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
+#endif
+
#ifdef CONFIG_PREEMPT
-#define preempt_stop cli
+#define preempt_stop DISABLE_INTERRUPTS
#else
#define preempt_stop
#define resume_kernel restore_nocheck
@@ -145,10 +179,10 @@ ret_from_intr:
GET_THREAD_INFO(%ebp)
movl EFLAGS(%esp), %eax # mix EFLAGS and CS
movb CS(%esp), %al
- testl $(VM_MASK | 3), %eax
+ testl $(VM_MASK | USER_MODE_MASK), %eax
jz resume_kernel
ENTRY(resume_userspace)
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -159,7 +193,7 @@ ENTRY(resume_userspace)

#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- cli
+ DISABLE_INTERRUPTS
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
@@ -179,7 +213,7 @@ need_resched:
ENTRY(sysenter_entry)
movl TSS_sysenter_esp0(%esp),%esp
sysenter_past_esp:
- sti
+ ENABLE_INTERRUPTS
pushl $(__USER_DS)
pushl %ebp
pushfl
@@ -209,7 +243,7 @@ sysenter_past_esp:
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp)
- cli
+ DISABLE_INTERRUPTS
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
jne syscall_exit_work
@@ -217,7 +251,7 @@ sysenter_past_esp:
movl EIP(%esp), %edx
movl OLDESP(%esp), %ecx
xorl %ebp,%ebp
- sti
+ ENABLE_INTERRUPTS
sysexit


@@ -236,7 +270,7 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp) # store the return value
syscall_exit:
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -244,6 +278,7 @@ syscall_exit:
jne syscall_exit_work

restore_all:
+#ifndef CONFIG_XEN
movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -254,12 +289,25 @@ restore_all:
cmpl $((4 << 8) | 3), %eax
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
+#else
+restore_nocheck:
+ testl $(VM_MASK|NMI_MASK), EFLAGS(%esp)
+ jnz hypervisor_iret
+ movb EVENT_MASK(%esp), %al
+ notb %al # %al == ~saved_mask
+ GET_VCPU_INFO
+ andb evtchn_upcall_mask(%esi),%al
+ andb $1,%al # %al == mask & ~saved_mask
+ jnz restore_all_enable_events # != 0 => reenable event delivery
+#endif
RESTORE_REGS
addl $4, %esp
1: iret
.section .fixup,"ax"
iret_exc:
- sti
+#ifndef CONFIG_XEN
+ ENABLE_INTERRUPTS
+#endif
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -269,6 +317,7 @@ iret_exc:
.long 1b,iret_exc
.previous

+#ifndef CONFIG_XEN
ldt_ss:
larl OLDSS(%esp), %eax
jnz restore_nocheck
@@ -281,7 +330,7 @@ ldt_ss:
* CPUs, which we can try to work around to make
* dosemu and wine happy. */
subl $8, %esp # reserve space for switch16 pointer
- cli
+ DISABLE_INTERRUPTS
movl %esp, %eax
/* Set up the 16bit stack frame with switch32 pointer on top,
* and a switch16 pointer on top of the current frame. */
@@ -293,6 +342,13 @@ ldt_ss:
.align 4
.long 1b,iret_exc
.previous
+#else
+hypervisor_iret:
+ andl $~NMI_MASK, EFLAGS(%esp)
+ RESTORE_REGS
+ addl $4, %esp
+ jmp hypercall_page + (__HYPERVISOR_iret * 32)
+#endif

# perform work that needs to be done immediately before resumption
ALIGN
@@ -301,7 +357,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -353,7 +409,7 @@ syscall_trace_entry:
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
- sti # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -373,6 +429,7 @@ syscall_badsys:
movl $-ENOSYS,EAX(%esp)
jmp resume_userspace

+#ifndef CONFIG_XEN
#define FIXUP_ESPFIX_STACK \
movl %esp, %eax; \
/* switch to 32bit stack using the pointer on top of 16bit stack */ \
@@ -431,6 +488,9 @@ ENTRY(name) \

/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
+#else
+#define UNWIND_ESPFIX_STACK
+#endif

ENTRY(divide_error)
pushl $0 # no error code
@@ -462,6 +522,126 @@ error_code:
call *%edi
jmp ret_from_exception

+#ifdef CONFIG_XEN
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(hypervisor_callback)
+ pushl %eax
+ SAVE_ALL
+ movl EIP(%esp),%eax
+ cmpl $scrit,%eax
+ jb 11f
+ cmpl $ecrit,%eax
+ jb critical_region_fixup
+11: push %esp
+ call evtchn_do_upcall
+ add $4,%esp
+ jmp ret_from_intr
+
+ ALIGN
+restore_all_enable_events:
+ __ENABLE_INTERRUPTS
+scrit: /**** START OF CRITICAL REGION ****/
+ __TEST_PENDING
+ jnz 14f # process more events if necessary...
+ RESTORE_REGS
+ addl $4, %esp
+1: iret
+.section .fixup,"ax"
+2: pushl $0
+ pushl $do_iret_error
+ jmp error_code
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,2b
+.previous
+14: __DISABLE_INTERRUPTS
+ jmp 11b
+ecrit: /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame.
+critical_region_fixup:
+ addl $critical_fixup_table-scrit,%eax
+ movzbl (%eax),%eax # %eax contains num bytes popped
+ cmpb $0xff,%al # 0xff => vcpu_info critical region
+ jne 15f
+ GET_THREAD_INFO(%ebp)
+ xorl %eax,%eax
+15: mov %esp,%esi
+ add %eax,%esi # %esi points at end of src region
+ mov %esp,%edi
+ add $0x34,%edi # %edi points at end of dst region
+ mov %eax,%ecx
+ shr $2,%ecx # convert byte count to longwords
+ je 17f # skip loop if nothing to copy
+16: subl $4,%esi # pre-decrementing copy loop
+ subl $4,%edi
+ movl (%esi),%eax
+ movl %eax,(%edi)
+ loop 16b
+17: movl %edi,%esp # final %edi is top of merged stack
+ jmp 11b
+
+critical_fixup_table:
+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING
+ .byte 0xff,0xff # jnz 14f
+ .byte 0x00 # pop %ebx
+ .byte 0x04 # pop %ecx
+ .byte 0x08 # pop %edx
+ .byte 0x0c # pop %esi
+ .byte 0x10 # pop %edi
+ .byte 0x14 # pop %ebp
+ .byte 0x18 # pop %eax
+ .byte 0x1c # pop %ds
+ .byte 0x20 # pop %es
+ .byte 0x24,0x24,0x24 # add $4,%esp
+ .byte 0x28 # iret
+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
+ .byte 0x00,0x00 # jmp 11b
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+1: popl %ds
+2: popl %es
+3: popl %fs
+4: popl %gs
+ subl $4,%esp
+ SAVE_ALL
+ jmp ret_from_exception
+.section .fixup,"ax"; \
+6: movl $0,(%esp); \
+ jmp 1b; \
+7: movl $0,(%esp); \
+ jmp 2b; \
+8: movl $0,(%esp); \
+ jmp 3b; \
+9: movl $0,(%esp); \
+ jmp 4b; \
+.previous; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,6b; \
+ .long 2b,7b; \
+ .long 3b,8b; \
+ .long 4b,9b; \
+.previous
+#endif
+
ENTRY(coprocessor_error)
pushl $0
pushl $do_coprocessor_error
@@ -475,17 +655,19 @@ ENTRY(simd_coprocessor_error)
ENTRY(device_not_available)
pushl $-1 # mark this as an int
SAVE_ALL
+#ifndef CONFIG_XEN
movl %cr0, %eax
testl $0x4, %eax # EM (math emulation bit)
- jne device_not_available_emulate
- preempt_stop
- call math_state_restore
- jmp ret_from_exception
-device_not_available_emulate:
+ je device_available_emulate
pushl $0 # temporary storage for ORIG_EIP
call math_emulate
addl $4, %esp
jmp ret_from_exception
+device_available_emulate:
+#endif
+ preempt_stop
+ call math_state_restore
+ jmp ret_from_exception

/*
* Debug traps and NMI can happen at the one SYSENTER instruction
@@ -521,6 +703,8 @@ debug_stack_correct:
call do_debug
jmp ret_from_exception
.previous .text
+
+#ifndef CONFIG_XEN
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
@@ -591,6 +775,16 @@ nmi_16bit_stack:
.align 4
.long 1b,iret_exc
.previous
+#else
+ENTRY(nmi)
+ pushl %eax
+ SAVE_ALL
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ orl $NMI_MASK, EFLAGS(%esp)
+ jmp restore_all
+#endif

KPROBE_ENTRY(int3)
pushl $-1 # mark this as an int

--


2006-03-22 15:03:56

by Andi Kleen

Subject: Re: [RFC PATCH 11/35] Add support for Xen to entry.S.

On Wednesday 22 March 2006 07:30, Chris Wright wrote:

> +
> +/* Offsets into shared_info_t. */
> +#define evtchn_upcall_pending /* 0 */
> +#define evtchn_upcall_mask 1
> +
> +#define sizeof_vcpu_shift 6

These should all be generated in asm-offsets.c
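
Something like the following in asm-offsets.c would do it (just a sketch:
the XEN_* names are invented here, and it assumes the struct vcpu_info
definition from the Xen interface headers):

    #ifdef CONFIG_XEN
            /* inside the existing offsets function in asm-offsets.c */
            BLANK();
            OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
            OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
            /* lets sizeof_vcpu_shift be checked against the real size */
            DEFINE(XEN_sizeof_vcpu_info, sizeof(struct vcpu_info));
    #endif

entry.S can then pick the values up from the generated asm-offsets header
instead of hard-coding 0, 1 and the vcpu_info size.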

> +#else

This needs a comment explaining why it is different

> +restore_nocheck:
> + testl $(VM_MASK|NMI_MASK), EFLAGS(%esp)
> + jnz hypervisor_iret
> + movb EVENT_MASK(%esp), %al
> + notb %al # %al == ~saved_mask
> + GET_VCPU_INFO
> + andb evtchn_upcall_mask(%esi),%al
> + andb $1,%al # %al == mask & ~saved_mask

This would be faster if you used 32-bit operations, because they wouldn't
cause partial register stalls.
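
E.g. roughly (just a sketch of the 32-bit variant; it clobbers %ecx, which
should be harmless here since RESTORE_REGS reloads it anyway):

    movzbl EVENT_MASK(%esp), %eax   # full-register write, no partial stall
    notl %eax                       # %eax == ~saved_mask (only bit 0 matters)
    GET_VCPU_INFO
    movzbl evtchn_upcall_mask(%esi), %ecx
    andl %ecx, %eax
    andl $1, %eax                   # %eax == mask & ~saved_mask
    jnz restore_all_enable_events   # != 0 => reenable event delivery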

> + jnz restore_all_enable_events # != 0 => reenable event delivery
> +#endif
> RESTORE_REGS
> addl $4, %esp
> 1: iret
> .section .fixup,"ax"
> iret_exc:
> - sti
> +#ifndef CONFIG_XEN
> + ENABLE_INTERRUPTS
> +#endif
> pushl $0 # no error code
> pushl $do_iret_error
> jmp error_code
> @@ -269,6 +317,7 @@ iret_exc:
> .long 1b,iret_exc
> .previous
>
> +#ifndef CONFIG_XEN
> ldt_ss:

So are you sure the problem this ugly piece of code tries to work around
isn't present in Xen kernels too? Or do you just not care? If so, add a comment.

> larl OLDSS(%esp), %eax
> jnz restore_nocheck
> @@ -281,7 +330,7 @@ ldt_ss:
> * CPUs, which we can try to work around to make
> * dosemu and wine happy. */
> subl $8, %esp # reserve space for switch16 pointer
> - cli
> + DISABLE_INTERRUPTS
> movl %esp, %eax
> /* Set up the 16bit stack frame with switch32 pointer on top,
> * and a switch16 pointer on top of the current frame. */
> @@ -293,6 +342,13 @@ ldt_ss:
> .align 4
> .long 1b,iret_exc
> .previous
> +#else

Needs comment.

> +hypervisor_iret:
> + andl $~NMI_MASK, EFLAGS(%esp)
> + RESTORE_REGS
> + addl $4, %esp
> + jmp hypercall_page + (__HYPERVISOR_iret * 32)
> +#endif
>

>
> ENTRY(divide_error)
> pushl $0 # no error code
> @@ -462,6 +522,126 @@ error_code:
> call *%edi
> jmp ret_from_exception
>
> +#ifdef CONFIG_XEN

It would be nicer to put that one into a separate file

> +# A note on the "critical region" in our callback handler.




> pushl $do_coprocessor_error
> @@ -475,17 +655,19 @@ ENTRY(simd_coprocessor_error)
> ENTRY(device_not_available)
> pushl $-1 # mark this as an int
> SAVE_ALL
> +#ifndef CONFIG_XEN
> movl %cr0, %eax
> testl $0x4, %eax # EM (math emulation bit)
> - jne device_not_available_emulate
> - preempt_stop
> - call math_state_restore
> - jmp ret_from_exception
> -device_not_available_emulate:
> + je device_available_emulate
> pushl $0 # temporary storage for ORIG_EIP
> call math_emulate
> addl $4, %esp
> jmp ret_from_exception
> +device_available_emulate:
> +#endif
> + preempt_stop
> + call math_state_restore
> + jmp ret_from_exception

It's not quite clear why the Xen special case is needed here. Is it just
because of the CR0 access? If so, better to put that into a macro that just
writes 0 to the register in the Xen case.
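
I.e. something like this (the macro name is invented for illustration; the
assumption is that a ring-1 Xen guest can't read %cr0 directly and never
runs with EM set, which is why the check is skipped at all):

    #ifndef CONFIG_XEN
    #define GET_CR0_INTO_EAX movl %cr0, %eax
    #else
    /* no direct CR0 access from ring 1; assume EM is never set under Xen */
    #define GET_CR0_INTO_EAX xorl %eax, %eax
    #endif

Then device_not_available can test %eax unconditionally and the #ifdef
disappears from the code path itself.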

-Andi

2006-03-22 17:24:58

by Zachary Amsden

Subject: Re: [Xen-devel] Re: [RFC PATCH 11/35] Add support for Xen to entry.S.

Andi Kleen wrote:
>
>> + jnz restore_all_enable_events # != 0 => reenable event delivery
>> +#endif
>> RESTORE_REGS
>> addl $4, %esp
>> 1: iret
>> .section .fixup,"ax"
>> iret_exc:
>> - sti
>> +#ifndef CONFIG_XEN
>> + ENABLE_INTERRUPTS
>> +#endif
>> pushl $0 # no error code
>> pushl $do_iret_error
>> jmp error_code
>> @@ -269,6 +317,7 @@ iret_exc:
>> .long 1b,iret_exc
>> .previous
>>
>> +#ifndef CONFIG_XEN
>> ldt_ss:
>>
>
> So are you sure that problem this ugly piece of code tries to work around
> isn't in Xen kernels too? Or do you just not care? If yes add a comment.
>

This code would otherwise be broken. ENABLE_INTERRUPTS in Xen requires
access to the data segment, and the data segment is not available at
this point. Plus, it corrupts the %esi register. Hint - use %ebp.

The LDT SS code is broken as well: the iret onto a 16-bit stack is a pretty
crippling blow to transparency in this code. On top of that, you don't have
reliable data or even stack segments for calling out to hypervisor assist
code. We never really fixed this code in our implementation either, although
we did consider several approaches. Leaving it out does break userspace
applications.

Zach