2020-05-16 00:12:55

by Thomas Gleixner

Subject: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY


Convert the XEN/PV hypercall to IDTENTRY:

- Emit the ASM stub with DECLARE_IDTENTRY
- Remove the ASM idtentry in 64bit
- Remove the open coded ASM entry code in 32bit
- Remove the old prototypes

The handler stubs need to stay in ASM code as they need corner case handling
and adjustment of the stack pointer.

Provide a new C function which invokes the entry/exit handling and calls
into the XEN handler on the interrupt stack.

The exit code is slightly different from the regular idtentry_exit() on
non-preemptible kernels. If the hypercall is preemptible and need_resched()
is set then XEN provides a preempt hypercall scheduling function. Add it as a
conditional path to __idtentry_exit() so the function can be reused.

__idtentry_exit() is force inlined so on the regular idtentry_exit() path
the extra condition is optimized out by the compiler.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Juergen Gross <[email protected]>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 882ada245bd5..34caf3849632 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,9 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>

+#include <xen/xen-ops.h>
+#include <xen/events.h>
+
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -35,6 +38,7 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
+#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_regs *regs)
}
}

-static __always_inline void __idtentry_exit(struct pt_regs *regs)
+static __always_inline void __idtentry_exit(struct pt_regs *regs,
+ bool preempt_hcall)
{
lockdep_assert_irqs_disabled();

@@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
instrumentation_end();
return;
}
+ } else if (IS_ENABLED(CONFIG_XEN_PV)) {
+ if (preempt_hcall) {
+ /* See CONFIG_PREEMPTION above */
+ instrumentation_begin();
+ rcu_irq_exit_preempt();
+ xen_maybe_preempt_hcall();
+ trace_hardirqs_on();
+ instrumentation_end();
+ return;
+ }
}
/*
* If preemption is disabled then this needs to be done
@@ -612,5 +627,43 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
*/
void noinstr idtentry_exit(struct pt_regs *regs)
{
- __idtentry_exit(regs);
+ __idtentry_exit(regs, false);
+}
+
+#ifdef CONFIG_XEN_PV
+static void __xen_pv_evtchn_do_upcall(void)
+{
+ irq_enter_rcu();
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+
+ idtentry_enter(regs);
+ old_regs = set_irq_regs(regs);
+
+ if (!irq_needs_irq_stack(regs)) {
+ instrumentation_begin();
+ __xen_pv_evtchn_do_upcall();
+ instrumentation_end();
+ } else {
+ run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL);
+ }
+
+ set_irq_regs(old_regs);
+
+ if (IS_ENABLED(CONFIG_PREEMPTION)) {
+ __idtentry_exit(regs, false);
+ } else {
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __idtentry_exit(regs, inhcall && need_resched());
+ }
}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7563a87d7539..6ac890d5c9d8 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1298,7 +1298,10 @@ SYM_CODE_END(native_iret)
#endif

#ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ */
+SYM_FUNC_START(exc_xen_hypervisor_callback)
/*
* Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled
@@ -1315,14 +1318,11 @@ SYM_FUNC_START(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
+
mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+ call xen_pv_evtchn_do_upcall
+ jmp handle_exception_return
+SYM_FUNC_END(exc_xen_hypervisor_callback)

/*
* Hypervisor uses this for application faults while it executes.
@@ -1464,6 +1464,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC edi

+handle_exception_return:
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bdf8391b2f95..3eddf7c6c530 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1067,10 +1067,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt

idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1

-#ifdef CONFIG_XEN_PV
-idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-#endif
-
/*
* Reload gs selector with exception handling
* edi: new selector
@@ -1158,9 +1154,10 @@ SYM_FUNC_END(asm_call_on_stack)
* So, on entry to the handler we detect whether we interrupted an
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
*/
-/* do_hypervisor_callback(struct *pt_regs) */
-SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)

/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -1170,15 +1167,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
movq %rdi, %rsp /* we don't return, adjust the stack frame */
UNWIND_HINT_REGS

- ENTER_IRQ_STACK old_rsp=%r10
- call xen_evtchn_do_upcall
- LEAVE_IRQ_STACK
+ call xen_pv_evtchn_do_upcall

-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-SYM_CODE_END(xen_do_hypervisor_callback)
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)

/*
* Hypervisor uses this for application faults while it executes.
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index e27731092999..fac73bb3577f 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -332,6 +332,13 @@ __visible noinstr void func(struct pt_regs *regs, \
* This avoids duplicate defines and ensures that everything is consistent.
*/

+/*
+ * Dummy trap number so the low level ASM macro vector number checks do not
+ * match which results in emitting plain IDTENTRY stubs without bells and
+ * whistles.
+ */
+#define X86_TRAP_OTHER 0xFFFF
+
/* Simple exception entry points. No hardware error code */
DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error);
DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow);
@@ -373,4 +380,10 @@ DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug);
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
#endif

+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
+#endif
+
+#undef X86_TRAP_OTHER
+
#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1a2d8a50dac4..3566e37241d7 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
+#include <asm/idtentry.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

@@ -993,7 +994,8 @@ static void __init xen_pvmmu_arch_setup(void)
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);

- if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ if (register_callback(CALLBACKTYPE_event,
+ xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();

diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 8fb8a50a28b4..a92259d701c1 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -27,6 +27,7 @@
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
+#include <asm/idtentry.h>
#include <asm/cpu.h>

#include <xen/interface/xen.h>
@@ -347,7 +348,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->event_callback_eip =
- (unsigned long)xen_hypervisor_callback;
+ (unsigned long)xen_asm_exc_xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index bd06ac473170..0f7ff3088065 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -93,7 +93,7 @@ xen_iret_start_crit:

/*
* If there's something pending, mask events again so we can
- * jump back into xen_hypervisor_callback. Otherwise do not
+ * jump back into exc_xen_hypervisor_callback. Otherwise do not
* touch XEN_vcpu_info_mask.
*/
jne 1f
@@ -113,7 +113,7 @@ iret_restore_end:
* Events are masked, so jumping out of the critical region is
* OK.
*/
- je xen_hypervisor_callback
+ je asm_exc_xen_hypervisor_callback

1: iret
xen_iret_end_crit:
@@ -127,7 +127,7 @@ SYM_CODE_END(xen_iret)
.globl xen_iret_start_crit, xen_iret_end_crit

/*
- * This is called by xen_hypervisor_callback in entry_32.S when it sees
+ * This is called by exc_xen_hypervisor_callback in entry_32.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit.
*
@@ -144,7 +144,7 @@ SYM_CODE_END(xen_iret)
* eflags }
* cs } nested exception info
* eip }
- * return address : (into xen_hypervisor_callback)
+ * return address : (into asm_exc_xen_hypervisor_callback)
*
* In order to deliver the nested exception properly, we need to discard the
* nested exception frame such that when we handle the exception, we do it
@@ -152,7 +152,8 @@ SYM_CODE_END(xen_iret)
*
* The only caveat is that if the outer eax hasn't been restored yet (i.e.
* it's still on stack), we need to restore its value here.
- */
+*/
+.pushsection .noinstr.text, "ax"
SYM_CODE_START(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
@@ -181,3 +182,4 @@ SYM_CODE_START(xen_iret_crit_fixup)
2:
ret
SYM_CODE_END(xen_iret_crit_fixup)
+.popsection
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index e46d863bcaa4..19fbbdbcbde9 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -54,7 +54,7 @@ xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
xen_pv_trap entry_INT80_compat
#endif
-xen_pv_trap hypervisor_callback
+xen_pv_trap asm_exc_xen_hypervisor_callback

__INIT
SYM_CODE_START(xen_early_idt_handler_array)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 45a441c33d6d..4eff29ed375e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -8,7 +8,6 @@
#include <xen/xen-ops.h>

/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

void xen_sysenter_target(void);
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index 17240c5325a3..287171a9dc01 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c
@@ -24,7 +24,7 @@
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

-asmlinkage __visible void xen_maybe_preempt_hcall(void)
+void xen_maybe_preempt_hcall(void)
{
if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
&& need_resched())) {
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 095be1d66f31..0ed975cf6f79 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -214,6 +214,7 @@ bool xen_running_on_version_or_later(unsigned int major, unsigned int minor);

void xen_efi_runtime_setup(void);

+DECLARE_PER_CPU(bool, xen_in_preemptible_hcall);

#ifdef CONFIG_PREEMPTION

@@ -225,9 +226,9 @@ static inline void xen_preemptible_hcall_end(void)
{
}

-#else
+static inline void xen_maybe_preempt_hcall(void) { }

-DECLARE_PER_CPU(bool, xen_in_preemptible_hcall);
+#else

static inline void xen_preemptible_hcall_begin(void)
{
@@ -239,6 +240,8 @@ static inline void xen_preemptible_hcall_end(void)
__this_cpu_write(xen_in_preemptible_hcall, false);
}

+void xen_maybe_preempt_hcall(void);
+
#endif /* CONFIG_PREEMPTION */

#endif /* INCLUDE_XEN_OPS_H */


2020-05-19 17:08:20

by Andy Lutomirski

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner <[email protected]> wrote:
>
>
> Convert the XEN/PV hypercall to IDTENTRY:
>
> - Emit the ASM stub with DECLARE_IDTENTRY
> - Remove the ASM idtentry in 64bit
> - Remove the open coded ASM entry code in 32bit
> - Remove the old prototypes
>
> The handler stubs need to stay in ASM code as they need corner case handling
> and adjustment of the stack pointer.
>
> Provide a new C function which invokes the entry/exit handling and calls
> into the XEN handler on the interrupt stack.
>
> The exit code is slightly different from the regular idtentry_exit() on
> non-preemptible kernels. If the hypercall is preemptible and need_resched()
> is set then XEN provides a preempt hypercall scheduling function. Add it as
> conditional path to __idtentry_exit() so the function can be reused.
>
> __idtentry_exit() is forced inlined so on the regular idtentry_exit() path
> the extra condition is optimized out by the compiler.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: Boris Ostrovsky <[email protected]>
> Cc: Juergen Gross <[email protected]>
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 882ada245bd5..34caf3849632 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -27,6 +27,9 @@
> #include <linux/syscalls.h>
> #include <linux/uaccess.h>
>
> +#include <xen/xen-ops.h>
> +#include <xen/events.h>
> +
> #include <asm/desc.h>
> #include <asm/traps.h>
> #include <asm/vdso.h>
> @@ -35,6 +38,7 @@
> #include <asm/nospec-branch.h>
> #include <asm/io_bitmap.h>
> #include <asm/syscall.h>
> +#include <asm/irq_stack.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/syscalls.h>
> @@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_regs *regs)
> }
> }
>
> -static __always_inline void __idtentry_exit(struct pt_regs *regs)
> +static __always_inline void __idtentry_exit(struct pt_regs *regs,
> + bool preempt_hcall)
> {
> lockdep_assert_irqs_disabled();
>
> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
> instrumentation_end();
> return;
> }
> + } else if (IS_ENABLED(CONFIG_XEN_PV)) {
> + if (preempt_hcall) {
> + /* See CONFIG_PREEMPTION above */
> + instrumentation_begin();
> + rcu_irq_exit_preempt();
> + xen_maybe_preempt_hcall();
> + trace_hardirqs_on();
> + instrumentation_end();
> + return;
> + }

Ewwwww! This shouldn't be taken as a NAK -- it's just an expression of disgust.

> }
> /*
> * If preemption is disabled then this needs to be done
> @@ -612,5 +627,43 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
> */
> void noinstr idtentry_exit(struct pt_regs *regs)
> {
> - __idtentry_exit(regs);
> + __idtentry_exit(regs, false);
> +}
> +
> +#ifdef CONFIG_XEN_PV
> +static void __xen_pv_evtchn_do_upcall(void)
> +{
> + irq_enter_rcu();
> + inc_irq_stat(irq_hv_callback_count);
> +
> + xen_hvm_evtchn_do_upcall();
> +
> + irq_exit_rcu();
> +}
> +
> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs;
> +
> + idtentry_enter(regs);
> + old_regs = set_irq_regs(regs);
> +
> + if (!irq_needs_irq_stack(regs)) {
> + instrumentation_begin();
> + __xen_pv_evtchn_do_upcall();
> + instrumentation_end();
> + } else {
> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL);
> + }

Shouldn't this be:

instrumentation_begin();
if (!irq_needs_irq_stack(...))
	__blah();
else
	run_on_irqstack(__blah, NULL);
instrumentation_end();

or even:

instrumentation_begin();
run_on_irqstack_if_needed(__blah, NULL);
instrumentation_end();
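
Spelled out, such a helper could look roughly like this (the name is just the
suggestion above, and it assumes the regs pointer gets passed down so
irq_needs_irq_stack() from your patch can be reused):

static __always_inline void
run_on_irqstack_if_needed(void (*func)(void), void *arg, struct pt_regs *regs)
{
	/* Only switch to the hard irq stack when the interrupted
	 * context actually requires it.
	 */
	if (irq_needs_irq_stack(regs))
		run_on_irqstack(func, arg);
	else
		func();
}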


****** BUT *******

I think this is all arse-backwards. This is a giant mess designed to
pretend we support preemption and to emulate normal preemption in a
non-preemptible kernel. I propose one to two massive cleanups:

A: Just delete all of this code. Preemptible hypercalls on
non-preempt kernels will still process interrupts but won't get
preempted. If you want preemption, compile with preemption.

B: Turn this thing around. Specifically, in the one and only case we
care about, we know pretty much exactly what context we got this entry
in: we're running in a schedulable context doing an explicitly
preemptible hypercall, and we have RIP pointing at a SYSCALL
instruction (presumably, but we shouldn't bet on it) in the hypercall
page. Ideally we would change the Xen PV ABI so the hypercall would
return something like EAGAIN instead of auto-restarting and we could
ditch this mess entirely. But the ABI seems to be set in stone or at
least in molasses, so how about just:

idt_entry(exit(regs));
if (inhcall && need_resched())
	schedule();

Off the top of my head, I don't see any reason this wouldn't work, and
it's a heck of a lot cleaner. Possibly it should really be:

if (inhcall) {
	if (!WARN_ON(regs->ip not in hypercall page))
		cond_resched();
}
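
Condensed into the upcall handler, that shape would be something like this
(a sketch only; handle_the_upcall() is a placeholder for the actual upcall
work, and whether the exit path leaves things in a state where schedule() is
legal here would need checking):

	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	idtentry_enter(regs);
	handle_the_upcall(regs);
	idtentry_exit(regs);

	if (inhcall && need_resched())
		schedule();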

2020-05-19 19:01:05

by Thomas Gleixner

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

Andy Lutomirski <[email protected]> writes:
> On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner <[email protected]> wrote:
>> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
>> instrumentation_end();
>> return;
>> }
>> + } else if (IS_ENABLED(CONFIG_XEN_PV)) {
>> + if (preempt_hcall) {
>> + /* See CONFIG_PREEMPTION above */
>> + instrumentation_begin();
>> + rcu_irq_exit_preempt();
>> + xen_maybe_preempt_hcall();
>> + trace_hardirqs_on();
>> + instrumentation_end();
>> + return;
>> + }
>
> Ewwwww! This shouldn't be taken as a NAK -- it's just an expression
> of disgust.

I'm really not proud of it, but that was the least horrible thing I
could come up with.

> Shouldn't this be:
>
> instrumentation_begin();
> if (!irq_needs_irq_stack(...))
> __blah();
> else
> run_on_irqstack(__blah, NULL);
> instrumentation_end();
>
> or even:
>
> instrumentation_begin();
> run_on_irqstack_if_needed(__blah, NULL);
> instrumentation_end();

Yeah. In that case the instrumentation markers are not required as they
will be inside the run....() function.

> ****** BUT *******
>
> I think this is all arse-backwards. This is a giant mess designed to
> pretend we support preemption and to emulate normal preemption in a
> non-preemptible kernel. I propose one to two massive cleanups:
>
> A: Just delete all of this code. Preemptible hypercalls on
> non-preempt kernels will still process interrupts but won't get
> preempted. If you want preemption, compile with preemption.

I'm happy to do so, but the XEN folks might have opinions on that :)

> B: Turn this thing around. Specifically, in the one and only case we
> care about, we know pretty much exactly what context we got this entry
> in: we're running in a schedulable context doing an explicitly
> preemptible hypercall, and we have RIP pointing at a SYSCALL
> instruction (presumably, but we shouldn't bet on it) in the hypercall
> page. Ideally we would change the Xen PV ABI so the hypercall would
> return something like EAGAIN instead of auto-restarting and we could
> ditch this mess entirely. But the ABI seems to be set in stone or at
> least in molasses, so how about just:
>
> idt_entry(exit(regs));
> if (inhcall && need_resched())
> schedule();

Which brings you into the situation that you call schedule() from the
point where we just moved it out. If we would go there we'd need to
ensure that RCU is watching as well. idtentry_exit() might have it
turned off ....

That's why I did it this way to keep the code flow exactly the same for
all these exit variants.

Thanks,

tglx

2020-05-19 19:47:25

by Andy Lutomirski

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
> > On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner <[email protected]> wrote:
> >> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
> >> instrumentation_end();
> >> return;
> >> }
> >> + } else if (IS_ENABLED(CONFIG_XEN_PV)) {
> >> + if (preempt_hcall) {
> >> + /* See CONFIG_PREEMPTION above */
> >> + instrumentation_begin();
> >> + rcu_irq_exit_preempt();
> >> + xen_maybe_preempt_hcall();
> >> + trace_hardirqs_on();
> >> + instrumentation_end();
> >> + return;
> >> + }
> >
> > Ewwwww! This shouldn't be taken as a NAK -- it's just an expression
> > of disgust.
>
> I'm really not proud of it, but that was the least horrible thing I
> could come up with.
>
> > Shouldn't this be:
> >
> > instrumentation_begin();
> > if (!irq_needs_irq_stack(...))
> > __blah();
> > else
> > run_on_irqstack(__blah, NULL);
> > instrumentation_end();
> >
> > or even:
> >
> > instrumentation_begin();
> > run_on_irqstack_if_needed(__blah, NULL);
> > instrumentation_end();
>
> Yeah. In that case the instrumentation markers are not required as they
> will be inside the run....() function.
>
> > ****** BUT *******
> >
> > I think this is all arse-backwards. This is a giant mess designed to
> > pretend we support preemption and to emulate normal preemption in a
> > non-preemptible kernel. I propose one to two massive cleanups:
> >
> > A: Just delete all of this code. Preemptible hypercalls on
> > non-preempt kernels will still process interrupts but won't get
> > preempted. If you want preemption, compile with preemption.
>
> I'm happy to do so, but the XEN folks might have opinions on that :)
>
> > B: Turn this thing around. Specifically, in the one and only case we
> > care about, we know pretty much exactly what context we got this entry
> > in: we're running in a schedulable context doing an explicitly
> > preemptible hypercall, and we have RIP pointing at a SYSCALL
> > instruction (presumably, but we shouldn't bet on it) in the hypercall
> > page. Ideally we would change the Xen PV ABI so the hypercall would
> > return something like EAGAIN instead of auto-restarting and we could
> > ditch this mess entirely. But the ABI seems to be set in stone or at
> > least in molasses, so how about just:
> >
> > idt_entry(exit(regs));
> > if (inhcall && need_resched())
> > schedule();
>
> Which brings you into the situation that you call schedule() from the
> point where we just moved it out. If we would go there we'd need to
> ensure that RCU is watching as well. idtentry_exit() might have it
> turned off ....

I don't think this is possible. Once you untangle all the wrappers,
the call sites are effectively:

__this_cpu_write(xen_in_preemptible_hcall, true);
CALL_NOSPEC to the hypercall page
__this_cpu_write(xen_in_preemptible_hcall, false);

I think IF=1 when this happens, but I won't swear to it. RCU had
better be watching.
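
Those two writes are just the xen_preemptible_hcall_begin()/end() wrappers,
i.e. the callers (the privcmd ioctl path) look roughly like this sketch, with
the hypercall itself as a placeholder:

	xen_preemptible_hcall_begin();
	ret = do_the_long_running_hypercall();
	xen_preemptible_hcall_end();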

As I understand it, the one and only situation Xen wants to handle is
that an interrupt gets delivered during the hypercall. The hypervisor
is too clever for its own good and deals with this by rewinding RIP to
the beginning of whatever instruction did the hypercall and delivers
the interrupt, and we end up in this handler. So, if this happens,
the idea is to not only handle the interrupt but to schedule if
scheduling would be useful.

So I don't think we need all this RCU magic. This really ought to be
able to be simplified to:

idtentry_exit();

if (appropriate condition)
	schedule();

Obviously we don't want to schedule if this is a nested entry, but we
should be able to rule that out by checking that regs->flags &
X86_EFLAGS_IF and by handling the percpu variable a little more
intelligently. So maybe the right approach is:

bool in_preemptible_hcall = __this_cpu_read(xen_in_preemptible_hcall);
__this_cpu_write(xen_in_preemptible_hcall, false);
idtentry_enter(...);

do the actual work;

idtentry_exit(...);

if (in_preemptible_hcall) {
	assert regs->flags & X86_EFLAGS_IF;
	assert that RCU is watching;
	assert that we're on the thread stack;
	assert whatever else we feel like asserting;
	if (need_resched())
		schedule();
}

__this_cpu_write(xen_in_preemptible_hcall, in_preemptible_hcall);

And now we don't have a special idtentry_exit() case just for Xen, and
all the mess is entirely contained in the Xen PV code. And we need to
mark all the preemptible hypercalls noinstr. Does this seem
reasonable?

That being said, right now, with or without your patch, I think we're
toast if the preemptible hypercall code gets traced. So maybe the
right thing is to just drop all the magic preemption stuff from your
patch and let the Xen maintainers submit something new (maybe like
what I suggest above) if they want magic preemption back.

2020-05-20 08:10:46

by Juergen Gross

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On 19.05.20 21:44, Andy Lutomirski wrote:
> On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]> wrote:
>>
>> Andy Lutomirski <[email protected]> writes:
>>> On Fri, May 15, 2020 at 5:10 PM Thomas Gleixner <[email protected]> wrote:
>>>> @@ -573,6 +578,16 @@ static __always_inline void __idtentry_exit(struct pt_regs *regs)
>>>> instrumentation_end();
>>>> return;
>>>> }
>>>> + } else if (IS_ENABLED(CONFIG_XEN_PV)) {
>>>> + if (preempt_hcall) {
>>>> + /* See CONFIG_PREEMPTION above */
>>>> + instrumentation_begin();
>>>> + rcu_irq_exit_preempt();
>>>> + xen_maybe_preempt_hcall();
>>>> + trace_hardirqs_on();
>>>> + instrumentation_end();
>>>> + return;
>>>> + }
>>>
>>> Ewwwww! This shouldn't be taken as a NAK -- it's just an expression
>>> of disgust.
>>
>> I'm really not proud of it, but that was the least horrible thing I
>> could come up with.
>>
>>> Shouldn't this be:
>>>
>>> instrumentation_begin();
>>> if (!irq_needs_irq_stack(...))
>>> __blah();
>>> else
>>> run_on_irqstack(__blah, NULL);
>>> instrumentation_end();
>>>
>>> or even:
>>>
>>> instrumentation_begin();
>>> run_on_irqstack_if_needed(__blah, NULL);
>>> instrumentation_end();
>>
>> Yeah. In that case the instrumentation markers are not required as they
>> will be inside the run....() function.
>>
>>> ****** BUT *******
>>>
>>> I think this is all arse-backwards. This is a giant mess designed to
>>> pretend we support preemption and to emulate normal preemption in a
>>> non-preemptible kernel. I propose one to two massive cleanups:
>>>
>>> A: Just delete all of this code. Preemptible hypercalls on
>>> non-preempt kernels will still process interrupts but won't get
>>> preempted. If you want preemption, compile with preemption.
>>
>> I'm happy to do so, but the XEN folks might have opinions on that :)

Indeed. :-)

>>
>>> B: Turn this thing around. Specifically, in the one and only case we
>>> care about, we know pretty much exactly what context we got this entry
>>> in: we're running in a schedulable context doing an explicitly
>>> preemptible hypercall, and we have RIP pointing at a SYSCALL
>>> instruction (presumably, but we shouldn't bet on it) in the hypercall
>>> page. Ideally we would change the Xen PV ABI so the hypercall would
>>> return something like EAGAIN instead of auto-restarting and we could
>>> ditch this mess entirely. But the ABI seems to be set in stone or at
>>> least in molasses, so how about just:
>>>
>>> idt_entry(exit(regs));
>>> if (inhcall && need_resched())
>>> schedule();
>>
>> Which brings you into the situation that you call schedule() from the
>> point where we just moved it out. If we would go there we'd need to
>> ensure that RCU is watching as well. idtentry_exit() might have it
>> turned off ....
>
> I don't think this is possible. Once you untangle all the wrappers,
> the call sites are effectively:
>
> __this_cpu_write(xen_in_preemptible_hcall, true);
> CALL_NOSPEC to the hypercall page
> __this_cpu_write(xen_in_preemptible_hcall, false);
>
> I think IF=1 when this happens, but I won't swear to it. RCU had
> better be watching.

Preemptible hypercalls are never done with interrupts off. To be more
precise: they are only ever done during ioctl() processing.

I can add an ASSERT() to xen_preemptible_hcall_begin() if you want.
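
Something along these lines, for example (a sketch; whether a lockdep
assertion or a plain WARN_ON() is preferred is open):

static inline void xen_preemptible_hcall_begin(void)
{
	lockdep_assert_irqs_enabled();
	__this_cpu_write(xen_in_preemptible_hcall, true);
}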

>
> As I understand it, the one and only situation Xen wants to handle is
> that an interrupt gets delivered during the hypercall. The hypervisor
> is too clever for its own good and deals with this by rewinding RIP to
> the beginning of whatever instruction did the hypercall and delivers
> the interrupt, and we end up in this handler. So, if this happens,
> the idea is to not only handle the interrupt but to schedule if
> scheduling would be useful.

Correct. More precisely: the hypercalls in question can last very long
(up to several seconds) and so they need to be interruptible. As said
before: the interface how this is done is horrible. :-(

>
> So I don't think we need all this RCU magic. This really ought to be
> able to be simplified to:
>
> idtentry_exit();
>
> if (appropriate condition)
> schedule();
>
> Obviously we don't want to schedule if this is a nested entry, but we
> should be able to rule that out by checking that regs->flags &
> X86_EFLAGS_IF and by handling the percpu variable a little more
> intelligently. So maybe the right approach is:
>
> bool in_preemptible_hcall = __this_cpu_read(xen_in_preemptible_hcall);
> __this_cpu_write(xen_in_preemptible_hcall, false);
> idtentry_enter(...);
>
> do the actual work;
>
> idtentry_exit(...);
>
> if (in_preemptible_hcall) {
> assert regs->flags & X86_EFLAGS_IF;
> assert that RCU is watching;
> assert that we're on the thread stack;
> assert whatever else we feel like asserting;
> if (need_resched())
> schedule();
> }
>
> __this_cpu_write(xen_in_preemptible_hcall, in_preemptible_hcall);
>
> And now we don't have a special idtentry_exit() case just for Xen, and
> all the mess is entirely contained in the Xen PV code. And we need to
> mark all the preemptible hypercalls noinstr. Does this seem
> reasonable?

From my point of view this sounds fine.

>
> That being said, right now, with or without your patch, I think we're
> toast if the preemptible hypercall code gets traced. So maybe the
> right thing is to just drop all the magic preemption stuff from your
> patch and let the Xen maintainers submit something new (maybe like
> what I suggest above) if they want magic preemption back.
>

I'd prefer to not break preemptible hypercalls in the meantime.

IMO the patch should be modified along your suggestion. I'd be happy to
test it.


Juergen

2020-05-20 11:42:50

by Andrew Cooper

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On 20/05/2020 09:06, Jürgen Groß wrote:
> On 19.05.20 21:44, Andy Lutomirski wrote:
>> On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]>
>> wrote:
>>>
>>> Andy Lutomirski <[email protected]> writes:
>>>> B: Turn this thing around.  Specifically, in the one and only case we
>>>> care about, we know pretty much exactly what context we got this entry
>>>> in: we're running in a schedulable context doing an explicitly
>>>> preemptible hypercall, and we have RIP pointing at a SYSCALL
>>>> instruction (presumably, but we shouldn't bet on it) in the hypercall
>>>> page.  Ideally we would change the Xen PV ABI so the hypercall would
>>>> return something like EAGAIN instead of auto-restarting and we could
>>>> ditch this mess entirely.  But the ABI seems to be set in stone or at
>>>> least in molasses, so how about just:
>>>>
>>>> idt_entry(exit(regs));
>>>> if (inhcall && need_resched())
>>>>    schedule();
>>>
>>> Which brings you into the situation that you call schedule() from the
>>> point where we just moved it out. If we would go there we'd need to
>>> ensure that RCU is watching as well. idtentry_exit() might have it
>>> turned off ....
>>
>> I don't think this is possible.  Once you untangle all the wrappers,
>> the call sites are effectively:
>>
>> __this_cpu_write(xen_in_preemptible_hcall, true);
>> CALL_NOSPEC to the hypercall page
>> __this_cpu_write(xen_in_preemptible_hcall, false);
>>
>> I think IF=1 when this happens, but I won't swear to it.  RCU had
>> better be watching.
>
> Preemptible hypercalls are never done with interrupts off. To be more
> precise: they are only ever done during ioctl() processing.
>
> I can add an ASSERT() to xen_preemptible_hcall_begin() if you want.
>
>>
>> As I understand it, the one and only situation Xen wants to handle is
>> that an interrupt gets delivered during the hypercall.  The hypervisor
>> is too clever for its own good and deals with this by rewinding RIP to
>> the beginning of whatever instruction did the hypercall and delivers
>> the interrupt, and we end up in this handler.  So, if this happens,
>> the idea is to not only handle the interrupt but to schedule if
>> scheduling would be useful.
>
> Correct. More precise: the hypercalls in question can last very long
> (up to several seconds) and so they need to be interruptible. As said
> before: the interface how this is done is horrible. :-(

Forget seconds.  DOMCTL_domain_kill gets to ~14 minutes for a 2TB domain.

The reason for the existing logic is to be able voluntarily preempt.

It doesn't need to remain the way it is, but some adequate form of
pre-emption does need to stay.

~Andrew

2020-05-20 14:18:06

by Thomas Gleixner

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

Andy Lutomirski <[email protected]> writes:
> On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]> wrote:
>> Which brings you into the situation that you call schedule() from the
>> point where we just moved it out. If we would go there we'd need to
>> ensure that RCU is watching as well. idtentry_exit() might have it
>> turned off ....
>
> I don't think this is possible. Once you untangle all the wrappers,
> the call sites are effectively:
>
> __this_cpu_write(xen_in_preemptible_hcall, true);
> CALL_NOSPEC to the hypercall page
> __this_cpu_write(xen_in_preemptible_hcall, false);
>
> I think IF=1 when this happens, but I won't swear to it. RCU had
> better be watching.
>
> As I understand it, the one and only situation Xen wants to handle is
> that an interrupt gets delivered during the hypercall. The hypervisor
> is too clever for its own good and deals with this by rewinding RIP to
> the beginning of whatever instruction did the hypercall and delivers
> the interrupt, and we end up in this handler. So, if this happens,
> the idea is to not only handle the interrupt but to schedule if
> scheduling would be useful.
>
> So I don't think we need all this RCU magic. This really ought to be
> able to be simplified to:
>
> idtentry_exit();
>
> if (appropriate condition)
> schedule();

This is exactly the kind of tinkering which causes all kinds of trouble.

idtentry_exit()

if (user_mode(regs)) {
	prepare_exit_to_usermode(regs);
} else if (regs->flags & X86_EFLAGS_IF) {
	/* Check kernel preemption, if enabled */
	if (IS_ENABLED(CONFIG_PREEMPTION)) {
		....
	}
	instrumentation_begin();
	/* Tell the tracer that IRET will enable interrupts */
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();
	rcu_irq_exit();
	lockdep_hardirqs_on(CALLER_ADDR0);
} else {
	/* IRQ flags state is correct already. Just tell RCU */
	rcu_irq_exit();
}

So in case IF is set then this already told the tracer and lockdep that
interrupts are enabled. And contrary to the ugly version this exit path
does not use rcu_irq_exit_preempt() which is there to warn about crappy
RCU state when trying to schedule.

So we went to great lengths to sanitize _all_ of this and make it consistent,
just to say: screw it for that xen thingy.

The extra checks and extra warnings for scheduling come with the
guarantee to bitrot when idtentry_exit() or any logic invoked from there
is changed. It's going to look like this:

/*
 * If the below causes problems due to inconsistent state
 * or out of sync sanity checks, please complain to
 * [email protected] directly.
 */
idtentry_exit();

if (user_mode(regs) || !(regs->flags & X86_EFLAGS_IF))
	return;

if (!__this_cpu_read(xen_in_preemptible_hcall))
	return;

rcu_sanity_check_for_preemption();

if (need_resched()) {
	instrumentation_begin();
	xen_maybe_preempt_hcall();
	trace_hardirqs_on();
	instrumentation_end();
}

Of course you need the extra rcu_sanity_check_for_preemption() function
just for this muck.

That's a true win on all ends? I don't think so.

Thanks,

tglx

2020-05-20 15:19:11

by Andy Lutomirski

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On Wed, May 20, 2020 at 7:13 AM Thomas Gleixner <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
> > On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]> wrote:
> >> Which brings you into the situation that you call schedule() from the
> >> point where we just moved it out. If we would go there we'd need to
> >> ensure that RCU is watching as well. idtentry_exit() might have it
> >> turned off ....
> >
> > I don't think this is possible. Once you untangle all the wrappers,
> > the call sites are effectively:
> >
> > __this_cpu_write(xen_in_preemptible_hcall, true);
> > CALL_NOSPEC to the hypercall page
> > __this_cpu_write(xen_in_preemptible_hcall, false);
> >
> > I think IF=1 when this happens, but I won't swear to it. RCU had
> > better be watching.
> >
> > As I understand it, the one and only situation Xen wants to handle is
> > that an interrupt gets delivered during the hypercall. The hypervisor
> > is too clever for its own good and deals with this by rewinding RIP to
> > the beginning of whatever instruction did the hypercall and delivers
> > the interrupt, and we end up in this handler. So, if this happens,
> > the idea is to not only handle the interrupt but to schedule if
> > scheduling would be useful.
> >
> > So I don't think we need all this RCU magic. This really ought to be
> > able to be simplified to:
> >
> > idtentry_exit();
> >
> > if (appropriate condition)
> > schedule();
>
> This is exactly the kind of tinkering which causes all kinds of trouble.
>
> idtentry_exit()
>
> if (user_mode(regs)) {
> prepare_exit_to_usermode(regs);
> } else if (regs->flags & X86_EFLAGS_IF) {
> /* Check kernel preemption, if enabled */
> if (IS_ENABLED(CONFIG_PREEMPTION)) {
> ....
> }
> instrumentation_begin();
> /* Tell the tracer that IRET will enable interrupts */
> trace_hardirqs_on_prepare();
> lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> instrumentation_end();
> rcu_irq_exit();
> lockdep_hardirqs_on(CALLER_ADDR0);
> } else {
> /* IRQ flags state is correct already. Just tell RCU */
> rcu_irq_exit();
> }
>
> So in case IF is set then this already told the tracer and lockdep that
> interrupts are enabled. And contrary to the ugly version this exit path
> does not use rcu_irq_exit_preempt() which is there to warn about crappy
> RCU state when trying to schedule.
>
> So we went great length to sanitize _all_ of this and make it consistent
> just to say: screw it for that xen thingy.
>
> The extra checks and extra warnings for scheduling come with the
> guarantee to bitrot when idtentry_exit() or any logic invoked from there
> is changed. It's going to look like this:
>
> /*
> * If the below causes problems due to inconsistent state
> * or out of sync sanity checks, please complain to
> * [email protected] directly.
> */
> idtentry_exit();
>
> if (user_mode(regs) || !(regs->flags & X86_FlAGS_IF))
> return;
>
> if (!__this_cpu_read(xen_in_preemptible_hcall))
> return;
>
> rcu_sanity_check_for_preemption();
>
> if (need_resched()) {
> instrumentation_begin();
> xen_maybe_preempt_hcall();
> trace_hardirqs_on();
> instrumentation_end();
> }
>
> Of course you need the extra rcu_sanity_check_for_preemption() function
> just for this muck.
>
> That's a true win on all ends? I don't think so.

Hmm, fair enough. I guess the IRQ tracing messes a bunch of this logic up.

Let's keep your patch as is and consider cleanups later. One approach
might be to make this work more like extable handling: instead of
trying to schedule from inside the interrupt handler here, patch up
RIP and perhaps some other registers and let the actual Xen code just
do cond_resched(). IOW, try to make this work the way it always
should have:

int ret;
do {
	ret = issue_the_hypercall();
	cond_resched();
} while (ret == EAGAIN);

2020-05-20 17:25:49

by Andy Lutomirski

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On Wed, May 20, 2020 at 8:16 AM Andy Lutomirski <[email protected]> wrote:
>
> On Wed, May 20, 2020 at 7:13 AM Thomas Gleixner <[email protected]> wrote:
> >
> > Andy Lutomirski <[email protected]> writes:
> > > On Tue, May 19, 2020 at 11:58 AM Thomas Gleixner <[email protected]> wrote:
> > >> Which brings you into the situation that you call schedule() from the
> > >> point where we just moved it out. If we would go there we'd need to
> > >> ensure that RCU is watching as well. idtentry_exit() might have it
> > >> turned off ....
> > >
> > > I don't think this is possible. Once you untangle all the wrappers,
> > > the call sites are effectively:
> > >
> > > __this_cpu_write(xen_in_preemptible_hcall, true);
> > > CALL_NOSPEC to the hypercall page
> > > __this_cpu_write(xen_in_preemptible_hcall, false);
> > >
> > > I think IF=1 when this happens, but I won't swear to it. RCU had
> > > better be watching.
> > >
> > > As I understand it, the one and only situation Xen wants to handle is
> > > that an interrupt gets delivered during the hypercall. The hypervisor
> > > is too clever for its own good and deals with this by rewinding RIP to
> > > the beginning of whatever instruction did the hypercall and delivers
> > > the interrupt, and we end up in this handler. So, if this happens,
> > > the idea is to not only handle the interrupt but to schedule if
> > > scheduling would be useful.
> > >
> > > So I don't think we need all this RCU magic. This really ought to be
> > > able to be simplified to:
> > >
> > > idtentry_exit();
> > >
> > > if (appropriate condition)
> > > schedule();
> >
> > This is exactly the kind of tinkering which causes all kinds of trouble.
> >
> > idtentry_exit()
> >
> > if (user_mode(regs)) {
> > prepare_exit_to_usermode(regs);
> > } else if (regs->flags & X86_EFLAGS_IF) {
> > /* Check kernel preemption, if enabled */
> > if (IS_ENABLED(CONFIG_PREEMPTION)) {
> > ....
> > }
> > instrumentation_begin();
> > /* Tell the tracer that IRET will enable interrupts */
> > trace_hardirqs_on_prepare();
> > lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> > instrumentation_end();
> > rcu_irq_exit();
> > lockdep_hardirqs_on(CALLER_ADDR0);
> > } else {
> > /* IRQ flags state is correct already. Just tell RCU */
> > rcu_irq_exit();
> > }
> >
> > So in case IF is set then this already told the tracer and lockdep that
> > interrupts are enabled. And contrary to the ugly version this exit path
> > does not use rcu_irq_exit_preempt() which is there to warn about crappy
> > RCU state when trying to schedule.
> >
> > So we went great length to sanitize _all_ of this and make it consistent
> > just to say: screw it for that xen thingy.
> >
> > The extra checks and extra warnings for scheduling come with the
> > guarantee to bitrot when idtentry_exit() or any logic invoked from there
> > is changed. It's going to look like this:
> >
> > /*
> > * If the below causes problems due to inconsistent state
> > * or out of sync sanity checks, please complain to
> > * [email protected] directly.
> > */
> > idtentry_exit();
> >
> > if (user_mode(regs) || !(regs->flags & X86_FlAGS_IF))
> > return;
> >
> > if (!__this_cpu_read(xen_in_preemptible_hcall))
> > return;
> >
> > rcu_sanity_check_for_preemption();
> >
> > if (need_resched()) {
> > instrumentation_begin();
> > xen_maybe_preempt_hcall();
> > trace_hardirqs_on();
> > instrumentation_end();
> > }
> >
> > Of course you need the extra rcu_sanity_check_for_preemption() function
> > just for this muck.
> >
> > That's a true win on all ends? I don't think so.
>
> Hmm, fair enough. I guess the IRQ tracing messes a bunch of this logic up.
>
> Let's keep your patch as is and consider cleanups later. One approach
> might be to make this work more like extable handling: instead of
> trying to schedule from inside the interrupt handler here, patch up
> RIP and perhaps some other registers and let the actual Xen code just
> do cond_resched(). IOW, try to make this work the way it always
> should have:
>
> int ret;
> do {
> ret = issue_the_hypercall();
> cond_resched();
> } while (ret == EAGAIN);

Andrew Cooper pointed out that there is too much magic in Xen for this
to work. So never mind.

2020-05-20 19:19:58

by Thomas Gleixner

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

Andy Lutomirski <[email protected]> writes:
> Andrew Cooper pointed out that there is too much magic in Xen for this
> to work. So never mind.

:)

But you made me stare more at that stuff and I came up with a way
simpler solution. See below.

Thanks,

tglx

8<--------------

arch/x86/entry/common.c | 75 ++++++++++++++++++++++++++++++++++++++--
arch/x86/entry/entry_32.S | 17 ++++-----
arch/x86/entry/entry_64.S | 22 +++--------
arch/x86/include/asm/idtentry.h | 13 ++++++
arch/x86/xen/setup.c | 4 +-
arch/x86/xen/smp_pv.c | 3 +
arch/x86/xen/xen-asm_32.S | 12 +++---
arch/x86/xen/xen-asm_64.S | 2 -
arch/x86/xen/xen-ops.h | 1
drivers/xen/Makefile | 2 -
drivers/xen/preempt.c | 42 ----------------------
11 files changed, 115 insertions(+), 78 deletions(-)

--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,9 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>

+#include <xen/xen-ops.h>
+#include <xen/events.h>
+
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -35,6 +38,7 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
+#include <asm/irq_stack.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -539,7 +543,8 @@ void noinstr idtentry_enter(struct pt_re
}
}

-static __always_inline void __idtentry_exit(struct pt_regs *regs)
+static __always_inline void __idtentry_exit(struct pt_regs *regs,
+ bool may_sched)
{
lockdep_assert_irqs_disabled();

@@ -548,7 +553,7 @@ static __always_inline void __idtentry_e
prepare_exit_to_usermode(regs);
} else if (regs->flags & X86_EFLAGS_IF) {
/* Check kernel preemption, if enabled */
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
+ if (IS_ENABLED(CONFIG_PREEMPTION) || may_sched) {
/*
* This needs to be done very carefully.
* idtentry_enter() invoked rcu_irq_enter(). This
@@ -612,5 +617,69 @@ static __always_inline void __idtentry_e
*/
void noinstr idtentry_exit(struct pt_regs *regs)
{
- __idtentry_exit(regs);
+ __idtentry_exit(regs, false);
+}
+
+#ifdef CONFIG_XEN_PV
+
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+ irq_enter_rcu();
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ bool inhcall;
+
+ idtentry_enter(regs);
+ old_regs = set_irq_regs(regs);
+
+ run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
+
+ set_irq_regs(old_regs);
+
+ inhcall = get_and_clear_inhcall();
+ __idtentry_exit(regs, inhcall);
+ restore_inhcall(inhcall);
}
+#endif /* CONFIG_XEN_PV */
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1298,7 +1298,10 @@ SYM_CODE_END(native_iret)
#endif

#ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ */
+SYM_FUNC_START(exc_xen_hypervisor_callback)
/*
* Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled
@@ -1315,14 +1318,11 @@ SYM_FUNC_START(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
+
mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+ call xen_pv_evtchn_do_upcall
+ jmp handle_exception_return
+SYM_FUNC_END(exc_xen_hypervisor_callback)

/*
* Hypervisor uses this for application faults while it executes.
@@ -1464,6 +1464,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exce
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC edi

+handle_exception_return:
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1067,10 +1067,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work

idtentry X86_TRAP_PF page_fault do_page_fault has_error_code=1

-#ifdef CONFIG_XEN_PV
-idtentry 512 /* dummy */ hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-#endif
-
/*
* Reload gs selector with exception handling
* edi: new selector
@@ -1158,9 +1154,10 @@ SYM_FUNC_END(asm_call_on_stack)
* So, on entry to the handler we detect whether we interrupted an
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
*/
-/* do_hypervisor_callback(struct *pt_regs) */
-SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)

/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -1170,15 +1167,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_c
movq %rdi, %rsp /* we don't return, adjust the stack frame */
UNWIND_HINT_REGS

- ENTER_IRQ_STACK old_rsp=%r10
- call xen_evtchn_do_upcall
- LEAVE_IRQ_STACK
-
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-SYM_CODE_END(xen_do_hypervisor_callback)
+ call xen_pv_evtchn_do_upcall
+
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)

/*
* Hypervisor uses this for application faults while it executes.
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -332,6 +332,13 @@ static __always_inline void __##func(str
* This avoids duplicate defines and ensures that everything is consistent.
*/

+/*
+ * Dummy trap number so the low level ASM macro vector number checks do not
+ * match which results in emitting plain IDTENTRY stubs without bells and
+ * whistles.
+ */
+#define X86_TRAP_OTHER 0xFFFF
+
/* Simple exception entry points. No hardware error code */
DECLARE_IDTENTRY(X86_TRAP_DE, exc_divide_error);
DECLARE_IDTENTRY(X86_TRAP_OF, exc_overflow);
@@ -371,4 +378,10 @@ DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug)
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);

+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
+#endif
+
+#undef X86_TRAP_OTHER
+
#endif
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -20,6 +20,7 @@
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
+#include <asm/idtentry.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

@@ -993,7 +994,8 @@ static void __init xen_pvmmu_arch_setup(
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);

- if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ if (register_callback(CALLBACKTYPE_event,
+ xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();

--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -27,6 +27,7 @@
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
+#include <asm/idtentry.h>
#include <asm/cpu.h>

#include <xen/interface/xen.h>
@@ -347,7 +348,7 @@ cpu_initialize_context(unsigned int cpu,
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->event_callback_eip =
- (unsigned long)xen_hypervisor_callback;
+ (unsigned long)xen_asm_exc_xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -93,7 +93,7 @@ SYM_CODE_START(xen_iret)

/*
* If there's something pending, mask events again so we can
- * jump back into xen_hypervisor_callback. Otherwise do not
+ * jump back into exc_xen_hypervisor_callback. Otherwise do not
* touch XEN_vcpu_info_mask.
*/
jne 1f
@@ -113,7 +113,7 @@ SYM_CODE_START(xen_iret)
* Events are masked, so jumping out of the critical region is
* OK.
*/
- je xen_hypervisor_callback
+ je asm_exc_xen_hypervisor_callback

1: iret
xen_iret_end_crit:
@@ -127,7 +127,7 @@ SYM_CODE_END(xen_iret)
.globl xen_iret_start_crit, xen_iret_end_crit

/*
- * This is called by xen_hypervisor_callback in entry_32.S when it sees
+ * This is called by exc_xen_hypervisor_callback in entry_32.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit.
*
@@ -144,7 +144,7 @@ SYM_CODE_END(xen_iret)
* eflags }
* cs } nested exception info
* eip }
- * return address : (into xen_hypervisor_callback)
+ * return address : (into asm_exc_xen_hypervisor_callback)
*
* In order to deliver the nested exception properly, we need to discard the
* nested exception frame such that when we handle the exception, we do it
@@ -152,7 +152,8 @@ SYM_CODE_END(xen_iret)
*
* The only caveat is that if the outer eax hasn't been restored yet (i.e.
* it's still on stack), we need to restore its value here.
- */
+*/
+.pushsection .noinstr.text, "ax"
SYM_CODE_START(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
@@ -181,3 +182,4 @@ SYM_CODE_START(xen_iret_crit_fixup)
2:
ret
SYM_CODE_END(xen_iret_crit_fixup)
+.popsection
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -54,7 +54,7 @@ xen_pv_trap asm_exc_simd_coprocessor_err
#ifdef CONFIG_IA32_EMULATION
xen_pv_trap entry_INT80_compat
#endif
-xen_pv_trap hypervisor_callback
+xen_pv_trap asm_exc_xen_hypervisor_callback

__INIT
SYM_CODE_START(xen_early_idt_handler_array)
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -8,7 +8,6 @@
#include <xen/xen-ops.h>

/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

void xen_sysenter_target(void);
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
-obj-y += grant-table.o features.o balloon.o manage.o preempt.o time.o
+obj-y += grant-table.o features.o balloon.o manage.o time.o
obj-y += mem-reservation.o
obj-y += events/
obj-y += xenbus/
--- a/drivers/xen/preempt.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Preemptible hypercalls
- *
- * Copyright (C) 2014 Citrix Systems R&D ltd.
- */
-
-#include <linux/sched.h>
-#include <xen/xen-ops.h>
-
-#ifndef CONFIG_PREEMPTION
-
-/*
- * Some hypercalls issued by the toolstack can take many 10s of
- * seconds. Allow tasks running hypercalls via the privcmd driver to
- * be voluntarily preempted even if full kernel preemption is
- * disabled.
- *
- * Such preemptible hypercalls are bracketed by
- * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
- * calls.
- */
-
-DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
-
-asmlinkage __visible void xen_maybe_preempt_hcall(void)
-{
- if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
- && need_resched())) {
- /*
- * Clear flag as we may be rescheduled on a different
- * cpu.
- */
- __this_cpu_write(xen_in_preemptible_hcall, false);
- local_irq_enable();
- cond_resched();
- local_irq_disable();
- __this_cpu_write(xen_in_preemptible_hcall, true);
- }
-}
-#endif /* CONFIG_PREEMPTION */

2020-05-20 23:23:48

by Andy Lutomirski

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On Wed, May 20, 2020 at 12:17 PM Thomas Gleixner <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
> > Andrew Cooper pointed out that there is too much magic in Xen for this
> > to work. So never mind.
>
> :)
>
> But you made me stare more at that stuff and I came up with a way
> simpler solution. See below.

I like it, but I bet it can be even simpler if you do the
tickle_whatever_paulmck_call_it() change:

> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs;
> + bool inhcall;
> +
> + idtentry_enter(regs);
> + old_regs = set_irq_regs(regs);
> +
> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
> +
> + set_irq_regs(old_regs);
> +
> + inhcall = get_and_clear_inhcall();
> + __idtentry_exit(regs, inhcall);
> + restore_inhcall(inhcall);

How about:

inhcall = get_and_clear_inhcall();
if (inhcall) {
	if (!WARN_ON_ONCE((regs->flags & X86_EFLAGS_IF) || preempt_count())) {
		local_irq_enable();
		cond_resched();
		local_irq_disable();
	}
}
restore_inhcall(inhcall);
idtentry_exit(regs);

This could probably be tidied up by having a xen_maybe_preempt() that
does the inhcall and resched mess.

The point is that, with the tickle_nohz_ stuff, there is nothing
actually preventing IRQ handlers from sleeping as long as they aren't
on the IRQ stack and as long as the interrupted context was safe to
sleep in.

--Andy

2020-05-21 02:26:47

by Boris Ostrovsky

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

On 5/20/20 3:16 PM, Thomas Gleixner wrote:


> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs;
> + bool inhcall;
> +
> + idtentry_enter(regs);
> + old_regs = set_irq_regs(regs);
> +
> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);


We need to handle the nested case (i.e. !irq_needs_irq_stack(), like in your
original version). Moving get_and_clear_inhcall() up should prevent
scheduling when this happens.


-boris


> +
> + set_irq_regs(old_regs);
> +
> + inhcall = get_and_clear_inhcall();
> + __idtentry_exit(regs, inhcall);
> + restore_inhcall(inhcall);
> }

2020-05-21 07:13:59

by Thomas Gleixner

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

Boris Ostrovsky <[email protected]> writes:

> On 5/20/20 3:16 PM, Thomas Gleixner wrote:
>
>
>> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>> +{
>> + struct pt_regs *old_regs;
>> + bool inhcall;
>> +
>> + idtentry_enter(regs);
>> + old_regs = set_irq_regs(regs);
>> +
>> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
>
>
> We need to handle nested case (i.e. !irq_needs_irq_stack(), like in your
> original version). Moving get_and_clear_inhcall() up should prevent
> scheduling when this happens.

I locally changed run_on_irqstack() to do the magic checks and select the
right one.
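
Roughly like this (a sketch of the idea only, the actual stack switch is
left out):

static void run_on_irqstack(void (*func)(void), void *arg, struct pt_regs *regs)
{
	if (!irq_needs_irq_stack(regs)) {
		/* Nested, or no stack switch required: call directly */
		func();
		return;
	}

	/* Switch to the hard irq stack, call func() there, switch back */
}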

Thanks,

tglx

2020-05-21 10:47:45

by Thomas Gleixner

Subject: Re: [patch V6 10/37] x86/entry: Switch XEN/PV hypercall entry to IDTENTRY

Andy Lutomirski <[email protected]> writes:
> On Wed, May 20, 2020 at 12:17 PM Thomas Gleixner <[email protected]> wrote:
>>
>> Andy Lutomirski <[email protected]> writes:
>> > Andrew Cooper pointed out that there is too much magic in Xen for this
>> > to work. So never mind.
>>
>> :)
>>
>> But you made me stare more at that stuff and I came up with a way
>> simpler solution. See below.
>
> I like it, but I bet it can be even simpler if you do the
> tickle_whatever_paulmck_call_it() change:
>
>> +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
>> +{
>> + struct pt_regs *old_regs;
>> + bool inhcall;
>> +
>> + idtentry_enter(regs);
>> + old_regs = set_irq_regs(regs);
>> +
>> + run_on_irqstack(__xen_pv_evtchn_do_upcall, NULL, regs);
>> +
>> + set_irq_regs(old_regs);
>> +
>> + inhcall = get_and_clear_inhcall();
>> + __idtentry_exit(regs, inhcall);
>> + restore_inhcall(inhcall);
>
> How about:
>
> inhcall = get_and_clear_inhcall();
> if (inhcall) {
> if (!WARN_ON_ONCE((regs->flags & X86_EFLAGS_IF) || preempt_count()) {
> local_irq_enable();
> cond_resched();
> local_irq_disable();

This really wants to use preempt_schedule_irq() as the above is racy
vs. need_resched().
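
I.e. instead of the local_irq_enable()/cond_resched()/local_irq_disable()
dance something like this (sketch):

	inhcall = get_and_clear_inhcall();
	if (inhcall && need_resched())
		preempt_schedule_irq();
	restore_inhcall(inhcall);
	idtentry_exit(regs);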

> }
> }
> restore_inhcall(inhcall);
> idtentry_exit(regs);
>
> This could probably be tidied up by having a xen_maybe_preempt() that
> does the inhcall and resched mess.
>
> The point is that, with the tickle_nohz_ stuff, there is nothing
> actually preventing IRQ handlers from sleeping as long as they aren't
> on the IRQ stack and as long as the interrupted context was safe to
> sleep in.

You still lose the debug checks. I'm working on it ...

Thanks,

tglx