Here are three cleanups to the IOPL switching code for x86.
[PATCH 1/3] x86: Remove native_set_iopl_mask()
[PATCH 2/3] x86-32: Simplify IOPL switch check
[PATCH 3/3] x86/xen: Move paravirt IOPL switching to the slow path
arch/x86/include/asm/paravirt.h | 6 ++++++
arch/x86/include/asm/processor.h | 22 ++--------------------
arch/x86/include/asm/thread_info.h | 5 ++++-
arch/x86/include/asm/xen/hypervisor.h | 2 --
arch/x86/kernel/ioport.c | 6 ++++++
arch/x86/kernel/paravirt.c | 2 +-
arch/x86/kernel/process.c | 8 ++++++++
arch/x86/kernel/process_32.c | 9 ---------
arch/x86/kernel/process_64.c | 11 -----------
arch/x86/xen/enlighten_pv.c | 6 ++++--
10 files changed, 31 insertions(+), 46 deletions(-)
On native hardware, IRET and POPF correctly restore the IOPL bits, so
native_set_iopl_mask() is unnecessary.  It was left over from when the
SYSEXIT path did not restore the user flags with POPF.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/processor.h | 21 +--------------------
arch/x86/kernel/paravirt.c | 2 +-
2 files changed, 2 insertions(+), 21 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada99..06c4795 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -482,25 +482,6 @@ struct thread_struct {
*/
#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void native_set_iopl_mask(unsigned mask)
-{
-#ifdef CONFIG_X86_32
- unsigned int reg;
-
- asm volatile ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-#endif
-}
-
static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
@@ -542,7 +523,7 @@ static inline void load_sp0(struct tss_struct *tss,
native_load_sp0(tss, thread);
}
-#define set_iopl_mask native_set_iopl_mask
+static inline void set_iopl_mask(unsigned mask) { }
#endif /* CONFIG_PARAVIRT */
/* Free all resources held by a thread. */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996..1d50eb5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -367,7 +367,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.iret = native_iret,
.swapgs = native_swapgs,
- .set_iopl_mask = native_set_iopl_mask,
+ .set_iopl_mask = paravirt_nop,
.io_delay = native_io_delay,
.start_context_switch = paravirt_nop,
--
2.9.4
Since tasks using IOPL are very rare, move the switching code to the slow
path for lower impact on normal tasks.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/paravirt.h | 6 ++++++
arch/x86/include/asm/processor.h | 1 +
arch/x86/include/asm/thread_info.h | 5 ++++-
arch/x86/include/asm/xen/hypervisor.h | 2 --
arch/x86/kernel/ioport.c | 6 ++++++
arch/x86/kernel/process.c | 8 ++++++++
arch/x86/kernel/process_32.c | 9 ---------
arch/x86/kernel/process_64.c | 11 -----------
arch/x86/xen/enlighten_pv.c | 2 +-
9 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739..2145dbd 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -265,6 +265,12 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
}
+
+static inline bool paravirt_iopl(void)
+{
+ return pv_cpu_ops.set_iopl_mask != paravirt_nop;
+}
+
static inline void set_iopl_mask(unsigned mask)
{
PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 06c4795..4411d67 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -523,6 +523,7 @@ static inline void load_sp0(struct tss_struct *tss,
native_load_sp0(tss, thread);
}
+static inline bool paravirt_iopl(void) { return false; }
static inline void set_iopl_mask(unsigned mask) { }
#endif /* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd..350f3d3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,6 +79,7 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
+#define TIF_PV_IOPL 5 /* call hypervisor to change IOPL */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
@@ -104,6 +105,7 @@ struct thread_info {
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_PV_IOPL (1 << TIF_PV_IOPL)
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
@@ -141,7 +143,8 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
+ (_TIF_IO_BITMAP | _TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
+ _TIF_PV_IOPL)
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 39171b3..8b2d4be 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
-extern void xen_set_iopl_mask(unsigned mask);
-
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 9c3cf09..2051e7d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -126,6 +126,12 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
(level << X86_EFLAGS_IOPL_BIT);
t->iopl = level << X86_EFLAGS_IOPL_BIT;
+ if (paravirt_iopl()) {
+ if (level > 0)
+ set_thread_flag(TIF_PV_IOPL);
+ else
+ clear_thread_flag(TIF_PV_IOPL);
+ }
set_iopl_mask(t->iopl);
return 0;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428c..78d1ab2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -296,6 +296,14 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOCPUID)
set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (prev->iopl != next->iopl)
+ set_iopl_mask(next->iopl);
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b2d1f7c..19527b4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -258,15 +258,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);
/*
- * Restore IOPL if needed. In normal use, the flags restore
- * in the switch assembly will handle this. But if the kernel
- * is running virtualized at a non-zero CPL, the popf will
- * not restore flags, so it must be done in a separate step.
- */
- if (unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
-
- /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8..9525e10 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -446,17 +446,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
-#ifdef CONFIG_XEN_PV
- /*
- * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
- * current_pt_regs()->flags may not match the current task's
- * intended IOPL. We need to switch it manually.
- */
- if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
- prev->iopl != next->iopl))
- xen_set_iopl_mask(next->iopl);
-#endif
-
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 05257c0..ea0d269 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -813,7 +813,7 @@ static void xen_load_sp0(struct tss_struct *tss,
tss->x86_tss.sp0 = thread->sp0;
}
-void xen_set_iopl_mask(unsigned mask)
+static void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;
--
2.9.4
Remove the get_kernel_rpl() call from the test for switching IOPL. Instead,
make set_iopl_mask() a no-op when Xen is running in supervisor mode.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/kernel/process_32.c | 2 +-
arch/x86/xen/enlighten_pv.c | 4 +++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae81..b2d1f7c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -263,7 +263,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* is running virtualized at a non-zero CPL, the popf will
* not restore flags, so it must be done in a separate step.
*/
- if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+ if (unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
/*
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f33eef4..05257c0 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1350,8 +1350,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
#ifdef CONFIG_X86_32
pv_info.kernel_rpl = 1;
- if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ if (xen_feature(XENFEAT_supervisor_mode_kernel)) {
pv_info.kernel_rpl = 0;
+ pv_cpu_ops.set_iopl_mask = paravirt_nop;
+ }
#else
pv_info.kernel_rpl = 0;
#endif
--
2.9.4
On 14/06/17 14:40, Brian Gerst wrote:
> Since tasks using IOPL are very rare, move the switching code to the slow
> path for lower impact on normal tasks.
>
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> arch/x86/include/asm/paravirt.h | 6 ++++++
> arch/x86/include/asm/processor.h | 1 +
> arch/x86/include/asm/thread_info.h | 5 ++++-
> arch/x86/include/asm/xen/hypervisor.h | 2 --
> arch/x86/kernel/ioport.c | 6 ++++++
> arch/x86/kernel/process.c | 8 ++++++++
> arch/x86/kernel/process_32.c | 9 ---------
> arch/x86/kernel/process_64.c | 11 -----------
> arch/x86/xen/enlighten_pv.c | 2 +-
> 9 files changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 9a15739..2145dbd 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -265,6 +265,12 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
> {
> PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
> }
> +
> +static inline bool paravirt_iopl(void)
> +{
> + return pv_cpu_ops.set_iopl_mask != paravirt_nop;
> +}
> +
> static inline void set_iopl_mask(unsigned mask)
> {
> PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 06c4795..4411d67 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -523,6 +523,7 @@ static inline void load_sp0(struct tss_struct *tss,
> native_load_sp0(tss, thread);
> }
>
> +static inline bool paravirt_iopl(void) { return false; }
> static inline void set_iopl_mask(unsigned mask) { }
> #endif /* CONFIG_PARAVIRT */
>
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index e00e1bd..350f3d3 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -79,6 +79,7 @@ struct thread_info {
> #define TIF_SIGPENDING 2 /* signal pending */
> #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
> #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
> +#define TIF_PV_IOPL 5 /* call hypervisor to change IOPL */
> #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
> #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
> #define TIF_SECCOMP 8 /* secure computing */
> @@ -104,6 +105,7 @@ struct thread_info {
> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
> #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
> +#define _TIF_PV_IOPL (1 << TIF_PV_IOPL)
> #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
> #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
> #define _TIF_SECCOMP (1 << TIF_SECCOMP)
> @@ -141,7 +143,8 @@ struct thread_info {
>
> /* flags to check in __switch_to() */
> #define _TIF_WORK_CTXSW \
> - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
> + (_TIF_IO_BITMAP | _TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
> + _TIF_PV_IOPL)
>
> #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
> #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
> diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
> index 39171b3..8b2d4be 100644
> --- a/arch/x86/include/asm/xen/hypervisor.h
> +++ b/arch/x86/include/asm/xen/hypervisor.h
> @@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num);
> void xen_arch_unregister_cpu(int num);
> #endif
>
> -extern void xen_set_iopl_mask(unsigned mask);
> -
> #endif /* _ASM_X86_XEN_HYPERVISOR_H */
> diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
> index 9c3cf09..2051e7d 100644
> --- a/arch/x86/kernel/ioport.c
> +++ b/arch/x86/kernel/ioport.c
> @@ -126,6 +126,12 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
> regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
> (level << X86_EFLAGS_IOPL_BIT);
> t->iopl = level << X86_EFLAGS_IOPL_BIT;
> + if (paravirt_iopl()) {
> + if (level > 0)
> + set_thread_flag(TIF_PV_IOPL);
> + else
> + clear_thread_flag(TIF_PV_IOPL);
> + }
You could get rid of paravirt_iopl() by adding a "setflag" boolean
parameter to set_iopl_mask(). Only the Xen variant would need the above
thread flag manipulation.
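For illustration only (the two-argument signature and PVOP_VCALL2 usage are
assumptions about how that could look, not part of this series):

    static inline void set_iopl_mask(unsigned mask, bool setflag)
    {
            PVOP_VCALL2(pv_cpu_ops.set_iopl_mask, mask, setflag);
    }

with the iopl() syscall passing setflag = true so that only
xen_set_iopl_mask() has to set or clear TIF_PV_IOPL, and the context switch
path passing false.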
Juergen
> set_iopl_mask(t->iopl);
>
> return 0;
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 0bb88428c..78d1ab2 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -296,6 +296,14 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
>
> if ((tifp ^ tifn) & _TIF_NOCPUID)
> set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
> +
> + /*
> + * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
> + * current_pt_regs()->flags may not match the current task's
> + * intended IOPL. We need to switch it manually.
> + */
> + if (prev->iopl != next->iopl)
> + set_iopl_mask(next->iopl);
> }
>
> /*
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index b2d1f7c..19527b4 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -258,15 +258,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> load_TLS(next, cpu);
>
> /*
> - * Restore IOPL if needed. In normal use, the flags restore
> - * in the switch assembly will handle this. But if the kernel
> - * is running virtualized at a non-zero CPL, the popf will
> - * not restore flags, so it must be done in a separate step.
> - */
> - if (unlikely(prev->iopl != next->iopl))
> - set_iopl_mask(next->iopl);
> -
> - /*
> * Now maybe handle debug registers and/or IO bitmaps
> */
> if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 9c39ab8..9525e10 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -446,17 +446,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
> __switch_to_xtra(prev_p, next_p, tss);
>
> -#ifdef CONFIG_XEN_PV
> - /*
> - * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
> - * current_pt_regs()->flags may not match the current task's
> - * intended IOPL. We need to switch it manually.
> - */
> - if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
> - prev->iopl != next->iopl))
> - xen_set_iopl_mask(next->iopl);
> -#endif
> -
> if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
> /*
> * AMD CPUs have a misfeature: SYSRET sets the SS selector but
> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
> index 05257c0..ea0d269 100644
> --- a/arch/x86/xen/enlighten_pv.c
> +++ b/arch/x86/xen/enlighten_pv.c
> @@ -813,7 +813,7 @@ static void xen_load_sp0(struct tss_struct *tss,
> tss->x86_tss.sp0 = thread->sp0;
> }
>
> -void xen_set_iopl_mask(unsigned mask)
> +static void xen_set_iopl_mask(unsigned mask)
> {
> struct physdev_set_iopl set_iopl;
>
>
On Wed, Jun 14, 2017 at 9:14 AM, Juergen Gross <[email protected]> wrote:
> On 14/06/17 14:40, Brian Gerst wrote:
>> Since tasks using IOPL are very rare, move the switching code to the slow
>> path for lower impact on normal tasks.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>> ---
>> arch/x86/include/asm/paravirt.h | 6 ++++++
>> arch/x86/include/asm/processor.h | 1 +
>> arch/x86/include/asm/thread_info.h | 5 ++++-
>> arch/x86/include/asm/xen/hypervisor.h | 2 --
>> arch/x86/kernel/ioport.c | 6 ++++++
>> arch/x86/kernel/process.c | 8 ++++++++
>> arch/x86/kernel/process_32.c | 9 ---------
>> arch/x86/kernel/process_64.c | 11 -----------
>> arch/x86/xen/enlighten_pv.c | 2 +-
>> 9 files changed, 26 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>> index 9a15739..2145dbd 100644
>> --- a/arch/x86/include/asm/paravirt.h
>> +++ b/arch/x86/include/asm/paravirt.h
>> @@ -265,6 +265,12 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
>> {
>> PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
>> }
>> +
>> +static inline bool paravirt_iopl(void)
>> +{
>> + return pv_cpu_ops.set_iopl_mask != paravirt_nop;
>> +}
>> +
>> static inline void set_iopl_mask(unsigned mask)
>> {
>> PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>> index 06c4795..4411d67 100644
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -523,6 +523,7 @@ static inline void load_sp0(struct tss_struct *tss,
>> native_load_sp0(tss, thread);
>> }
>>
>> +static inline bool paravirt_iopl(void) { return false; }
>> static inline void set_iopl_mask(unsigned mask) { }
>> #endif /* CONFIG_PARAVIRT */
>>
>> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
>> index e00e1bd..350f3d3 100644
>> --- a/arch/x86/include/asm/thread_info.h
>> +++ b/arch/x86/include/asm/thread_info.h
>> @@ -79,6 +79,7 @@ struct thread_info {
>> #define TIF_SIGPENDING 2 /* signal pending */
>> #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
>> #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
>> +#define TIF_PV_IOPL 5 /* call hypervisor to change IOPL */
>> #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
>> #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
>> #define TIF_SECCOMP 8 /* secure computing */
>> @@ -104,6 +105,7 @@ struct thread_info {
>> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
>> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
>> #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
>> +#define _TIF_PV_IOPL (1 << TIF_PV_IOPL)
>> #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
>> #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
>> #define _TIF_SECCOMP (1 << TIF_SECCOMP)
>> @@ -141,7 +143,8 @@ struct thread_info {
>>
>> /* flags to check in __switch_to() */
>> #define _TIF_WORK_CTXSW \
>> - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
>> + (_TIF_IO_BITMAP | _TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
>> + _TIF_PV_IOPL)
>>
>> #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
>> #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
>> diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
>> index 39171b3..8b2d4be 100644
>> --- a/arch/x86/include/asm/xen/hypervisor.h
>> +++ b/arch/x86/include/asm/xen/hypervisor.h
>> @@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num);
>> void xen_arch_unregister_cpu(int num);
>> #endif
>>
>> -extern void xen_set_iopl_mask(unsigned mask);
>> -
>> #endif /* _ASM_X86_XEN_HYPERVISOR_H */
>> diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
>> index 9c3cf09..2051e7d 100644
>> --- a/arch/x86/kernel/ioport.c
>> +++ b/arch/x86/kernel/ioport.c
>> @@ -126,6 +126,12 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
>> regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
>> (level << X86_EFLAGS_IOPL_BIT);
>> t->iopl = level << X86_EFLAGS_IOPL_BIT;
>> + if (paravirt_iopl()) {
>> + if (level > 0)
>> + set_thread_flag(TIF_PV_IOPL);
>> + else
>> + clear_thread_flag(TIF_PV_IOPL);
>> + }
>
> You could get rid of paravirt_iopl() by adding a "setflag" boolean
> parameter to set_iopl_mask(). Only the Xen variant would need above
> thread flag manipulation.
After the first two patches, the hook is a no-op except in the Xen
case, so the thread flag change gets skipped except on Xen. Moving
the code to Xen would work, but it would mean that any other
hypervisor using that hook would also need to duplicate the thread
flag changes. Granted, it looks like Xen is doing something unique
here (running the guest kernel at CPL 1), so that probably won't be an
issue.
--
Brian Gerst
On 14/06/17 19:29, Brian Gerst wrote:
> On Wed, Jun 14, 2017 at 9:14 AM, Juergen Gross <[email protected]> wrote:
>> On 14/06/17 14:40, Brian Gerst wrote:
>>> Since tasks using IOPL are very rare, move the switching code to the slow
>>> path for lower impact on normal tasks.
>>>
>>> Signed-off-by: Brian Gerst <[email protected]>
>>> ---
>>> arch/x86/include/asm/paravirt.h | 6 ++++++
>>> arch/x86/include/asm/processor.h | 1 +
>>> arch/x86/include/asm/thread_info.h | 5 ++++-
>>> arch/x86/include/asm/xen/hypervisor.h | 2 --
>>> arch/x86/kernel/ioport.c | 6 ++++++
>>> arch/x86/kernel/process.c | 8 ++++++++
>>> arch/x86/kernel/process_32.c | 9 ---------
>>> arch/x86/kernel/process_64.c | 11 -----------
>>> arch/x86/xen/enlighten_pv.c | 2 +-
>>> 9 files changed, 26 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
>>> index 9a15739..2145dbd 100644
>>> --- a/arch/x86/include/asm/paravirt.h
>>> +++ b/arch/x86/include/asm/paravirt.h
>>> @@ -265,6 +265,12 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
>>> {
>>> PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
>>> }
>>> +
>>> +static inline bool paravirt_iopl(void)
>>> +{
>>> + return pv_cpu_ops.set_iopl_mask != paravirt_nop;
>>> +}
>>> +
>>> static inline void set_iopl_mask(unsigned mask)
>>> {
>>> PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
>>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>>> index 06c4795..4411d67 100644
>>> --- a/arch/x86/include/asm/processor.h
>>> +++ b/arch/x86/include/asm/processor.h
>>> @@ -523,6 +523,7 @@ static inline void load_sp0(struct tss_struct *tss,
>>> native_load_sp0(tss, thread);
>>> }
>>>
>>> +static inline bool paravirt_iopl(void) { return false; }
>>> static inline void set_iopl_mask(unsigned mask) { }
>>> #endif /* CONFIG_PARAVIRT */
>>>
>>> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
>>> index e00e1bd..350f3d3 100644
>>> --- a/arch/x86/include/asm/thread_info.h
>>> +++ b/arch/x86/include/asm/thread_info.h
>>> @@ -79,6 +79,7 @@ struct thread_info {
>>> #define TIF_SIGPENDING 2 /* signal pending */
>>> #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
>>> #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
>>> +#define TIF_PV_IOPL 5 /* call hypervisor to change IOPL */
>>> #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
>>> #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
>>> #define TIF_SECCOMP 8 /* secure computing */
>>> @@ -104,6 +105,7 @@ struct thread_info {
>>> #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
>>> #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
>>> #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
>>> +#define _TIF_PV_IOPL (1 << TIF_PV_IOPL)
>>> #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
>>> #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
>>> #define _TIF_SECCOMP (1 << TIF_SECCOMP)
>>> @@ -141,7 +143,8 @@ struct thread_info {
>>>
>>> /* flags to check in __switch_to() */
>>> #define _TIF_WORK_CTXSW \
>>> - (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
>>> + (_TIF_IO_BITMAP | _TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
>>> + _TIF_PV_IOPL)
>>>
>>> #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
>>> #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
>>> diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
>>> index 39171b3..8b2d4be 100644
>>> --- a/arch/x86/include/asm/xen/hypervisor.h
>>> +++ b/arch/x86/include/asm/xen/hypervisor.h
>>> @@ -62,6 +62,4 @@ void xen_arch_register_cpu(int num);
>>> void xen_arch_unregister_cpu(int num);
>>> #endif
>>>
>>> -extern void xen_set_iopl_mask(unsigned mask);
>>> -
>>> #endif /* _ASM_X86_XEN_HYPERVISOR_H */
>>> diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
>>> index 9c3cf09..2051e7d 100644
>>> --- a/arch/x86/kernel/ioport.c
>>> +++ b/arch/x86/kernel/ioport.c
>>> @@ -126,6 +126,12 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
>>> regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
>>> (level << X86_EFLAGS_IOPL_BIT);
>>> t->iopl = level << X86_EFLAGS_IOPL_BIT;
>>> + if (paravirt_iopl()) {
>>> + if (level > 0)
>>> + set_thread_flag(TIF_PV_IOPL);
>>> + else
>>> + clear_thread_flag(TIF_PV_IOPL);
>>> + }
>>
>> You could get rid of paravirt_iopl() by adding a "setflag" boolean
>> parameter to set_iopl_mask(). Only the Xen variant would need above
>> thread flag manipulation.
>
> After the first two patches, the hook is a no-op except in the Xen
> case, so the thread flag change gets skipped except on Xen. Moving
> the code to Xen would work, but it would mean that any other
> hypervisor using that hook would also need to duplicate the thread
> flag changes. Granted, it looks like Xen is doing something unique
> here (running the guest kernel at CPL 1), so that probably won't be an
> issue.
Just create a service function which does the job. Xen should call it, and
so should any other future non-standard user of set_iopl_mask().
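Something like this, purely as a sketch (the helper name is made up):

    static inline void pv_iopl_update_flag(unsigned mask)
    {
            if (mask & X86_EFLAGS_IOPL)
                    set_thread_flag(TIF_PV_IOPL);
            else
                    clear_thread_flag(TIF_PV_IOPL);
    }

xen_set_iopl_mask() would call it when handling the iopl() syscall, instead
of having the generic ioport.c code do the flag update.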
Juergen
On Wed, Jun 14, 2017 at 5:40 AM, Brian Gerst <[email protected]> wrote:
> Since tasks using IOPL are very rare, move the switching code to the slow
> path for lower impact on normal tasks.
I think that Andrew Cooper added a vmassist that we could opt in to
that makes Xen PV IOPL switching work more or less just like native.
We could maybe opt in to that and avoid needing this stuff at all on
newer hypervisors.
On 14/06/17 18:40, Andy Lutomirski wrote:
> On Wed, Jun 14, 2017 at 5:40 AM, Brian Gerst <[email protected]> wrote:
>> Since tasks using IOPL are very rare, move the switching code to the slow
>> path for lower impact on normal tasks.
> I think that Andrew Cooper added a vmassist that we could opt in to
> that makes Xen PV IOPL switching work more or less just like native.
> We could maybe opt in to that and avoid needing this stuff at all on
> newer hypervisors.
Indeed.
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_architectural_iopl);
(if recognised) does two things.
1) virtual IOPL is picked up from EFLAGS in the iret frame, exactly like
native.
2) The guest kernel is assumed to have virtual CPL0 for the purpose of
privilege calculations.
Xen never runs with the real IOPL different to 0, or a PV guest could
disable interrupts with popf. As a result, all IO port access does trap
to Xen for auditing. What part 2) does is avoid having the awkward
double-step of Linux needing to set IOPL to 1 for kernel level IO access
to avoid faulting.
The assist should be available in Xen 4.7 and later (or wherever vendors
have backported it to).
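From the guest side the opt-in boils down to one hypercall early in boot;
a rough, untested sketch (xen_architectural_iopl is just an illustrative
name, not an existing variable):

    /* Prefer native-style IOPL handling when the hypervisor offers it. */
    xen_architectural_iopl =
            !HYPERVISOR_vm_assist(VMASST_CMD_enable,
                                  VMASST_TYPE_architectural_iopl);

On success the EFLAGS image in the iret frame is authoritative, so the
per-task PHYSDEVOP_set_iopl switching can be skipped entirely.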
~Andrew
On Wed, Jun 14, 2017 at 2:02 PM, Andrew Cooper
<[email protected]> wrote:
> On 14/06/17 18:40, Andy Lutomirski wrote:
>> On Wed, Jun 14, 2017 at 5:40 AM, Brian Gerst <[email protected]> wrote:
>>> Since tasks using IOPL are very rare, move the switching code to the slow
>>> path for lower impact on normal tasks.
>> I think that Andrew Cooper added a vmassist that we could opt in to
>> that makes Xen PV IOPL switching work more or less just like native.
>> We could maybe opt in to that and avoid needing this stuff at all on
>> newer hypervisors.
>
> Indeed.
>
> HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_architectural_iopl);
>
> (if recognised) does two things.
>
> 1) virtual IOPL is picked up from EFLAGS in the iret frame, exactly like
> native.
> 2) The guest kernel is assumed to have virtual CPL0 for the purpose of
> privilege calculations.
>
> Xen never runs with the real IOPL different to 0, or a PV guests could
> disable interrupts with popf. As a result, all IO port access does trap
> to Xen for auditing. What part 2) does is avoid having the awkward
> double-step of Linux needing to set IOPL to 1 for kernel level IO access
> to avoid faulting.
>
> The assist should be available in Xen 4.7 and later (or wherever vendors
> have backported it to).
>
> ~Andrew
Ok. So do we keep the old code around to support older Xen
hypervisors or just require the newer Xen for guest userspace IOPL
support? Part of the reason I am making these changes is to sync the
32-bit and 64-bit code in __switch_to(), to ultimately merge them.
--
Brian Gerst
On Wed, Jun 14, 2017 at 11:25 AM, Brian Gerst <[email protected]> wrote:
> On Wed, Jun 14, 2017 at 2:02 PM, Andrew Cooper
> <[email protected]> wrote:
>> On 14/06/17 18:40, Andy Lutomirski wrote:
>>> On Wed, Jun 14, 2017 at 5:40 AM, Brian Gerst <[email protected]> wrote:
>>>> Since tasks using IOPL are very rare, move the switching code to the slow
>>>> path for lower impact on normal tasks.
>>> I think that Andrew Cooper added a vmassist that we could opt in to
>>> that makes Xen PV IOPL switching work more or less just like native.
>>> We could maybe opt in to that and avoid needing this stuff at all on
>>> newer hypervisors.
>>
>> Indeed.
>>
>> HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_architectural_iopl);
>>
>> (if recognised) does two things.
>>
>> 1) virtual IOPL is picked up from EFLAGS in the iret frame, exactly like
>> native.
>> 2) The guest kernel is assumed to have virtual CPL0 for the purpose of
>> privilege calculations.
>>
>> Xen never runs with the real IOPL different to 0, or a PV guests could
>> disable interrupts with popf. As a result, all IO port access does trap
>> to Xen for auditing. What part 2) does is avoid having the awkward
>> double-step of Linux needing to set IOPL to 1 for kernel level IO access
>> to avoid faulting.
>>
>> The assist should be available in Xen 4.7 and later (or wherever vendors
>> have backported it to).
>>
>> ~Andrew
>
> Ok. So do we keep the old code around to support older Xen
> hypervisors or just require the newer Xen for guest userspace IOPL
> support? Part of the reason I am making these changes is to sync the
> 32-bit and 64-bit code in __switch_to(), to ultimately merge them.
I think we should keep the old code.
One way to structure this that might be nicer than using paravirt ops
would be to add a new bug X86_BUG_XEN_PV_IOPL that would only be set
on old hypervisors that don't have the assist. Then the code could
look like:
if (static_cpu_has_bug(X86_BUG_XEN_PV_IOPL))
xen_set_iopl(whatever);
and we wouldn't need the paravirt indirection at all.
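A rough sketch of the boot-side counterpart (X86_BUG_XEN_PV_IOPL is
hypothetical and would need a new bit in cpufeatures.h):

    /* Somewhere in Xen PV early setup, e.g. xen_start_kernel(): */
    if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
                             VMASST_TYPE_architectural_iopl))
            setup_force_cpu_bug(X86_BUG_XEN_PV_IOPL);

so only guests running on hypervisors without the assist would ever hit the
xen_set_iopl() call in the context switch path.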