This provides no direct benefit by itself, but it finally lets us
apply some of the speedups that Paolo and possibly Rik independently
wrote: we won't need an irqsave/irqrestore pair in the context
tracking code.
It should also let us remove the last callers of exception_enter in
x86 kernels, but that removal should probably wait until I carefully
re-read the page fault code.
Andy Lutomirski (2):
x86/entry/32: Change INT80 to be an interrupt gate
x86/entry: Do enter_from_user_mode with IRQs off
arch/x86/entry/common.c | 54 ++++++++++++--------------------------
arch/x86/entry/entry_32.S | 8 +++---
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/include/asm/thread_info.h | 5 +++-
arch/x86/kernel/traps.c | 2 +-
5 files changed, 27 insertions(+), 44 deletions(-)
--
2.5.0
Now that slow-path syscalls always enter C before enabling
interrupts, it's straightforward to do enter_from_user_mode before
enabling interrupts rather than doing it as part of entry tracing.
With this change, we should finally be able to retire
exception_enter.
This will also enable optimizations based on knowing that we never
change context tracking state with interrupts on.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/common.c | 39 ++++++++++++++------------------------
arch/x86/include/asm/thread_info.h | 5 ++++-
2 files changed, 18 insertions(+), 26 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9ef3258320ec..8e8ca0eb36be 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -38,14 +38,17 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
return (struct thread_info *)(top_of_stack - THREAD_SIZE);
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifndef CONFIG_CONTEXT_TRACKING
+static
+#else
+__visible
+#endif
/* Called on entry from user mode with IRQs off. */
-__visible void enter_from_user_mode(void)
+void enter_from_user_mode(void)
{
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit();
}
-#endif
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
@@ -85,17 +88,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
-#ifdef CONFIG_CONTEXT_TRACKING
- /*
- * If TIF_NOHZ is set, we are required to call user_exit() before
- * doing anything that could touch RCU.
- */
- if (work & _TIF_NOHZ) {
- enter_from_user_mode();
- work &= ~_TIF_NOHZ;
- }
-#endif
-
#ifdef CONFIG_SECCOMP
/*
* Do seccomp first -- it should minimize exposure of other
@@ -354,6 +346,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long nr = regs->orig_ax;
+ enter_from_user_mode();
local_irq_enable();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -376,9 +369,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
- * Does a 32-bit syscall. Called with IRQs on and does all entry and
- * exit work and returns with IRQs off. This function is extremely hot
- * in workloads that use it, and it's usually called from
+ * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
+ * all entry and exit work and returns with IRQs off. This function is
+ * extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
@@ -419,6 +412,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
/* Handles int $0x80 */
__visible void do_int80_syscall_32(struct pt_regs *regs)
{
+ enter_from_user_mode();
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
@@ -441,11 +435,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
*/
regs->ip = landing_pad;
- /*
- * Fetch EBP from where the vDSO stashed it.
- *
- * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
- */
+ enter_from_user_mode();
+
+ /* Fetch EBP from where the vDSO stashed it. */
local_irq_enable();
if (
#ifdef CONFIG_X86_64
@@ -464,9 +456,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
/* User code screwed up. */
local_irq_disable();
regs->ax = -EFAULT;
-#ifdef CONFIG_CONTEXT_TRACKING
- enter_from_user_mode();
-#endif
prepare_exit_to_usermode(regs);
return 0; /* Keep it simple: use IRET. */
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index c0778fcab06d..fbdb24e61835 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -134,7 +134,10 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
-/* work to do in syscall_trace_enter() */
+/*
+ * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
#define _TIF_WORK_SYSCALL_ENTRY \
(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
_TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
--
2.5.0
I want all of the syscall entries to run with interrupts off so that
I can efficiently run context tracking before enabling interrupts.
This will regress int $0x80 performance on 32-bit kernels by a
couple of cycles. This shouldn't matter much -- int $0x80 is not a
fast path.
This effectively reverts 657c1eea0019 ("x86/entry/32: Fix
entry_INT80_32() to expect interrupts to be on") and fixes the issue
differently.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/common.c | 15 +++------------
arch/x86/entry/entry_32.S | 8 ++++----
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/kernel/traps.c | 2 +-
4 files changed, 9 insertions(+), 18 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1a000f59f04a..9ef3258320ec 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -381,14 +381,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
* in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
-#ifdef CONFIG_X86_32
-/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
-__visible
-#else
-/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
-static
-#endif
-__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned int nr = (unsigned int)regs->orig_ax;
@@ -423,14 +416,12 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
syscall_return_slowpath(regs);
}
-#ifdef CONFIG_X86_64
-/* Handles INT80 on 64-bit kernels */
-__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
{
local_irq_enable();
do_syscall_32_irqs_on(regs);
}
-#endif
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 66350e6a6ca5..ab710eee4308 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -348,13 +348,13 @@ ENTRY(entry_INT80_32)
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
/*
- * User mode is traced as though IRQs are on. Unlike the 64-bit
- * case, INT80 is a trap gate on 32-bit kernels, so interrupts
- * are already on (unless user code is messing around with iopl).
+ * User mode is traced as though IRQs are on, and the interrupt gate
+ * turned them off.
*/
+ TRACE_IRQS_OFF
movl %esp, %eax
- call do_syscall_32_irqs_on
+ call do_int80_syscall_32
.Lsyscall_32_done:
restore_all:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 3c990eeee40b..89bcb4979e7a 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -300,7 +300,7 @@ ENTRY(entry_INT80_compat)
TRACE_IRQS_OFF
movq %rsp, %rdi
- call do_syscall_32_irqs_off
+ call do_int80_syscall_32
.Lsyscall_32_done:
/* Go back to user mode. */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 410e8e2700c5..dd2c2e66c2e1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -859,7 +859,7 @@ void __init trap_init(void)
#endif
#ifdef CONFIG_X86_32
- set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
--
2.5.0
On Sun, Mar 06, 2016 at 06:30:07PM -0800, Andy Lutomirski wrote:
> Now that slow-path syscalls always enter C before enabling
> interrupts, it's straightforward to do enter_from_user_mode before
> enabling interrupts rather than doing it as part of entry tracing.
>
> With this change, we should finally be able to retire
> exception_enter.
>
> This will also enable optimizations based on knowing that we never
> change context tracking state with interrupts on.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/entry/common.c | 39 ++++++++++++++------------------------
> arch/x86/include/asm/thread_info.h | 5 ++++-
> 2 files changed, 18 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 9ef3258320ec..8e8ca0eb36be 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -38,14 +38,17 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
> return (struct thread_info *)(top_of_stack - THREAD_SIZE);
> }
>
> -#ifdef CONFIG_CONTEXT_TRACKING
> +#ifndef CONFIG_CONTEXT_TRACKING
> +static
> +#else
> +__visible
> +#endif
> /* Called on entry from user mode with IRQs off. */
> -__visible void enter_from_user_mode(void)
> +void enter_from_user_mode(void)
> {
> CT_WARN_ON(ct_state() != CONTEXT_USER);
> user_exit();
> }
> -#endif
I'm guessing you're relying on the compiler to optimize out everything
in the !CONFIG_CONTEXT_TRACKING case, and it seems to do that here.
However, I would do the regular thing we do for cases like that. It is
much easier and faster to read because we do use it everywhere:
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
{
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit();
}
#else
#define enter_from_user_mode() do { } while (0)
#endif
>
> static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
> {
> @@ -85,17 +88,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
>
> work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
>
> -#ifdef CONFIG_CONTEXT_TRACKING
> - /*
> - * If TIF_NOHZ is set, we are required to call user_exit() before
> - * doing anything that could touch RCU.
> - */
> - if (work & _TIF_NOHZ) {
> - enter_from_user_mode();
> - work &= ~_TIF_NOHZ;
> - }
> -#endif
> -
> #ifdef CONFIG_SECCOMP
> /*
> * Do seccomp first -- it should minimize exposure of other
> @@ -354,6 +346,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
> struct thread_info *ti = pt_regs_to_thread_info(regs);
> unsigned long nr = regs->orig_ax;
>
> + enter_from_user_mode();
> local_irq_enable();
>
> if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
> @@ -376,9 +369,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
>
> #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
> /*
> - * Does a 32-bit syscall. Called with IRQs on and does all entry and
> - * exit work and returns with IRQs off. This function is extremely hot
> - * in workloads that use it, and it's usually called from
> + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
> + * all entry and exit work and returns with IRQs off. This function is
> + * extremely hot in workloads that use it, and it's usually called from
> * do_fast_syscall_32, so forcibly inline it to improve performance.
> */
> static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
> @@ -419,6 +412,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
> /* Handles int $0x80 */
> __visible void do_int80_syscall_32(struct pt_regs *regs)
> {
> + enter_from_user_mode();
> local_irq_enable();
> do_syscall_32_irqs_on(regs);
> }
> @@ -441,11 +435,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
> */
> regs->ip = landing_pad;
>
> - /*
> - * Fetch EBP from where the vDSO stashed it.
> - *
> - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
> - */
> + enter_from_user_mode();
> +
> + /* Fetch EBP from where the vDSO stashed it. */
> local_irq_enable();
Comment above should be here.
> if (
> #ifdef CONFIG_X86_64
> @@ -464,9 +456,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
> /* User code screwed up. */
> local_irq_disable();
> regs->ax = -EFAULT;
> -#ifdef CONFIG_CONTEXT_TRACKING
> - enter_from_user_mode();
> -#endif
> prepare_exit_to_usermode(regs);
> return 0; /* Keep it simple: use IRET. */
> }
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index c0778fcab06d..fbdb24e61835 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -134,7 +134,10 @@ struct thread_info {
> #define _TIF_ADDR32 (1 << TIF_ADDR32)
> #define _TIF_X32 (1 << TIF_X32)
>
> -/* work to do in syscall_trace_enter() */
> +/*
> + * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
> + * enter_from_user_mode()
> + */
> #define _TIF_WORK_SYSCALL_ENTRY \
> (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
> _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
> --
> 2.5.0
>
>
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
On Mar 8, 2016 5:46 AM, "Borislav Petkov" <[email protected]> wrote:
>
> On Sun, Mar 06, 2016 at 06:30:07PM -0800, Andy Lutomirski wrote:
> > Now that slow-path syscalls always enter C before enabling
> > interrupts, it's straightforward to do enter_from_user_mode before
> > enabling interrupts rather than doing it as part of entry tracing.
> >
> > With this change, we should finally be able to retire
> > exception_enter.
> >
> > This will also enable optimizations based on knowing that we never
> > change context tracking state with interrupts on.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> > arch/x86/entry/common.c | 39 ++++++++++++++------------------------
> > arch/x86/include/asm/thread_info.h | 5 ++++-
> > 2 files changed, 18 insertions(+), 26 deletions(-)
> >
> > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> > index 9ef3258320ec..8e8ca0eb36be 100644
> > --- a/arch/x86/entry/common.c
> > +++ b/arch/x86/entry/common.c
> > @@ -38,14 +38,17 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
> > return (struct thread_info *)(top_of_stack - THREAD_SIZE);
> > }
> >
> > -#ifdef CONFIG_CONTEXT_TRACKING
> > +#ifndef CONFIG_CONTEXT_TRACKING
> > +static
> > +#else
> > +__visible
> > +#endif
> > /* Called on entry from user mode with IRQs off. */
> > -__visible void enter_from_user_mode(void)
> > +void enter_from_user_mode(void)
> > {
> > CT_WARN_ON(ct_state() != CONTEXT_USER);
> > user_exit();
> > }
> > -#endif
>
> I'm guessing you're relying on the compiler to optimize out everything
> in the !CONFIG_CONTEXT_TRACKING case, and it seems to do that here.
>
> However, I would do the regular thing we do for cases like that. It is
> much easier and faster to read because we do use it everywhere:
>
> #ifdef CONFIG_CONTEXT_TRACKING
> /* Called on entry from user mode with IRQs off. */
> __visible void enter_from_user_mode(void)
> {
> CT_WARN_ON(ct_state() != CONTEXT_USER);
> user_exit();
> }
> #else
> #define enter_from_user_mode() do { } while (0)
> #endif
>
Will do, after I test to make sure it builds.
>
> >
> > static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
> > {
> > @@ -85,17 +88,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
> >
> > work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
> >
> > -#ifdef CONFIG_CONTEXT_TRACKING
> > - /*
> > - * If TIF_NOHZ is set, we are required to call user_exit() before
> > - * doing anything that could touch RCU.
> > - */
> > - if (work & _TIF_NOHZ) {
> > - enter_from_user_mode();
> > - work &= ~_TIF_NOHZ;
> > - }
> > -#endif
> > -
> > #ifdef CONFIG_SECCOMP
> > /*
> > * Do seccomp first -- it should minimize exposure of other
> > @@ -354,6 +346,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
> > struct thread_info *ti = pt_regs_to_thread_info(regs);
> > unsigned long nr = regs->orig_ax;
> >
> > + enter_from_user_mode();
> > local_irq_enable();
> >
> > if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
> > @@ -376,9 +369,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
> >
> > #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
> > /*
> > - * Does a 32-bit syscall. Called with IRQs on and does all entry and
> > - * exit work and returns with IRQs off. This function is extremely hot
> > - * in workloads that use it, and it's usually called from
> > + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
> > + * all entry and exit work and returns with IRQs off. This function is
> > + * extremely hot in workloads that use it, and it's usually called from
> > * do_fast_syscall_32, so forcibly inline it to improve performance.
> > */
> > static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
> > @@ -419,6 +412,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
> > /* Handles int $0x80 */
> > __visible void do_int80_syscall_32(struct pt_regs *regs)
> > {
> > + enter_from_user_mode();
> > local_irq_enable();
> > do_syscall_32_irqs_on(regs);
> > }
> > @@ -441,11 +435,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
> > */
> > regs->ip = landing_pad;
> >
> > - /*
> > - * Fetch EBP from where the vDSO stashed it.
> > - *
> > - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
> > - */
> > + enter_from_user_mode();
> > +
> > + /* Fetch EBP from where the vDSO stashed it. */
> > local_irq_enable();
>
> Comment above should be here.
Indeed.
--Andy