2020-10-23 06:36:08

by Ira Weiny

[permalink] [raw]
Subject: [PATCH 06/10] x86/entry: Move nmi entry/exit into common code

From: Thomas Gleixner <[email protected]>

Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
not any different on other architectures. Also the extra state type is not
necessary, irqentry_state_t can carry the necessary information as well.

Move it to common code and extend irqentry_state_t to carry lockdep state.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Ira Weiny <[email protected]>
---
arch/x86/entry/common.c | 34 -------------------------------
arch/x86/include/asm/idtentry.h | 3 ---
arch/x86/kernel/cpu/mce/core.c | 6 +++---
arch/x86/kernel/nmi.c | 6 +++---
arch/x86/kernel/traps.c | 13 ++++++------
include/linux/entry-common.h | 24 +++++++++++++++++++++-
kernel/entry/common.c | 36 +++++++++++++++++++++++++++++++++
7 files changed, 72 insertions(+), 50 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..18d8f17f755c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}

-noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
-{
- bool irq_state = lockdep_hardirqs_enabled();
-
- __nmi_enter();
- lockdep_hardirqs_off(CALLER_ADDR0);
- lockdep_hardirq_enter();
- rcu_nmi_enter();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- ftrace_nmi_enter();
- instrumentation_end();
-
- return irq_state;
-}
-
-noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
-{
- instrumentation_begin();
- ftrace_nmi_exit();
- if (restore) {
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- }
- instrumentation_end();
-
- rcu_nmi_exit();
- lockdep_hardirq_exit();
- if (restore)
- lockdep_hardirqs_on(CALLER_ADDR0);
- __nmi_exit();
-}
-
#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b2442eb0ac2f..247a60a47331 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -11,9 +11,6 @@

#include <asm/irq_stack.h>

-bool idtentry_enter_nmi(struct pt_regs *regs);
-void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
-
/**
* DECLARE_IDTENTRY - Declare functions for simple IDT entry points
* No error code pushed by hardware
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 1c08cb9eb9f6..eb3338c0bbc1 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1983,7 +1983,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;

static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- bool irq_state;
+ irqentry_state_t irq_state;

WARN_ON_ONCE(user_mode(regs));

@@ -1995,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
mce_check_crashing_cpu())
return;

- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
/*
* The call targets are marked noinstr, but objtool can't figure
* that out because it's an indirect call. Annotate it.
@@ -2006,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}

static __always_inline void exc_machine_check_user(struct pt_regs *regs)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4bc77aaf1303..bf250a339655 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);

DEFINE_IDTENTRY_RAW(exc_nmi)
{
- bool irq_state;
+ irqentry_state_t irq_state;

/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
@@ -502,14 +502,14 @@ DEFINE_IDTENTRY_RAW(exc_nmi)

this_cpu_write(nmi_dr7, local_db_save());

- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);

inc_irq_stat(__nmi_count);

if (!ignore_nmis)
default_do_nmi(regs);

- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);

local_db_restore(this_cpu_read(nmi_dr7));

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c70fb34028b..bffbbe29fc8c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif

- idtentry_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

@@ -651,12 +651,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
irqentry_exit_to_user_mode(regs);
} else {
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
instrumentation_begin();
if (!do_int3(regs))
die("int3", regs, 0);
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
}

@@ -864,7 +865,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
* includes the entry stack is excluded for everything.
*/
unsigned long dr7 = local_db_save();
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
instrumentation_begin();

/*
@@ -907,7 +908,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
regs->flags &= ~X86_EFLAGS_TF;
out:
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);

local_db_restore(dr7);
}
@@ -925,7 +926,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,

/*
* NB: We can't easily clear DR7 here because
- * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * irqentry_exit_to_user_mode() can invoke ptrace, schedule, access
* user memory, etc. This means that a recursive #DB is possible. If
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
* Since we're not on the IST stack right now, everything will be
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..47f9a0658acf 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -342,7 +342,10 @@ void irqentry_exit_to_user_mode(struct pt_regs *regs);

#ifndef irqentry_state
typedef struct irqentry_state {
- bool exit_rcu;
+ union {
+ bool exit_rcu;
+ bool lockdep;
+ };
} irqentry_state_t;
#endif

@@ -402,4 +405,23 @@ void irqentry_exit_cond_resched(void);
*/
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);

+/**
+ * irqentry_nmi_enter - Handle NMI entry
+ * @regs: Pointer to current's pt_regs
+ *
+ * Similar to irqentry_enter() but taking care of the NMI constraints.
+ */
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
+
+/**
+ * irqentry_nmi_exit - Handle return from NMI handling
+ * @regs: Pointer to pt_regs (NMI entry regs)
+ * @irq_state: Return value from matching call to irqentry_nmi_enter()
+ *
+ * Last action before returning to the low level assembly code.
+ *
+ * Counterpart to irqentry_nmi_enter().
+ */
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a1e20f8d4e8..5cc2e4174d7c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -398,3 +398,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
rcu_irq_exit();
}
}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ rcu_nmi_enter();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ }
+ instrumentation_end();
+
+ rcu_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
--
2.28.0.rc0.12.gb6a658bd00c9


2020-10-24 09:21:17

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 06/10] x86/entry: Move nmi entry/exit into common code

On Thu, Oct 22 2020 at 15:26, ira weiny wrote:

> From: Thomas Gleixner <[email protected]>
>
> Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
> not any different on other architectures. Also the extra state type is not
> necessary, irqentry_state_t can carry the necessary information as well.
>
> Move it to common code and extend irqentry_state_t to carry lockdep
> state.

This lacks something like:

[ Ira: Made the states a union as they are mutually exclusive and added
the missing kernel doc ]

Hrm.

> #ifndef irqentry_state
> typedef struct irqentry_state {
> - bool exit_rcu;
> + union {
> + bool exit_rcu;
> + bool lockdep;
> + };
> } irqentry_state_t;
> #endif

-E_NO_KERNELDOC

Thanks,

tglx

2020-10-27 14:12:20

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH 06/10] x86/entry: Move nmi entry/exit into common code

On Fri, Oct 23, 2020 at 11:50:11PM +0200, Thomas Gleixner wrote:
> On Thu, Oct 22 2020 at 15:26, ira weiny wrote:
>
> > From: Thomas Gleixner <[email protected]>
> >
> > Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
> > not any different on other architectures. Also the extra state type is not
> > necessary, irqentry_state_t can carry the necessary information as well.
> >
> > Move it to common code and extend irqentry_state_t to carry lockdep
> > state.
>
> This lacks something like:
>
> [ Ira: Made the states a union as they are mutually exclusive and added
> the missing kernel doc ]

Fair enough. done.

>
> Hrm.
>
> > #ifndef irqentry_state
> > typedef struct irqentry_state {
> > - bool exit_rcu;
> > + union {
> > + bool exit_rcu;
> > + bool lockdep;
> > + };
> > } irqentry_state_t;
> > #endif
>
> -E_NO_KERNELDOC

Adding: Paul McKenney

I'm happy to write something but I'm very unfamiliar with this code. So I'm
getting confused what exactly exit_rcu is flagging.

I can see that exit_rcu is a bad name for the state used in
irqentry_nmi_[enter|exit](). Furthermore, I see why 'lockdep' is a better
name. But similar lockdep handling is used in irqentry_exit() if exit_rcu is
true...


Given my limited knowledge; here is my proposed text:

/**
* struct irqentry_state - Opaque object for exception state storage
* @exit_rcu: Used exclusively in the irqentry_*() calls; tracks if the
* exception hit the idle task which requires special handling,
* including calling rcu_irq_exit(), when the exception exits.
* @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures lockdep
* tracking is maintained if hardirqs were already enabled
*
* This opaque object is filled in by the irqentry_*_enter() functions and
* should be passed back into the corresponding irqentry_*_exit() functions
* when the exception is complete.
*
* Callers of irqentry_*_[enter|exit]() should consider this structure opaque
* and all members private. Descriptions of the members are provided to aid in
* the maintenance of the irqentry_*() functions.
*/


Perhaps Paul can enlighten me on how exit_rcu is used beyond just flagging a
call to rcu_irq_exit()?

Why do we call lockdep_hardirqs_off() only when in the idle task? That implies
that regs_irqs_disabled() can only be false if we were in the idle task to
match up the lockdep on/off calls. This does not make sense to me because why
do we need the extra check for exit_rcu? I'm still trying to understand when
regs_irqs_disabled() is false.


} else if (!regs_irqs_disabled(regs)) {
...
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
rcu_irq_exit();
}

Also, the comment in irqentry_enter() refers to irq_enter_from_user_mode() which
does not seem to exist anymore. So I'm not sure what careful sequence it is
referring to.

/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irq_enter_from_user_mode().
*/

?

Ira

2020-10-28 18:16:26

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 06/10] x86/entry: Move nmi entry/exit into common code

On Tue, Oct 27 2020 at 00:07, Ira Weiny wrote:
> On Fri, Oct 23, 2020 at 11:50:11PM +0200, Thomas Gleixner wrote:
>> > #ifndef irqentry_state
>> > typedef struct irqentry_state {
>> > - bool exit_rcu;
>> > + union {
>> > + bool exit_rcu;
>> > + bool lockdep;
>> > + };
>> > } irqentry_state_t;
>> > #endif
>>
>> -E_NO_KERNELDOC
>
> Adding: Paul McKenney
>
> I'm happy to write something but I'm very unfamiliar with this code. So I'm
> getting confused what exactly exit_rcu is flagging.
>
> I can see that exit_rcu is a bad name for the state used in
> irqentry_nmi_[enter|exit](). Furthermore, I see why 'lockdep' is a better
> name. But similar lockdep handling is used in irqentry_exit() if exit_rcu is
> true...

No, it's not similar at all. Lockdep state vs. interrupts and regular
exceptions is always consistent.

In the NMI case, that's not guaranteed because of

local_irq_disable()
arch_local_irq_disable()
<- NMI race window
trace_hardirqs_off()

The same applies the other way round:

local_irq_enable()
trace_hardirqs_on()
<- NMI race window
arch_local_irq_enable()

IOW, the hardware state and the lockdep state are not consistent.

> /**
> * struct irqentry_state - Opaque object for exception state storage
> * @exit_rcu: Used exclusively in the irqentry_*() calls; tracks if the
> * exception hit the idle task which requires special handling,
> * including calling rcu_irq_exit(), when the exception
> exits.

calls; signals whether the exit path has to invoke rcu_irq_exit().

> * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures lockdep
> * tracking is maintained if hardirqs were already enabled

ensures that lockdep state is restored correctly on exit from nmi.

> *
> * This opaque object is filled in by the irqentry_*_enter() functions and
> * should be passed back into the corresponding irqentry_*_exit()
> functions

s/should/must/

> * when the exception is complete.
> *
> * Callers of irqentry_*_[enter|exit]() should consider this structure
> opaque

s/should/must/

> * and all members private. Descriptions of the members are provided to aid in
> * the maintenance of the irqentry_*() functions.
> */
>
> Perhaps Paul can enlighten me on how exit_rcu is used beyond just flagging a
> call to rcu_irq_exit()?

I can do that as well :) The only purpose is to invoke rcu_irq_exit()
conditionally.

> Why do we call lockdep_hardirqs_off() only when in the idle task? That implies
> that regs_irqs_disabled() can only be false if we were in the idle task to
> match up the lockdep on/off calls.

You're reading the code slightly wrong.

> This does not make sense to me because why do we need the extra check
> for exit_rcu? I'm still trying to understand when regs_irqs_disabled() is false.

It's false when the interrupted context had interrupts enabled.

So we have the following scenarios:

Usermode   Idletask   irqs enabled   RCU entry   RCU exit
   Y          N            Y             Y          Y

   N          N            Y             N          N
   N          N            N             N          N
   N          Y            Y             Y          Y
   N          Y            N             Y          Y

Now you might wonder about irqs enabled/disabled. This code is not only
used for interrupts (device, ipi, local timer...) where interrupts are
obviously enabled, it's also used for exception entry/exit. You can have
e.g. pagefaults in interrupt disabled regions.

> Also, the comment in irqentry_enter() refers to irq_enter_from_user_mode() which
> does not seem to exist anymore. So I'm not sure what careful sequence it is
> referring to.

That was renamed to irqentry_enter_from_user_mode() and the comment was
not updated. Sorry for leaving this hard to solve puzzle around.

Thanks,

tglx