From: Thomas Gleixner <[email protected]>
Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
not any different on other architectures. Also the extra state type is not
necessary, irqentry_state_t can carry the necessary information as well.
Move it to common code and extend irqentry_state_t to carry lockdep state.
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Ira Weiny <[email protected]>
---
arch/x86/entry/common.c | 34 -------------------------------
arch/x86/include/asm/idtentry.h | 3 ---
arch/x86/kernel/cpu/mce/core.c | 6 +++---
arch/x86/kernel/nmi.c | 6 +++---
arch/x86/kernel/traps.c | 13 ++++++------
include/linux/entry-common.h | 24 +++++++++++++++++++++-
kernel/entry/common.c | 36 +++++++++++++++++++++++++++++++++
7 files changed, 72 insertions(+), 50 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..18d8f17f755c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
}
-noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
-{
- bool irq_state = lockdep_hardirqs_enabled();
-
- __nmi_enter();
- lockdep_hardirqs_off(CALLER_ADDR0);
- lockdep_hardirq_enter();
- rcu_nmi_enter();
-
- instrumentation_begin();
- trace_hardirqs_off_finish();
- ftrace_nmi_enter();
- instrumentation_end();
-
- return irq_state;
-}
-
-noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
-{
- instrumentation_begin();
- ftrace_nmi_exit();
- if (restore) {
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(CALLER_ADDR0);
- }
- instrumentation_end();
-
- rcu_nmi_exit();
- lockdep_hardirq_exit();
- if (restore)
- lockdep_hardirqs_on(CALLER_ADDR0);
- __nmi_exit();
-}
-
#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b2442eb0ac2f..247a60a47331 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -11,9 +11,6 @@
#include <asm/irq_stack.h>
-bool idtentry_enter_nmi(struct pt_regs *regs);
-void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
-
/**
* DECLARE_IDTENTRY - Declare functions for simple IDT entry points
* No error code pushed by hardware
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 1c08cb9eb9f6..eb3338c0bbc1 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1983,7 +1983,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- bool irq_state;
+ irqentry_state_t irq_state;
WARN_ON_ONCE(user_mode(regs));
@@ -1995,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
mce_check_crashing_cpu())
return;
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
/*
* The call targets are marked noinstr, but objtool can't figure
* that out because it's an indirect call. Annotate it.
@@ -2006,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4bc77aaf1303..bf250a339655 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
- bool irq_state;
+ irqentry_state_t irq_state;
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
@@ -502,14 +502,14 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
this_cpu_write(nmi_dr7, local_db_save());
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(this_cpu_read(nmi_dr7));
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c70fb34028b..bffbbe29fc8c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif
- idtentry_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -651,12 +651,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
irqentry_exit_to_user_mode(regs);
} else {
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
instrumentation_begin();
if (!do_int3(regs))
die("int3", regs, 0);
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
}
@@ -864,7 +865,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
* includes the entry stack is excluded for everything.
*/
unsigned long dr7 = local_db_save();
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
instrumentation_begin();
/*
@@ -907,7 +908,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
regs->flags &= ~X86_EFLAGS_TF;
out:
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(dr7);
}
@@ -925,7 +926,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
/*
* NB: We can't easily clear DR7 here because
- * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * irqentry_exit_to_user_mode() can invoke ptrace, schedule, access
* user memory, etc. This means that a recursive #DB is possible. If
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
* Since we're not on the IST stack right now, everything will be
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..47f9a0658acf 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -342,7 +342,10 @@ void irqentry_exit_to_user_mode(struct pt_regs *regs);
#ifndef irqentry_state
typedef struct irqentry_state {
- bool exit_rcu;
+ union {
+ bool exit_rcu;
+ bool lockdep;
+ };
} irqentry_state_t;
#endif
@@ -402,4 +405,23 @@ void irqentry_exit_cond_resched(void);
*/
void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
+/**
+ * irqentry_nmi_enter - Handle NMI entry
+ * @regs: Pointer to current's pt_regs
+ *
+ * Similar to irqentry_enter() but taking care of the NMI constraints.
+ */
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
+
+/**
+ * irqentry_nmi_exit - Handle return from NMI handling
+ * @regs: Pointer to pt_regs (NMI entry regs)
+ * @irq_state: Return value from matching call to irqentry_nmi_enter()
+ *
+ * Last action before returning to the low level assembly code.
+ *
+ * Counterpart to irqentry_nmi_enter().
+ */
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a1e20f8d4e8..5cc2e4174d7c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -398,3 +398,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
rcu_irq_exit();
}
}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ rcu_nmi_enter();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ }
+ instrumentation_end();
+
+ rcu_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
--
2.28.0.rc0.12.gb6a658bd00c9
On Thu, Oct 22 2020 at 15:26, ira weiny wrote:
> From: Thomas Gleixner <[email protected]>
>
> Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
> not any different on other architectures. Also the extra state type is not
> necessary, irqentry_state_t can carry the necessary information as well.
>
> Move it to common code and extend irqentry_state_t to carry lockdep
> state.
This lacks something like:
[ Ira: Made the states a union as they are mutually exclusive and added
the missing kernel doc ]
Hrm.
> #ifndef irqentry_state
> typedef struct irqentry_state {
> - bool exit_rcu;
> + union {
> + bool exit_rcu;
> + bool lockdep;
> + };
> } irqentry_state_t;
> #endif
-E_NO_KERNELDOC
Thanks,
tglx
On Fri, Oct 23, 2020 at 11:50:11PM +0200, Thomas Gleixner wrote:
> On Thu, Oct 22 2020 at 15:26, ira weiny wrote:
>
> > From: Thomas Gleixner <[email protected]>
> >
> > Lockdep state handling on NMI enter and exit is nothing specific to X86. It's
> > not any different on other architectures. Also the extra state type is not
> > necessary, irqentry_state_t can carry the necessary information as well.
> >
> > Move it to common code and extend irqentry_state_t to carry lockdep
> > state.
>
> This lacks something like:
>
> [ Ira: Made the states a union as they are mutually exclusive and added
> the missing kernel doc ]
Fair enough. done.
>
> Hrm.
>
> > #ifndef irqentry_state
> > typedef struct irqentry_state {
> > - bool exit_rcu;
> > + union {
> > + bool exit_rcu;
> > + bool lockdep;
> > + };
> > } irqentry_state_t;
> > #endif
>
> -E_NO_KERNELDOC
Adding: Paul McKenney
I'm happy to write something but I'm very unfamiliar with this code. So I'm
getting confused what exactly exit_rcu is flagging.
I can see that exit_rcu is a bad name for the state used in
irqentry_nmi_[enter|exit](). Furthermore, I see why 'lockdep' is a better
name. But similar lockdep handling is used in irqentry_exit() if exit_rcu is
true...
Given my limited knowledge; here is my proposed text:
/**
* struct irqentry_state - Opaque object for exception state storage
* @exit_rcu: Used exclusively in the irqentry_*() calls; tracks if the
* exception hit the idle task which requires special handling,
* including calling rcu_irq_exit(), when the exception exits.
* @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures lockdep
* tracking is maintained if hardirqs were already enabled
*
* This opaque object is filled in by the irqentry_*_enter() functions and
* should be passed back into the corresponding irqentry_*_exit() functions
* when the exception is complete.
*
* Callers of irqentry_*_[enter|exit]() should consider this structure opaque
* and all members private. Descriptions of the members are provided to aid in
* the maintenance of the irqentry_*() functions.
*/
Perhaps Paul can enlighten me on how exit_rcu is used beyond just flagging a
call to rcu_irq_exit()?
Why do we call lockdep_hardirqs_off() only when in the idle task? That implies
that regs_irqs_disabled() can only be false if we were in the idle task to
match up the lockdep on/off calls. This does not make sense to me because why
do we need the extra check for exit_rcu? I'm still trying to understand when
regs_irqs_disabled() is false.
} else if (!regs_irqs_disabled(regs)) {
...
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
rcu_irq_exit();
}
Also, the comment in irqentry_enter() refers to irq_enter_from_user_mode() which
does not seem to exist anymore. So I'm not sure what careful sequence it is
referring to.
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irq_enter_from_user_mode().
*/
?
Ira
On Tue, Oct 27 2020 at 00:07, Ira Weiny wrote:
> On Fri, Oct 23, 2020 at 11:50:11PM +0200, Thomas Gleixner wrote:
>> > #ifndef irqentry_state
>> > typedef struct irqentry_state {
>> > - bool exit_rcu;
>> > + union {
>> > + bool exit_rcu;
>> > + bool lockdep;
>> > + };
>> > } irqentry_state_t;
>> > #endif
>>
>> -E_NO_KERNELDOC
>
> Adding: Paul McKenney
>
> I'm happy to write something but I'm very unfamiliar with this code. So I'm
> getting confused what exactly exit_rcu is flagging.
>
> I can see that exit_rcu is a bad name for the state used in
> irqentry_nmi_[enter|exit](). Furthermore, I see why 'lockdep' is a better
> name. But similar lockdep handling is used in irqentry_exit() if exit_rcu is
> true...
No, it's not similar at all. Lockdep state vs. interrupts and regular
exceptions is always consistent.
In the NMI case, that's not guaranteed because of
local_irq_disable()
arch_local_irq_disable()
<- NMI race window
trace_hardirqs_off()
The same applies the other way round:
local_irq_enable()
trace_hardirqs_on()
<- NMI race window
arch_local_irq_enable()
IOW, the hardware state and the lockdep state are not consistent.
> /**
> * struct irqentry_state - Opaque object for exception state storage
> * @exit_rcu: Used exclusively in the irqentry_*() calls; tracks if the
> * exception hit the idle task which requires special handling,
> * including calling rcu_irq_exit(), when the exception
> exits.
calls; signals whether the exit path has to invoke rcu_irq_exit().
> * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures lockdep
> * tracking is maintained if hardirqs were already enabled
ensures that lockdep state is restored correctly on exit from nmi.
> *
> * This opaque object is filled in by the irqentry_*_enter() functions and
> * should be passed back into the corresponding irqentry_*_exit()
> functions
s/should/must/
> * when the exception is complete.
> *
> * Callers of irqentry_*_[enter|exit]() should consider this structure
> opaque
s/should/must/
> * and all members private. Descriptions of the members are provided to aid in
> * the maintenance of the irqentry_*() functions.
> */
>
> Perhaps Paul can enlighten me on how exit_rcu is used beyond just flagging a
> call to rcu_irq_exit()?
I can do that as well :) The only purpose is to invoke rcu_irq_exit()
conditionally.
> Why do we call lockdep_hardirqs_off() only when in the idle task? That implies
> that regs_irqs_disabled() can only be false if we were in the idle task to
> match up the lockdep on/off calls.
You're reading the code slightly wrong.
> This does not make sense to me because why do we need the extra check
> for exit_rcu? I'm still trying to understand when regs_irqs_disabled() is false.
It's false when the interrupted context had interrupts enabled.
So we have the following scenarios:
Usermode Idletask irqs enabled RCU entry RCU exit
Y N Y Y Y
N N Y N N
N N N N N
N Y Y Y Y
N Y N Y Y
Now you might wonder about irqs enabled/disabled. This code is not only
used for interrupts (device, ipi, local timer...) where interrupts are
obviously enabled, it's also used for exception entry/exit. You can have
e.g. pagefaults in interrupt disabled regions.
> Also, the comment in irqentry_enter() refers to irq_enter_from_user_mode() which
> does not seem to exist anymore. So I'm not sure what careful sequence it is
> referring to.
That was renamed to irqentry_enter_from_user_mode() and the comment was
not updated. Sorry for leaving this hard to solve puzzle around.
Thanks,
tglx