Currently, arch_cpu_idle() is called with IRQs disabled, but will return
with IRQs enabled.
However, the very first thing the generic code does after calling
arch_cpu_idle() is raw_local_irq_disable(). This means that
architectures that can idle with IRQs disabled end up doing a
pointless 'enable-disable' dance.
Therefore, push this IRQ disabling into the idle function, so that
those architectures can avoid the pointless IRQ state flipping.
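For illustration, a minimal sketch (not the literal kernel code) of the
pre-patch call sequence in kernel/sched/idle.c:

        /* IRQs are disabled when the idle loop gets here */
        arch_cpu_idle();                /* old contract: returns with IRQs enabled */
        raw_local_irq_disable();        /* ...which the caller undoes immediately */

With this patch, arch_cpu_idle() is entered and left with IRQs disabled;
only implementations whose idle instruction needs IRQs enabled (e.g. HLT)
flip them internally.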
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/alpha/kernel/process.c | 1 -
arch/arc/kernel/process.c | 3 +++
arch/arm/kernel/process.c | 1 -
arch/arm/mach-gemini/board-dt.c | 3 ++-
arch/arm64/kernel/idle.c | 1 -
arch/csky/kernel/process.c | 1 -
arch/csky/kernel/smp.c | 2 +-
arch/h8300/kernel/process.c | 1 +
arch/hexagon/kernel/process.c | 1 -
arch/ia64/kernel/process.c | 1 +
arch/microblaze/kernel/process.c | 1 -
arch/mips/kernel/idle.c | 8 +++-----
arch/nios2/kernel/process.c | 1 -
arch/openrisc/kernel/process.c | 1 +
arch/parisc/kernel/process.c | 2 --
arch/powerpc/kernel/idle.c | 5 ++---
arch/riscv/kernel/process.c | 1 -
arch/s390/kernel/idle.c | 1 -
arch/sh/kernel/idle.c | 1 +
arch/sparc/kernel/leon_pmc.c | 4 ++++
arch/sparc/kernel/process_32.c | 1 -
arch/sparc/kernel/process_64.c | 3 ++-
arch/um/kernel/process.c | 1 -
arch/x86/coco/tdx/tdx.c | 3 +++
arch/x86/kernel/process.c | 14 ++++----------
arch/xtensa/kernel/process.c | 1 +
kernel/sched/idle.c | 2 --
27 files changed, 29 insertions(+), 36 deletions(-)
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -57,7 +57,6 @@ EXPORT_SYMBOL(pm_power_off);
void arch_cpu_idle(void)
{
wtint(0);
- raw_local_irq_enable();
}
void arch_cpu_idle_dead(void)
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -114,6 +114,8 @@ void arch_cpu_idle(void)
"sleep %0 \n"
:
:"I"(arg)); /* can't be "r" has to be embedded const */
+
+ raw_local_irq_disable();
}
#else /* ARC700 */
@@ -122,6 +124,7 @@ void arch_cpu_idle(void)
{
/* sleep, but enable both set E1/E2 (levels of interrupts) before committing */
__asm__ __volatile__("sleep 0x3 \n");
+ raw_local_irq_disable();
}
#endif
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -78,7 +78,6 @@ void arch_cpu_idle(void)
arm_pm_idle();
else
cpu_do_idle();
- raw_local_irq_enable();
}
void arch_cpu_idle_prepare(void)
--- a/arch/arm/mach-gemini/board-dt.c
+++ b/arch/arm/mach-gemini/board-dt.c
@@ -42,8 +42,9 @@ static void gemini_idle(void)
*/
/* FIXME: Enabling interrupts here is racy! */
- local_irq_enable();
+ raw_local_irq_enable();
cpu_do_idle();
+ raw_local_irq_disable();
}
static void __init gemini_init_machine(void)
--- a/arch/arm64/kernel/idle.c
+++ b/arch/arm64/kernel/idle.c
@@ -42,5 +42,4 @@ void noinstr arch_cpu_idle(void)
* tricks
*/
cpu_do_idle();
- raw_local_irq_enable();
}
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -102,6 +102,5 @@ void arch_cpu_idle(void)
#ifdef CONFIG_CPU_PM_STOP
asm volatile("stop\n");
#endif
- raw_local_irq_enable();
}
#endif
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -314,7 +314,7 @@ void arch_cpu_idle_dead(void)
while (!secondary_stack)
arch_cpu_idle();
- local_irq_disable();
+ raw_local_irq_disable();
asm volatile(
"mov sp, %0\n"
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -59,6 +59,7 @@ void arch_cpu_idle(void)
{
raw_local_irq_enable();
__asm__("sleep");
+ raw_local_irq_disable();
}
void machine_restart(char *__unused)
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -44,7 +44,6 @@ void arch_cpu_idle(void)
{
__vmwait();
/* interrupts wake us up, but irqs are still disabled */
- raw_local_irq_enable();
}
/*
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -241,6 +241,7 @@ void arch_cpu_idle(void)
(*mark_idle)(1);
raw_safe_halt();
+ raw_local_irq_disable();
if (mark_idle)
(*mark_idle)(0);
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -138,5 +138,4 @@ int dump_fpu(struct pt_regs *regs, elf_f
void arch_cpu_idle(void)
{
- raw_local_irq_enable();
}
--- a/arch/mips/kernel/idle.c
+++ b/arch/mips/kernel/idle.c
@@ -33,13 +33,13 @@ static void __cpuidle r3081_wait(void)
{
unsigned long cfg = read_c0_conf();
write_c0_conf(cfg | R30XX_CONF_HALT);
- raw_local_irq_enable();
}
void __cpuidle r4k_wait(void)
{
raw_local_irq_enable();
__r4k_wait();
+ raw_local_irq_disable();
}
/*
@@ -57,7 +57,6 @@ void __cpuidle r4k_wait_irqoff(void)
" .set arch=r4000 \n"
" wait \n"
" .set pop \n");
- raw_local_irq_enable();
}
/*
@@ -77,7 +76,6 @@ static void __cpuidle rm7k_wait_irqoff(v
" wait \n"
" mtc0 $1, $12 # stalls until W stage \n"
" .set pop \n");
- raw_local_irq_enable();
}
/*
@@ -103,6 +101,8 @@ static void __cpuidle au1k_wait(void)
" nop \n"
" .set pop \n"
: : "r" (au1k_wait), "r" (c0status));
+
+ raw_local_irq_disable();
}
static int __initdata nowait;
@@ -245,8 +245,6 @@ void arch_cpu_idle(void)
{
if (cpu_wait)
cpu_wait();
- else
- raw_local_irq_enable();
}
#ifdef CONFIG_CPU_IDLE
--- a/arch/nios2/kernel/process.c
+++ b/arch/nios2/kernel/process.c
@@ -33,7 +33,6 @@ EXPORT_SYMBOL(pm_power_off);
void arch_cpu_idle(void)
{
- raw_local_irq_enable();
}
/*
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -87,6 +87,7 @@ void arch_cpu_idle(void)
raw_local_irq_enable();
if (mfspr(SPR_UPR) & SPR_UPR_PMP)
mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
+ raw_local_irq_disable();
}
void (*pm_power_off) (void) = machine_power_off;
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -187,8 +187,6 @@ void arch_cpu_idle_dead(void)
void __cpuidle arch_cpu_idle(void)
{
- raw_local_irq_enable();
-
/* nop on real hardware, qemu will idle sleep. */
asm volatile("or %%r10,%%r10,%%r10\n":::);
}
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -51,10 +51,9 @@ void arch_cpu_idle(void)
* Some power_save functions return with
* interrupts enabled, some don't.
*/
- if (irqs_disabled())
- raw_local_irq_enable();
+ if (!irqs_disabled())
+ raw_local_irq_disable();
} else {
- raw_local_irq_enable();
/*
* Go into low thread priority and possibly
* low power mode.
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -39,7 +39,6 @@ extern asmlinkage void ret_from_kernel_t
void arch_cpu_idle(void)
{
cpu_do_idle();
- raw_local_irq_enable();
}
void __show_regs(struct pt_regs *regs)
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -66,7 +66,6 @@ void arch_cpu_idle(void)
idle->idle_count++;
account_idle_time(cputime_to_nsecs(idle_time));
raw_write_seqcount_end(&idle->seqcount);
- raw_local_irq_enable();
}
static ssize_t show_idle_count(struct device *dev,
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -25,6 +25,7 @@ void default_idle(void)
raw_local_irq_enable();
/* Isn't this racy ? */
cpu_sleep();
+ raw_local_irq_disable();
clear_bl_bit();
}
--- a/arch/sparc/kernel/leon_pmc.c
+++ b/arch/sparc/kernel/leon_pmc.c
@@ -57,6 +57,8 @@ static void pmc_leon_idle_fixup(void)
"lda [%0] %1, %%g0\n"
:
: "r"(address), "i"(ASI_LEON_BYPASS));
+
+ raw_local_irq_disable();
}
/*
@@ -70,6 +72,8 @@ static void pmc_leon_idle(void)
/* For systems without power-down, this will be no-op */
__asm__ __volatile__ ("wr %g0, %asr19\n\t");
+
+ raw_local_irq_disable();
}
/* Install LEON Power Down function */
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -71,7 +71,6 @@ void arch_cpu_idle(void)
{
if (sparc_idle)
(*sparc_idle)();
- raw_local_irq_enable();
}
/* XXX cli/sti -> local_irq_xxx here, check this works once SMP is fixed. */
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -59,7 +59,6 @@ void arch_cpu_idle(void)
{
if (tlb_type != hypervisor) {
touch_nmi_watchdog();
- raw_local_irq_enable();
} else {
unsigned long pstate;
@@ -90,6 +89,8 @@ void arch_cpu_idle(void)
"wrpr %0, %%g0, %%pstate"
: "=&r" (pstate)
: "i" (PSTATE_IE));
+
+ raw_local_irq_disable();
}
}
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -216,7 +216,6 @@ void arch_cpu_idle(void)
{
cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
um_idle_sleep();
- raw_local_irq_enable();
}
int __cant_sleep(void) {
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -178,6 +178,9 @@ void __cpuidle tdx_safe_halt(void)
*/
if (__halt(irq_disabled, do_sti))
WARN_ONCE(1, "HLT instruction emulation failed\n");
+
+ /* XXX I can't make sense of what @do_sti actually does */
+ raw_local_irq_disable();
}
static bool read_msr(struct pt_regs *regs)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -699,6 +699,7 @@ EXPORT_SYMBOL(boot_option_idle_override)
void __cpuidle default_idle(void)
{
raw_safe_halt();
+ raw_local_irq_disable();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
@@ -804,13 +805,7 @@ static void amd_e400_idle(void)
default_idle();
- /*
- * The switch back from broadcast mode needs to be called with
- * interrupts disabled.
- */
- raw_local_irq_disable();
tick_broadcast_exit();
- raw_local_irq_enable();
}
/*
@@ -849,12 +844,11 @@ static __cpuidle void mwait_idle(void)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
+ if (!need_resched()) {
__sti_mwait(0, 0);
- else
- raw_local_irq_enable();
+ raw_local_irq_disable();
+ }
} else {
- raw_local_irq_enable();
}
__current_clr_polling();
}
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -120,6 +120,7 @@ void coprocessor_flush_all(struct thread
void arch_cpu_idle(void)
{
platform_idle();
+ raw_local_irq_disable();
}
/*
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- raw_local_irq_enable();
}
/**
@@ -96,7 +95,6 @@ void __cpuidle default_idle_call(void)
cpuidle_rcu_enter();
arch_cpu_idle();
- raw_local_irq_disable();
cpuidle_rcu_exit();
start_critical_timings();
On Fri, May 20, 2022 at 05:20:52AM +0300, Kirill A. Shutemov wrote:
> On Fri, May 20, 2022 at 12:03:49AM +0200, Peter Zijlstra wrote:
> >
> > On Thu, May 19, 2022 at 11:27:59PM +0200, Peter Zijlstra wrote:
> > > --- a/arch/x86/coco/tdx/tdx.c
> > > +++ b/arch/x86/coco/tdx/tdx.c
> > > @@ -178,6 +178,9 @@ void __cpuidle tdx_safe_halt(void)
> > > */
> > > if (__halt(irq_disabled, do_sti))
> > > WARN_ONCE(1, "HLT instruction emulation failed\n");
> > > +
> > > + /* XXX I can't make sense of what @do_sti actually does */
> > > + raw_local_irq_disable();
> > > }
> > >
> >
> > Kirill, Dave says I should prod you :-)
>
> It calls STI just before doing the TDCALL that requests HLT.
> See comment above $TDX_HCALL_ISSUE_STI usage in __tdx_hypercall()[1].
Yes, it says that, but it's useless information since it doesn't
actually tell me the behaviour.
What I'm interested in is the behaviour of the hypercall when:
.irq_disabled=false, .do_sti=false
From what I can tell, irq_disabled=false should have the hypercall wake
on an interrupt, while do_sti=false should have it not enable interrupts.
But what does it actually do? Because HLT without STI is a dead
machine, but this hypercall looks more like mwait with the irq_disabled
argument...
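What I'd like to be able to write is something like the below (hypothetical
sketch; the function name is made up, and it assumes the hypercall wakes on a
pending interrupt even though RFLAGS.IF stays clear, like MWAIT):

        static void __cpuidle tdx_halt_irqoff(void)     /* hypothetical */
        {
                const bool irq_disabled = false;        /* do wake on interrupts */
                const bool do_sti = false;              /* but never execute STI */

                if (__halt(irq_disabled, do_sti))
                        WARN_ONCE(1, "HLT instruction emulation failed\n");

                /* RFLAGS.IF was never set, so no CLI needed on wakeup */
        }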
>
> __halt(do_sti == true) matches native_safe_halt() semantics (or is supposed
> to) and __halt(do_sti == false) corresponds to native_halt().
>
> For context, see Section 3.8 in GHCI[2]
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/arch/x86/coco/tdx/tdcall.S?h=x86/tdx#n151
> [2] https://www.intel.com/content/dam/develop/external/us/en/documents/intel-tdx-guest-hypervisor-communication-interface-1.0-344426-002.pdf
Yeah, that stuff is unreadable garbage. Not going to waste time trying
to make sense of it again.
On Thu, May 19, 2022 at 11:27:59PM +0200, Peter Zijlstra wrote:
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -178,6 +178,9 @@ void __cpuidle tdx_safe_halt(void)
> */
> if (__halt(irq_disabled, do_sti))
> WARN_ONCE(1, "HLT instruction emulation failed\n");
> +
> + /* XXX I can't make sense of what @do_sti actually does */
> + raw_local_irq_disable();
> }
>
Kirill, Dave says I should prod you :-)
On Fri, May 20, 2022 at 12:03:49AM +0200, Peter Zijlstra wrote:
>
> On Thu, May 19, 2022 at 11:27:59PM +0200, Peter Zijlstra wrote:
> > --- a/arch/x86/coco/tdx/tdx.c
> > +++ b/arch/x86/coco/tdx/tdx.c
> > @@ -178,6 +178,9 @@ void __cpuidle tdx_safe_halt(void)
> > */
> > if (__halt(irq_disabled, do_sti))
> > WARN_ONCE(1, "HLT instruction emulation failed\n");
> > +
> > + /* XXX I can't make sense of what @do_sti actually does */
> > + raw_local_irq_disable();
> > }
> >
>
> Kirill, Dave says I should prod you :-)
It calls STI just before doing the TDCALL that requests HLT.
See comment above $TDX_HCALL_ISSUE_STI usage in __tdx_hypercall()[1].
__halt(do_sti == true) matches native_safe_halt() semantics (or is supposed
to) and __halt(do_sti == false) corresponds to native_halt().
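Roughly, as a sketch of the intended mapping (argument values illustrative):

        __halt(/* irq_disabled */ false, /* do_sti */ true);    /* ~ native_safe_halt(): STI; HLT */
        __halt(/* irq_disabled */ true,  /* do_sti */ false);   /* ~ native_halt(): plain HLT */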
For context, see Section 3.8 in GHCI[2]
[1] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/arch/x86/coco/tdx/tdcall.S?h=x86/tdx#n151
[2] https://www.intel.com/content/dam/develop/external/us/en/documents/intel-tdx-guest-hypervisor-communication-interface-1.0-344426-002.pdf
--
Kirill A. Shutemov
On Fri, May 20, 2022 at 01:13:22PM +0300, Kirill A. Shutemov wrote:
> So you want to call the HLT hypercall with .irq_disabled=false and
> .do_sti=false, but actual RFLAGS.IF in the guest is 0 and avoid CLI on
> wake up expecting it to be cleared already, right?
Yep, just like MWAIT can, avoids pointless IF flipping.
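For reference, a sketch of the MWAIT idiom this mirrors: with ECX bit 0 set
(MWAIT_ECX_INTERRUPT_BREAK), MWAIT treats masked interrupts as break events,
so the CPU can wake without RFLAGS.IF ever being set:

        __monitor((void *)&current_thread_info()->flags, 0, 0);
        if (!need_resched())
                __mwait(0, MWAIT_ECX_INTERRUPT_BREAK);  /* wakes on masked IRQ */
        /* IRQs were never enabled; nothing to CLI on wakeup */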
> My reading of the spec is "don't do that". But actual behaviour is up to
> VMM and TDX module implementation. VMM doesn't have access to the guest
> register file, so it *may* work, I dunno.
> Yeah, it totally *can* work, but I've no idea if they did the right
thing.
On Fri, May 20, 2022 at 09:06:14AM +0200, Peter Zijlstra wrote:
> On Fri, May 20, 2022 at 05:20:52AM +0300, Kirill A. Shutemov wrote:
> > On Fri, May 20, 2022 at 12:03:49AM +0200, Peter Zijlstra wrote:
> > >
> > > On Thu, May 19, 2022 at 11:27:59PM +0200, Peter Zijlstra wrote:
> > > > --- a/arch/x86/coco/tdx/tdx.c
> > > > +++ b/arch/x86/coco/tdx/tdx.c
> > > > @@ -178,6 +178,9 @@ void __cpuidle tdx_safe_halt(void)
> > > > */
> > > > if (__halt(irq_disabled, do_sti))
> > > > WARN_ONCE(1, "HLT instruction emulation failed\n");
> > > > +
> > > > + /* XXX I can't make sense of what @do_sti actually does */
> > > > + raw_local_irq_disable();
> > > > }
> > > >
> > >
> > > Kirill, Dave says I should prod you :-)
> >
> > It calls STI just before doing TDCALL that requests HLT.
> > See comment above $TDX_HCALL_ISSUE_STI usage in __tdx_hypercall()[1].
>
> Yes, it says that, but it's useless information since it doesn't
> actually tell me the behaviour.
>
> What I'm interested in is the behaviour of the hypercall when:
> .irq_disabled=false, .do_sti=false
>
> From what I can tell, irq_disabled=false should have the hypercall wake
> on an interrupt, while do_sti=false should have it not enable interrupts.
>
> But what does it actually do? Because HLT without STI is a dead
> machine, but this hypercall looks more like mwait with the irq_disabled
> argument...
+Isaku.
So you want to call the HLT hypercall with .irq_disabled=false and
.do_sti=false, but actual RFLAGS.IF in the guest is 0 and avoid CLI on
wake up expecting it to be cleared already, right?
My reading of the spec is "don't do that". But actual behaviour is up to
VMM and TDX module implementation. VMM doesn't have access to the guest
register file, so it *may* work, I dunno.
Sorry for not being helpful :/
Isaku, maybe you can clarify this?
--
Kirill A. Shutemov
On Thu, May 19, 2022 at 11:27:59PM +0200, Peter Zijlstra wrote:
> Currently, arch_cpu_idle() is called with IRQs disabled, but will return
> with IRQs enabled.
>
> However, the very first thing the generic code does after calling
> arch_cpu_idle() is raw_local_irq_disable(). This means that
> architectures that can idle with IRQs disabled end up doing a
> pointless 'enable-disable' dance.
>
> Therefore, push this IRQ disabling into the idle function, so that
> those architectures can avoid the pointless IRQ state flipping.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
[...]
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -699,6 +699,7 @@ EXPORT_SYMBOL(boot_option_idle_override)
> void __cpuidle default_idle(void)
> {
> raw_safe_halt();
> + raw_local_irq_disable();
> }
> #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
> EXPORT_SYMBOL(default_idle);
> @@ -804,13 +805,7 @@ static void amd_e400_idle(void)
>
> default_idle();
>
> - /*
> - * The switch back from broadcast mode needs to be called with
> - * interrupts disabled.
> - */
> - raw_local_irq_disable();
> tick_broadcast_exit();
> - raw_local_irq_enable();
> }
>
> /*
> @@ -849,12 +844,11 @@ static __cpuidle void mwait_idle(void)
> }
>
> __monitor((void *)&current_thread_info()->flags, 0, 0);
> - if (!need_resched())
> + if (!need_resched()) {
> __sti_mwait(0, 0);
> - else
> - raw_local_irq_enable();
> + raw_local_irq_disable();
> + }
> } else {
> - raw_local_irq_enable();
> }
We don't need the outer else clause anymore.
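Something like this, perhaps (untested sketch of the resulting code):

        if (!current_set_polling_and_test()) {
                /* ... */
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched()) {
                        __sti_mwait(0, 0);
                        raw_local_irq_disable();
                }
        }
        __current_clr_polling();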
> __current_clr_polling();
> }
--
Thanks and Regards
gautham.
On Fri, May 20, 2022 at 02:58:19PM +0200,
Peter Zijlstra <[email protected]> wrote:
> On Fri, May 20, 2022 at 01:13:22PM +0300, Kirill A. Shutemov wrote:
>
> > So you want to call the HLT hypercall with .irq_disabled=false and
> > .do_sti=false, but actual RFLAGS.IF in the guest is 0 and avoid CLI on
> > wake up expecting it to be cleared already, right?
>
> Yep, just like MWAIT can, avoids pointless IF flipping.
>
> > My reading of the spec is "don't do that". But actual behaviour is up to
> > VMM and TDX module implementation. VMM doesn't have access to the guest
> > register file, so it *may* work, I dunno.
>
> Yeah, it totally *can* work, but I've no idea if they did the right
> thing.
There are two cases when an interrupt arrives.
- If the interrupt arrives after the CPU starts executing the VMM (or the TDX
module), the VMM can see that an interrupt for the vCPU has arrived and will
unblock vcpu scheduling. The HLT hypercall returns back to the guest.
- If the interrupt arrives and the vcpu recognizes it before the CPU starts
executing the VMM (or TDX module), the interrupt request is recorded in vRVI
(VMCS.RVI) because vRFLAGS.IF=0. After that, the CPU exits from the guest to
the VMM due to the HLT hypercall.
Before KVM blocks vcpu scheduling, because irq_disabled=false, TDX KVM checks
whether a deliverable interrupt event is pending via a TDX SEAMCALL (because
the CPU state is protected, the VMM can't peek at vRVI and vPPR directly;
note that vRFLAGS.IF is ignored in this check). If the vcpu has a deliverable
pending interrupt, the HLT hypercall returns.
Anyway, this scenario isn't tested; I need to test it.
--
Isaku Yamahata <[email protected]>