Subject: Re: [PATCH 3/6] x86/smpboot.c: Wake up offline CPU via mwait or nmi
From: Suresh Siddha
To: Fenghua Yu
Cc: Ingo Molnar, Thomas Gleixner, H Peter Anvin, Tony Luck, Asit K Mallick,
	Arjan van de Ven, linux-kernel, x86, linux-pm
Date: Mon, 04 Jun 2012 11:58:46 -0700
In-Reply-To: <1338833876-29721-4-git-send-email-fenghua.yu@intel.com>
References: <1338833876-29721-1-git-send-email-fenghua.yu@intel.com>
	<1338833876-29721-4-git-send-email-fenghua.yu@intel.com>
Message-ID: <1338836326.28766.3.camel@sbsiddha-desk.sc.intel.com>
Organization: Intel Corp

On Mon, 2012-06-04 at 11:17 -0700, Fenghua Yu wrote:
> From: Fenghua Yu
>
> wakeup_secondary_cpu_via_soft() is defined to wake up an offline CPU via mwait
> if the CPU is in mwait, or via NMI if the CPU is in hlt.
>
> A CPU is brought up with the INIT, INIT, STARTUP sequence when it comes up for
> the first time, either during boot or on hot plug.

I think this breaks suspend/resume as the cpu state gets lost. Have you
tried suspend/resume?

>
> Signed-off-by: Fenghua Yu
> ---
>  arch/x86/include/asm/apic.h |    5 +-
>  arch/x86/kernel/smpboot.c   |  187 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 164 insertions(+), 28 deletions(-)
>
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index eaff479..cad00b1 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -425,7 +425,10 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
>  #ifdef CONFIG_SMP
>  extern atomic_t init_deasserted;
>  extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
> -#endif
> +extern int wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip);
> +#else /* CONFIG_SMP */
> +#define wakeup_secondary_cpu_via_soft NULL
> +#endif /* CONFIG_SMP */
>
>  #ifdef CONFIG_X86_LOCAL_APIC
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index fd019d7..109df30 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,13 +472,8 @@ void __inquire_remote_apic(int apicid)
>  	}
>  }
>
> -/*
> - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> - * won't ... remember to clear down the APIC, etc later.
> - */
> -int __cpuinit
> -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +static int __cpuinit
> +_wakeup_secondary_cpu_via_nmi(int apicid, int dest_mode)
>  {
>  	unsigned long send_status, accept_status = 0;
>  	int maxlvt;
> @@ -486,7 +481,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
>  	/* Target chip */
>  	/* Boot on the stack */
>  	/* Kick the second */
> -	apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
> +	apic_icr_write(APIC_DM_NMI | dest_mode, apicid);
>
>  	pr_debug("Waiting for send to finish...\n");
>  	send_status = safe_apic_wait_icr_idle();
> @@ -511,6 +506,47 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
>  	return (send_status | accept_status);
>  }
>
> +/*
> + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> + * won't ... remember to clear down the APIC, etc later.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi_phys(int phys_apicid, unsigned long start_eip)
> +{
> +	return _wakeup_secondary_cpu_via_nmi(phys_apicid, APIC_DEST_PHYSICAL);
> +}
> +
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +{
> +	return _wakeup_secondary_cpu_via_nmi(logical_apicid, APIC_DEST_LOGICAL);
> +}
> +
> +DEFINE_PER_CPU(int, cpu_dead) = { 0 };
> +#define CPU_DEAD_TRIGGER 1
> +#define CPU_DEAD_MWAIT 2
> +#define CPU_DEAD_HLT 4
> +
> +static int wakeup_secondary_cpu_via_mwait(int cpu)
> +{
> +	per_cpu(cpu_dead, cpu) |= CPU_DEAD_TRIGGER;
> +	return 0;
> +}
> +
> +static int wakeup_cpu_nmi(unsigned int cmd, struct pt_regs *regs)
> +{
> +	int cpu = smp_processor_id();
> +	int *cpu_dead_ptr;
> +
> +	cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> +	if (!cpu_online(cpu) && (*cpu_dead_ptr & CPU_DEAD_HLT) &&
> +	    (*cpu_dead_ptr & CPU_DEAD_TRIGGER))
> +		return NMI_HANDLED;
> +
> +	return NMI_DONE;
> +}
> +
>  static int __cpuinit
>  wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
>  {
> @@ -626,6 +662,52 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
>  	return (send_status | accept_status);
>  }
>
> +/*
> + * Kick a cpu.
> + *
> + * If the CPU is in mwait, wake it up via the mwait method. Otherwise, if the
> + * CPU is in halt, wake it up via NMI. If neither applies, wake it up with the
> + * INIT boot APIC message.
> + *
> + * When the CPU boots up for the first time, i.e. cpu_dead is 0, it is woken up
> + * by the INIT boot APIC message.
> + *
> + * At this point, the CPU should be in a fixed dead state, so we don't need to
> + * consider races here.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip)
> +{
> +	int cpu;
> +	int boot_error = 0;
> +	/* start_ip had better be page-aligned! */
> +	unsigned long start_ip = real_mode_header->trampoline_start;
> +
> +	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
> +		if (apicid == apic->cpu_present_to_apicid(cpu))
> +			break;
> +
> +	if (cpu >= nr_cpu_ids)
> +		return -EINVAL;
> +
> +	if (per_cpu(cpu_dead, cpu) & CPU_DEAD_MWAIT) {
> +		boot_error = wakeup_secondary_cpu_via_mwait(cpu);
> +	} else if (per_cpu(cpu_dead, cpu) & CPU_DEAD_HLT) {
> +		int *cpu_dead_ptr;
> +
> +		cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> +		*cpu_dead_ptr |= CPU_DEAD_TRIGGER;
> +
> +		boot_error = wakeup_secondary_cpu_via_nmi_phys(apicid,
> +							       start_ip);
> +		if (boot_error)
> +			*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
> +	} else
> +		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
> +
> +	return boot_error;
> +}
> +
>  /* reduce the number of lines printed when booting a large cpu count system */
>  static void __cpuinit announce_cpu(int cpu, int apicid)
>  {
> @@ -778,6 +860,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>  		 */
>  		smpboot_restore_warm_reset_vector();
>  	}
> +
>  	return boot_error;
>  }
>
> @@ -977,6 +1060,20 @@ static void __init smp_cpu_index_default(void)
>  	}
>  }
>
> +static bool mwait_supported(void)
> +{
> +	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> +
> +	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> +		return false;
> +	if (!this_cpu_has(X86_FEATURE_CLFLSH))
> +		return false;
> +	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * Prepare for SMP bootup. The MP table or ACPI has been read
>   * earlier. Just do some sanity checking here and enable APIC mode.
> @@ -1051,6 +1148,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
>  		uv_system_init();
>
>  	set_mtrr_aps_delayed_init();
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +	if (!mwait_supported())
> +		register_nmi_handler(NMI_LOCAL, wakeup_cpu_nmi, 0, "wake_cpu");
> +#endif
>  out:
>  	preempt_enable();
>  }
> @@ -1111,6 +1213,12 @@ static int __init _setup_possible_cpus(char *str)
>  }
>  early_param("possible_cpus", _setup_possible_cpus);
>
> +static int __init setup_wakeup_cpu_via_init(char *str)
> +{
> +	apic->wakeup_secondary_cpu = NULL;
> +	return 0;
> +}
> +__setup("wakeup_cpu_via_init", setup_wakeup_cpu_via_init);
>
>  /*
>   * cpu_possible_mask should be static, it cannot change as cpu's
> @@ -1286,6 +1394,28 @@ void play_dead_common(void)
>  	local_irq_disable();
>  }
>
> +static bool wakeup_cpu(int *trigger)
> +{
> +	unsigned int timeout;
> +
> +	/*
> +	 * Wait up to 1 second to check if the CPU wakeup trigger is set in
> +	 * cpu_dead by either a memory write or an NMI.
> +	 * If there is no CPU wakeup trigger, go back to sleep.
> +	 */
> +	for (timeout = 0; timeout < 1000000; timeout++) {
> +		/*
> +		 * Check if CPU0 wakeup NMI is issued and handled.
> +		 */
> +		if (*trigger & CPU_DEAD_TRIGGER)
> +			return true;
> +
> +		udelay(1);
> +	}
> +
> +	return false;
> +}
> +
>  /*
>   * We need to flush the caches before going to sleep, lest we have
>   * dirty data in our caches when we come back up.
> @@ -1296,14 +1426,9 @@ static inline void mwait_play_dead(void)
>  	unsigned int highest_cstate = 0;
>  	unsigned int highest_subcstate = 0;
>  	int i;
> -	void *mwait_ptr;
> -	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> +	int *cpu_dead_ptr;
>
> -	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> -		return;
> -	if (!this_cpu_has(X86_FEATURE_CLFLSH))
> -		return;
> -	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> +	if (!mwait_supported())
>  		return;
>
>  	eax = CPUID_MWAIT_LEAF;
> @@ -1328,16 +1453,10 @@ static inline void mwait_play_dead(void)
>  			(highest_subcstate - 1);
>  	}
>
> -	/*
> -	 * This should be a memory location in a cache line which is
> -	 * unlikely to be touched by other processors. The actual
> -	 * content is immaterial as it is not actually modified in any way.
> -	 */
> -	mwait_ptr = &current_thread_info()->flags;
> -
> -	wbinvd();
> -
> +	cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> +	*cpu_dead_ptr = CPU_DEAD_MWAIT;
>  	while (1) {
> +		*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
>  		/*
>  		 * The CLFLUSH is a workaround for erratum AAI65 for
>  		 * the Xeon 7400 series.  It's not clear it is actually
> @@ -1345,20 +1464,34 @@ static inline void mwait_play_dead(void)
>  		 * The WBINVD is insufficient due to the spurious-wakeup
>  		 * case where we return around the loop.
>  		 */
> -		clflush(mwait_ptr);
> -		__monitor(mwait_ptr, 0, 0);
> +		wbinvd();
> +		clflush(cpu_dead_ptr);
> +		__monitor(cpu_dead_ptr, 0, 0);
>  		mb();
> -		__mwait(eax, 0);
> +		if ((*cpu_dead_ptr & CPU_DEAD_TRIGGER) == 0)
> +			__mwait(eax, 0);
> +
> +		/* Woken up by another CPU. */
> +		if (wakeup_cpu(cpu_dead_ptr))
> +			start_cpu();
>  	}
>  }
>
>  static inline void hlt_play_dead(void)
>  {
> +	int *cpu_dead_ptr;
> +
>  	if (__this_cpu_read(cpu_info.x86) >= 4)
>  		wbinvd();
>
> +	cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> +	*cpu_dead_ptr = CPU_DEAD_HLT;
>  	while (1) {
> +		*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
>  		native_halt();
> +		/* If an NMI wants to wake me up, start. */
> +		if (wakeup_cpu(cpu_dead_ptr))
> +			start_cpu();
>  	}
>  }
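
[Editorial sketch, not part of the original thread] The handshake the patch
relies on is small: the dying CPU publishes how it is parked (CPU_DEAD_MWAIT
or CPU_DEAD_HLT) in its per-cpu cpu_dead word and then waits, and the CPU
doing the onlining sets CPU_DEAD_TRIGGER in that word (by a plain store for
the mwait case, via NMI for the hlt case). A minimal user-space sketch of
that trigger-word protocol, with C11 atomics plus a busy-wait standing in
for MONITOR/MWAIT and the NMI; all names below are made up for illustration:

/*
 * Stand-alone illustration (not kernel code) of the trigger-word handshake:
 * the parked "CPU" advertises its wait state, then polls its trigger word;
 * the booting "CPU" sets a trigger bit to wake it.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define CPU_DEAD_TRIGGER 1
#define CPU_DEAD_MWAIT   2

static _Atomic int cpu_dead;       /* stands in for per_cpu(cpu_dead, cpu) */

/* "Offline CPU": advertise the mwait-style wait, then poll for the trigger. */
static void *play_dead(void *unused)
{
	atomic_store(&cpu_dead, CPU_DEAD_MWAIT);
	while (!(atomic_load(&cpu_dead) & CPU_DEAD_TRIGGER))
		usleep(1);         /* the kernel side would __mwait() here */
	printf("parked cpu: trigger seen, would jump to start_ip now\n");
	return NULL;
}

/* "Booting CPU": the equivalent of wakeup_secondary_cpu_via_mwait(). */
static void wake_parked_cpu(void)
{
	atomic_fetch_or(&cpu_dead, CPU_DEAD_TRIGGER);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, play_dead, NULL);
	while (!(atomic_load(&cpu_dead) & CPU_DEAD_MWAIT))
		usleep(1);         /* wait until the other "cpu" is parked */
	wake_parked_cpu();
	pthread_join(&t, NULL);
	return 0;
}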