2023-08-10 20:40:07

by Thomas Gleixner

[permalink] [raw]
Subject: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

From: Thomas Gleixner <[email protected]>

Offline CPUs need to be parked in a safe loop when microcode update is in
progress on the primary CPU. Currently offline CPUs are parked in
'mwait_play_dead()', and for Intel CPUs, its not a safe instruction, because
'mwait' instruction can be patched in the new microcode update that can
cause instability.

- Adds a new microcode state 'UCODE_OFFLINE' to report status on per-cpu
basis.
- Force NMI on the offline CPUs.

Wakeup offline CPUs while the update is in progress and then return them
back to 'mwait_play_dead()' after microcode update is complete.

Signed-off-by: Thomas Gleixner <[email protected]>

---
arch/x86/include/asm/microcode.h | 1
arch/x86/kernel/cpu/microcode/core.c | 112 +++++++++++++++++++++++++++++--
arch/x86/kernel/cpu/microcode/internal.h | 1
arch/x86/kernel/nmi.c | 5 +
4 files changed, 113 insertions(+), 6 deletions(-)
---
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -79,6 +79,7 @@ static inline void show_ucode_info_early
#endif /* !CONFIG_CPU_SUP_INTEL */

bool microcode_nmi_handler(void);
+void microcode_offline_nmi_handler(void);

#ifdef CONFIG_MICROCODE_LATE_LOADING
DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -341,8 +341,9 @@ struct ucode_ctrl {

DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
static DEFINE_PER_CPU(struct ucode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
static unsigned int loops_per_usec;
-static atomic_t late_cpus_in;
+static cpumask_t cpu_offline_mask;

static noinstr bool wait_for_cpus(atomic_t *cnt)
{
@@ -450,7 +451,7 @@ static noinstr void ucode_load_secondary
instrumentation_end();
}

-static void ucode_load_primary(unsigned int cpu)
+static void __ucode_load_primary(unsigned int cpu)
{
struct cpumask *secondaries = topology_sibling_cpumask(cpu);
enum sibling_ctrl ctrl;
@@ -486,6 +487,67 @@ static void ucode_load_primary(unsigned
}
}

+static bool ucode_kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendevouz handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void ucode_release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void ucode_load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = ucode_kick_offline_cpus(nr_offl);
+
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __ucode_load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ ucode_release_offline_cpus();
+}
+
+/*
+ * Minimal stub rendevouz handler for soft-offlined CPUs which participate
+ * in the NMI rendevouz to protect against a concurrent NMI on affected
+ * CPUs.
+ */
+void noinstr microcode_offline_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
static noinstr bool microcode_update_handler(void)
{
unsigned int cpu = raw_smp_processor_id();
@@ -542,6 +604,7 @@ static int ucode_load_cpus_stopped(void
static int ucode_load_late_stop_cpus(void)
{
unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
int old_rev = boot_cpu_data.microcode;
struct cpuinfo_x86 prev_info;

@@ -549,6 +612,7 @@ static int ucode_load_late_stop_cpus(voi
pr_err("You should switch to early loading, if possible.\n");

atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);

/*
@@ -571,6 +635,7 @@ static int ucode_load_late_stop_cpus(voi
case UCODE_UPDATED: updated++; break;
case UCODE_TIMEOUT: timedout++; break;
case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
default: failed++; break;
}
}
@@ -582,6 +647,13 @@ static int ucode_load_late_stop_cpus(voi
/* Nothing changed. */
if (!failed && !timedout)
return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
pr_err("Microcode update failed: %u CPUs failed %u CPUs timed out\n",
failed, timedout);
return -EIO;
@@ -615,19 +687,49 @@ static int ucode_load_late_stop_cpus(voi
* modern CPUs is using MWAIT, which is also not guaranteed to be safe
* against a microcode update which affects MWAIT.
*
- * 2) Initialize the per CPU control structure
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendevouz and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvouz in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains "offline"; secondary threads, so they can be handled
+ * correctly by a control CPU.
*/
static bool ucode_setup_cpus(void)
{
struct ucode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
unsigned int cpu;

+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
if (!cpu_online(cpu)) {
- if (topology_is_primary_thread(cpu) || !microcode_ops->nmi_safe) {
- pr_err("CPU %u not online\n", cpu);
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
return false;
}
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
}

/*
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -17,6 +17,7 @@ enum ucode_state {
UCODE_NFOUND,
UCODE_ERROR,
UCODE_TIMEOUT,
+ UCODE_OFFLINE,
};

struct microcode_ops {
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -502,8 +502,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
raw_atomic_long_inc(&nsp->idt_calls);

- if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
return;
+ }

if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);



2023-08-10 21:17:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:

> for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> + /*
> + * Offline CPUs sit in one of the play_dead() functions
> + * with interrupts disabled, but they still react on NMIs
> + * and execute arbitrary code. Also MWAIT being updated
> + * while the offline CPU sits there is not necessarily safe
> + * on all CPU variants.
> + *
> + * Mark them in the offline_cpus mask which will be handled
> + * by CPU0 later in the update process.
> + *
> + * Ensure that the primary thread is online so that it is
> + * guaranteed that all cores are updated.
> + */
> if (!cpu_online(cpu)) {
> + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> + pr_err("CPU %u not online, loading aborted\n", cpu);

We could make the NMI handler do the ucode load, no? Also, you just need
any thread online, don't particularly care about primary thread or not
afaict.

> return false;
> }
> + cpumask_set_cpu(cpu, &cpu_offline_mask);
> + per_cpu(ucode_ctrl, cpu) = ctrl;
> + continue;
> }

2023-08-10 21:25:37

by Raj, Ashok

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10, 2023 at 10:46:05PM +0200, Peter Zijlstra wrote:
> On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
>
> > for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> > + /*
> > + * Offline CPUs sit in one of the play_dead() functions
> > + * with interrupts disabled, but they still react on NMIs
> > + * and execute arbitrary code. Also MWAIT being updated
> > + * while the offline CPU sits there is not necessarily safe
> > + * on all CPU variants.
> > + *
> > + * Mark them in the offline_cpus mask which will be handled
> > + * by CPU0 later in the update process.
> > + *
> > + * Ensure that the primary thread is online so that it is
> > + * guaranteed that all cores are updated.
> > + */
> > if (!cpu_online(cpu)) {
> > + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> > + pr_err("CPU %u not online, loading aborted\n", cpu);
>
> We could make the NMI handler do the ucode load, no? Also, you just need
> any thread online, don't particularly care about primary thread or not
> afaict.

Patch 25 does that load in NMI. You are right, we just need "a" CPU in each
core online.
>
> > return false;
> > }
> > + cpumask_set_cpu(cpu, &cpu_offline_mask);
> > + per_cpu(ucode_ctrl, cpu) = ctrl;
> > + continue;
> > }

2023-08-10 22:05:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10, 2023 at 01:50:17PM -0700, Ashok Raj wrote:
> On Thu, Aug 10, 2023 at 10:46:05PM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
> >
> > > for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> > > + /*
> > > + * Offline CPUs sit in one of the play_dead() functions
> > > + * with interrupts disabled, but they still react on NMIs
> > > + * and execute arbitrary code. Also MWAIT being updated
> > > + * while the offline CPU sits there is not necessarily safe
> > > + * on all CPU variants.
> > > + *
> > > + * Mark them in the offline_cpus mask which will be handled
> > > + * by CPU0 later in the update process.
> > > + *
> > > + * Ensure that the primary thread is online so that it is
> > > + * guaranteed that all cores are updated.
> > > + */
> > > if (!cpu_online(cpu)) {
> > > + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> > > + pr_err("CPU %u not online, loading aborted\n", cpu);
> >
> > We could make the NMI handler do the ucode load, no? Also, you just need
> > any thread online, don't particularly care about primary thread or not
> > afaict.
>
> Patch 25 does that load in NMI. You are right, we just need "a" CPU in each
> core online.

Patch 25 does it for online CPUs, offline CPUs are having a separate
code path:

microcode_nmi_handler()

vs

microcode_offline_nmi_handler()

2023-08-10 22:13:31

by Raj, Ashok

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10, 2023 at 11:05:11PM +0200, Peter Zijlstra wrote:
> On Thu, Aug 10, 2023 at 01:50:17PM -0700, Ashok Raj wrote:
> > On Thu, Aug 10, 2023 at 10:46:05PM +0200, Peter Zijlstra wrote:
> > > On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
> > >
> > > > for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> > > > + /*
> > > > + * Offline CPUs sit in one of the play_dead() functions
> > > > + * with interrupts disabled, but they still react on NMIs
> > > > + * and execute arbitrary code. Also MWAIT being updated
> > > > + * while the offline CPU sits there is not necessarily safe
> > > > + * on all CPU variants.
> > > > + *
> > > > + * Mark them in the offline_cpus mask which will be handled
> > > > + * by CPU0 later in the update process.
> > > > + *
> > > > + * Ensure that the primary thread is online so that it is
> > > > + * guaranteed that all cores are updated.
> > > > + */
> > > > if (!cpu_online(cpu)) {
> > > > + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> > > > + pr_err("CPU %u not online, loading aborted\n", cpu);
> > >
> > > We could make the NMI handler do the ucode load, no? Also, you just need
> > > any thread online, don't particularly care about primary thread or not
> > > afaict.
> >
> > Patch 25 does that load in NMI. You are right, we just need "a" CPU in each
> > core online.
>
> Patch 25 does it for online CPUs, offline CPUs are having a separate
> code path:
>
> microcode_nmi_handler()
>
> vs
>
> microcode_offline_nmi_handler()

Since the code enforces all primary CPUs to be ONLINE, the secondaries are the
other thread of the same core. So they automatically get the update when
primary does it.

The secondaries are parked in NMI just to avoid the risk of executing code
that might be patched by primary.

Or maybe you had something else in mind.

2023-08-10 23:44:09

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10, 2023 at 02:49:39PM -0700, Ashok Raj wrote:
> On Thu, Aug 10, 2023 at 11:05:11PM +0200, Peter Zijlstra wrote:
> > On Thu, Aug 10, 2023 at 01:50:17PM -0700, Ashok Raj wrote:
> > > On Thu, Aug 10, 2023 at 10:46:05PM +0200, Peter Zijlstra wrote:
> > > > On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
> > > >
> > > > > for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> > > > > + /*
> > > > > + * Offline CPUs sit in one of the play_dead() functions
> > > > > + * with interrupts disabled, but they still react on NMIs
> > > > > + * and execute arbitrary code. Also MWAIT being updated
> > > > > + * while the offline CPU sits there is not necessarily safe
> > > > > + * on all CPU variants.
> > > > > + *
> > > > > + * Mark them in the offline_cpus mask which will be handled
> > > > > + * by CPU0 later in the update process.
> > > > > + *
> > > > > + * Ensure that the primary thread is online so that it is
> > > > > + * guaranteed that all cores are updated.
> > > > > + */
> > > > > if (!cpu_online(cpu)) {
> > > > > + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> > > > > + pr_err("CPU %u not online, loading aborted\n", cpu);
> > > >
> > > > We could make the NMI handler do the ucode load, no? Also, you just need
> > > > any thread online, don't particularly care about primary thread or not
> > > > afaict.
> > >
> > > Patch 25 does that load in NMI. You are right, we just need "a" CPU in each
> > > core online.
> >
> > Patch 25 does it for online CPUs, offline CPUs are having a separate
> > code path:
> >
> > microcode_nmi_handler()
> >
> > vs
> >
> > microcode_offline_nmi_handler()
>
> Since the code enforces all primary CPUs to be ONLINE, the secondaries are the
> other thread of the same core. So they automatically get the update when
> primary does it.
>
> The secondaries are parked in NMI just to avoid the risk of executing code
> that might be patched by primary.
>
> Or maybe you had something else in mind.

Yeah, not placing constraints on who is online at all. Also, if both
siblings are offline, then onlining will re-load ucode anyway, no?

2023-08-11 01:39:10

by Raj, Ashok

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Fri, Aug 11, 2023 at 12:29:57AM +0200, Peter Zijlstra wrote:
> On Thu, Aug 10, 2023 at 02:49:39PM -0700, Ashok Raj wrote:
> > On Thu, Aug 10, 2023 at 11:05:11PM +0200, Peter Zijlstra wrote:
> > > On Thu, Aug 10, 2023 at 01:50:17PM -0700, Ashok Raj wrote:
> > > > On Thu, Aug 10, 2023 at 10:46:05PM +0200, Peter Zijlstra wrote:
> > > > > On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
> > > > >
> > > > > > for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
> > > > > > + /*
> > > > > > + * Offline CPUs sit in one of the play_dead() functions
> > > > > > + * with interrupts disabled, but they still react on NMIs
> > > > > > + * and execute arbitrary code. Also MWAIT being updated
> > > > > > + * while the offline CPU sits there is not necessarily safe
> > > > > > + * on all CPU variants.
> > > > > > + *
> > > > > > + * Mark them in the offline_cpus mask which will be handled
> > > > > > + * by CPU0 later in the update process.
> > > > > > + *
> > > > > > + * Ensure that the primary thread is online so that it is
> > > > > > + * guaranteed that all cores are updated.
> > > > > > + */
> > > > > > if (!cpu_online(cpu)) {
> > > > > > + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
> > > > > > + pr_err("CPU %u not online, loading aborted\n", cpu);
> > > > >
> > > > > We could make the NMI handler do the ucode load, no? Also, you just need
> > > > > any thread online, don't particularly care about primary thread or not
> > > > > afaict.
> > > >
> > > > Patch 25 does that load in NMI. You are right, we just need "a" CPU in each
> > > > core online.
> > >
> > > Patch 25 does it for online CPUs, offline CPUs are having a separate
> > > code path:
> > >
> > > microcode_nmi_handler()
> > >
> > > vs
> > >
> > > microcode_offline_nmi_handler()
> >
> > Since the code enforces all primary CPUs to be ONLINE, the secondaries are the
> > other thread of the same core. So they automatically get the update when
> > primary does it.
> >
> > The secondaries are parked in NMI just to avoid the risk of executing code
> > that might be patched by primary.
> >
> > Or maybe you had something else in mind.
>
> Yeah, not placing constraints on who is online at all. Also, if both
> siblings are offline, then onlining will re-load ucode anyway, no?

We need one thread in a core online, because a MCE can happen and we don't
want those running something stale.

2023-08-11 02:23:27

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10 2023 at 16:02, Ashok Raj wrote:
> On Fri, Aug 11, 2023 at 12:29:57AM +0200, Peter Zijlstra wrote:
>>
>> Yeah, not placing constraints on who is online at all. Also, if both
>> siblings are offline, then onlining will re-load ucode anyway, no?
>
> We need one thread in a core online, because a MCE can happen and we don't
> want those running something stale.

Nonsense. This is a constraint at boot time. But afterwards it does not
matter at all. That's what Peter is talking about.



2023-08-11 03:10:10

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Fri, Aug 11 2023 at 03:19, Thomas Gleixner wrote:

> On Thu, Aug 10 2023 at 16:02, Ashok Raj wrote:
>> On Fri, Aug 11, 2023 at 12:29:57AM +0200, Peter Zijlstra wrote:
>>>
>>> Yeah, not placing constraints on who is online at all. Also, if both
>>> siblings are offline, then onlining will re-load ucode anyway, no?
>>
>> We need one thread in a core online, because a MCE can happen and we don't
>> want those running something stale.
>
> Nonsense. This is a constraint at boot time. But afterwards it does not
> matter at all. That's what Peter is talking about.

And worse. It does not matter whether one thread of a core is online or
not in the case of a broadcast MCE during a microcode update simply
because that's game over.

Thanks,

tglx

2023-08-11 10:57:37

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Thu, Aug 10 2023 at 22:46, Peter Zijlstra wrote:

> On Thu, Aug 10, 2023 at 08:38:07PM +0200, Thomas Gleixner wrote:
>
>> for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
>> + /*
>> + * Offline CPUs sit in one of the play_dead() functions
>> + * with interrupts disabled, but they still react on NMIs
>> + * and execute arbitrary code. Also MWAIT being updated
>> + * while the offline CPU sits there is not necessarily safe
>> + * on all CPU variants.
>> + *
>> + * Mark them in the offline_cpus mask which will be handled
>> + * by CPU0 later in the update process.
>> + *
>> + * Ensure that the primary thread is online so that it is
>> + * guaranteed that all cores are updated.
>> + */
>> if (!cpu_online(cpu)) {
>> + if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
>> + pr_err("CPU %u not online, loading aborted\n", cpu);
>
> We could make the NMI handler do the ucode load, no? Also, you just need
> any thread online, don't particularly care about primary thread or not
> afaict.

Yes, we could. But I did not go there because it's a fricking nightmare
vs. the offline state and noinstr.

OTOH, it's not really required. Right now we mandate that _all_ present
cores have at least one sibling online. For simplicity (and practical
reasons - think "nosmt") we require the "primary" thread to be online.

Microcode is strict per core, no matter how many threads are there. We
would not need any of this mess if Intel would have synchronized the
threads on microcode update like AMD does. This is coming with future
CPUs which advertise "uniform" update with a scope ranging from core,
package to systemwide.

Even today, the only exercise what online SMT siblings do after the
primary thread updated the microcode is verification that update
happened which creates consistent software state. But in principle the
secondaries could just do nothing and everything would work (+/-
hardware,firmware bugs).

Sure we could lift that requirement, but why making this horrorshow even
more complex than it is already ?

There is zero point to support esoteric usecases just because we
can. The realistic use case is a server with all threads online or SMT
disabled via command line or sysfs. Anything else is just a pointless
exercise.

Thanks,

tglx

2023-08-12 17:16:09

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 28/30] x86/microcode: Handle "offline" CPUs correctly

On Fri, Aug 11 2023 at 11:36, Thomas Gleixner wrote:
> On Thu, Aug 10 2023 at 22:46, Peter Zijlstra wrote:
> OTOH, it's not really required. Right now we mandate that _all_ present
> cores have at least one sibling online. For simplicity (and practical
> reasons - think "nosmt") we require the "primary" thread to be online.
>
> Microcode is strict per core, no matter how many threads are there. We
> would not need any of this mess if Intel would have synchronized the
> threads on microcode update like AMD does. This is coming with future
> CPUs which advertise "uniform" update with a scope ranging from core,
> package to systemwide.

Which still requires the "offline" CPU treatment as the siblings are not
allowed to sit in MWAIT or HLT. So this whole NMI exercise is bound to
stay.

Thanks,

tglx