There may be different considerations on how to handle #AC for split lock,
e.g. how to handle system hang caused by split lock issue in firmware,
how to emulate faulting instruction, etc. We use a simple method to
handle user and kernel split lock and may extend the method in the future.
When #AC exception for split lock is triggered from user process, the
process is killed by SIGBUS. To execute the process properly, a user
application developer needs to fix the split lock issue.
When #AC exception for split lock is triggered from a kernel instruction,
disable split lock detection on local CPU and warn the split lock issue.
After the exception, the faulting instruction will be executed and kernel
execution continues. Split lock detection is only disabled on the local
CPU, not globally. It will be re-enabled if the CPU goes offline and then
comes back online, or through the debugfs interface.
A kernel/driver developer should check the warning, which contains helpful
faulting address, context, and callstack info, and fix the split lock
issues. Then further split lock issues may be captured and fixed.
After bit 29 in MSR_TEST_CTL is set to 1 in kernel, firmware inherits
the setting when firmware is executed in S4, S5, run time services, SMI,
etc. If there is a split lock operation in firmware, it will trigger
#AC and may hang the system depending on how firmware handles the #AC.
It's up to a firmware developer to fix split lock issues in firmware.
MSR TEST_CTL value is cached in per CPU msr_test_ctl_cached which will be
used in virtualization to avoid costly MSR read.
Ingo suggests to use global split_lock_debug flag to allow only one CPU to
print split lock warning in the #AC handler because WARN_ONCE() and
underlying BUGFLAG_ONCE mechanism are not atomic. This also solves
the race if the split-lock #AC fault is re-triggered by NMI of perf
context interrupting one split-lock warning execution while the original
WARN_ON() is executing.
Signed-off-by: Fenghua Yu <[email protected]>
---
arch/x86/include/asm/cpu.h | 3 +++
arch/x86/kernel/cpu/intel.c | 38 +++++++++++++++++++++++++++++++++
arch/x86/kernel/traps.c | 42 ++++++++++++++++++++++++++++++++++++-
3 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4e03f53fc079..81710f2a3eea 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -42,7 +42,10 @@ unsigned int x86_model(unsigned int sig);
unsigned int x86_stepping(unsigned int sig);
#ifdef CONFIG_CPU_SUP_INTEL
void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
+DECLARE_PER_CPU(u64, msr_test_ctl_cached);
+void split_lock_disable(void);
#else
static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
+static inline void split_lock_disable(void) {}
#endif
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7ae6cc22657d..16cf1631b7f9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -31,6 +31,9 @@
#include <asm/apic.h>
#endif
+DEFINE_PER_CPU(u64, msr_test_ctl_cached);
+EXPORT_PER_CPU_SYMBOL_GPL(msr_test_ctl_cached);
+
/*
* Just in case our CPU detection goes bad, or you have a weird system,
* allow a way to override the automatic disabling of MPX.
@@ -624,6 +627,17 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
+static void split_lock_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ u64 test_ctl_val;
+
+ /* Cache MSR TEST_CTL */
+ rdmsrl(MSR_TEST_CTL, test_ctl_val);
+ this_cpu_write(msr_test_ctl_cached, test_ctl_val);
+ }
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
@@ -734,6 +748,8 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_tme(c);
init_intel_misc_features(c);
+
+ split_lock_init(c);
}
#ifdef CONFIG_X86_32
@@ -1027,3 +1043,25 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
if (ia32_core_cap & MSR_IA32_CORE_CAP_SPLIT_LOCK_DETECT)
split_lock_setup();
}
+
+static atomic_t split_lock_debug;
+
+void split_lock_disable(void)
+{
+ /* Disable split lock detection on this CPU */
+ this_cpu_and(msr_test_ctl_cached, ~MSR_TEST_CTL_SPLIT_LOCK_DETECT);
+ wrmsrl(MSR_TEST_CTL, this_cpu_read(msr_test_ctl_cached));
+
+ /*
+ * Use the atomic variable split_lock_debug to ensure only the
+ * first CPU hitting split lock issue prints one single complete
+ * warning. This also solves the race if the split-lock #AC fault
+ * is re-triggered by NMI of perf context interrupting one
+ * split-lock warning execution while the original WARN_ONCE() is
+ * executing.
+ */
+ if (atomic_cmpxchg(&split_lock_debug, 0, 1) == 0) {
+ WARN_ONCE(1, "split lock operation detected\n");
+ atomic_set(&split_lock_debug, 0);
+ }
+}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8b6d03e55d2f..38143c028f5a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -61,6 +61,7 @@
#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/cpu.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -293,9 +294,48 @@ DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overru
DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
-DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check)
#undef IP
+dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
+{
+ unsigned int trapnr = X86_TRAP_AC;
+ char str[] = "alignment check";
+ int signr = SIGBUS;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) == NOTIFY_STOP)
+ return;
+
+ cond_local_irq_enable(regs);
+ if (!user_mode(regs) && static_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ /*
+ * Only split locks can generate #AC from kernel mode.
+ *
+ * The split-lock detection feature is a one-shot
+ * debugging facility, so we disable it immediately and
+ * print a warning.
+ *
+ * This also solves the instruction restart problem: we
+ * return the faulting instruction right after this it
+ * will be executed without generating another #AC fault
+ * and getting into an infinite loop, instead it will
+ * continue without side effects to the interrupted
+ * execution context.
+ *
+ * Split-lock detection will remain disabled after this,
+ * until the next reboot.
+ */
+ split_lock_disable();
+
+ return;
+ }
+
+ /* Handle #AC generated in any other cases. */
+ do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
+ error_code, BUS_ADRALN, NULL);
+}
+
#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(const char *message,
struct pt_regs *regs,
--
2.19.1
On Tue, 18 Jun 2019, Fenghua Yu wrote:
> +
> +static atomic_t split_lock_debug;
> +
> +void split_lock_disable(void)
> +{
> + /* Disable split lock detection on this CPU */
> + this_cpu_and(msr_test_ctl_cached, ~MSR_TEST_CTL_SPLIT_LOCK_DETECT);
> + wrmsrl(MSR_TEST_CTL, this_cpu_read(msr_test_ctl_cached));
> +
> + /*
> + * Use the atomic variable split_lock_debug to ensure only the
> + * first CPU hitting split lock issue prints one single complete
> + * warning. This also solves the race if the split-lock #AC fault
> + * is re-triggered by NMI of perf context interrupting one
> + * split-lock warning execution while the original WARN_ONCE() is
> + * executing.
> + */
> + if (atomic_cmpxchg(&split_lock_debug, 0, 1) == 0) {
> + WARN_ONCE(1, "split lock operation detected\n");
> + atomic_set(&split_lock_debug, 0);
What's the purpose of this atomic_set()?
> +dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
> +{
> + unsigned int trapnr = X86_TRAP_AC;
> + char str[] = "alignment check";
> + int signr = SIGBUS;
> +
> + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> +
> + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) == NOTIFY_STOP)
> + return;
> +
> + cond_local_irq_enable(regs);
> + if (!user_mode(regs) && static_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
> + /*
> + * Only split locks can generate #AC from kernel mode.
> + *
> + * The split-lock detection feature is a one-shot
> + * debugging facility, so we disable it immediately and
> + * print a warning.
> + *
> + * This also solves the instruction restart problem: we
> + * return the faulting instruction right after this it
we return the faulting instruction ... to the store so we get our deposit
back :)
the fault handler returns to the faulting instruction which will be then
executed without ....
Don't try to impersonate code, cpus or whatever. It doesn't make sense and
confuses people.
> + * will be executed without generating another #AC fault
> + * and getting into an infinite loop, instead it will
> + * continue without side effects to the interrupted
> + * execution context.
That last part 'instead .....' is redundant. It's entirely clear from the
above that the faulting instruction is reexecuted ....
Please write concise comments and don't try to repeat the same information
with a different painting.
> + *
> + * Split-lock detection will remain disabled after this,
> + * until the next reboot.
> + */
> + split_lock_disable();
> +
> + return;
> + }
> +
> + /* Handle #AC generated in any other cases. */
> + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
> + error_code, BUS_ADRALN, NULL);
> +}
> +
> #ifdef CONFIG_VMAP_STACK
> __visible void __noreturn handle_stack_overflow(const char *message,
> struct pt_regs *regs,
> --
> 2.19.1
>
>
On Wed, Jun 26, 2019 at 10:20:05PM +0200, Thomas Gleixner wrote:
> On Tue, 18 Jun 2019, Fenghua Yu wrote:
> > +
> > +static atomic_t split_lock_debug;
> > +
> > +void split_lock_disable(void)
> > +{
> > + /* Disable split lock detection on this CPU */
> > + this_cpu_and(msr_test_ctl_cached, ~MSR_TEST_CTL_SPLIT_LOCK_DETECT);
> > + wrmsrl(MSR_TEST_CTL, this_cpu_read(msr_test_ctl_cached));
> > +
> > + /*
> > + * Use the atomic variable split_lock_debug to ensure only the
> > + * first CPU hitting split lock issue prints one single complete
> > + * warning. This also solves the race if the split-lock #AC fault
> > + * is re-triggered by NMI of perf context interrupting one
> > + * split-lock warning execution while the original WARN_ONCE() is
> > + * executing.
> > + */
> > + if (atomic_cmpxchg(&split_lock_debug, 0, 1) == 0) {
> > + WARN_ONCE(1, "split lock operation detected\n");
> > + atomic_set(&split_lock_debug, 0);
>
> What's the purpose of this atomic_set()?
atomic_set() releases the split_lock_debug flag after WARN_ONCE() is done.
The same split_lock_debug flag will be used in sysfs write for atomic
operation as well, as proposed by Ingo in https://lkml.org/lkml/2019/4/25/48
So that's why the flag needs to be cleared, right?
>
> > +dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
> > +{
> > + unsigned int trapnr = X86_TRAP_AC;
> > + char str[] = "alignment check";
> > + int signr = SIGBUS;
> > +
> > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> > +
> > + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) == NOTIFY_STOP)
> > + return;
> > +
> > + cond_local_irq_enable(regs);
> > + if (!user_mode(regs) && static_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
> > + /*
> > + * Only split locks can generate #AC from kernel mode.
> > + *
> > + * The split-lock detection feature is a one-shot
> > + * debugging facility, so we disable it immediately and
> > + * print a warning.
> > + *
> > + * This also solves the instruction restart problem: we
> > + * return the faulting instruction right after this it
>
> we return the faulting instruction ... to the store so we get our deposit
> back :)
>
> the fault handler returns to the faulting instruction which will be then
> executed without ....
>
> Don't try to impersonate code, cpus or whatever. It doesn't make sense and
> confuses people.
>
> > + * will be executed without generating another #AC fault
> > + * and getting into an infinite loop, instead it will
> > + * continue without side effects to the interrupted
> > + * execution context.
>
> That last part 'instead .....' is redundant. It's entirely clear from the
> above that the faulting instruction is reexecuted ....
>
> Please write concise comments and do try to repeat the same information
> with a different painting.
I copied the comment completely from Ingo's comment on v8:
https://lkml.org/lkml/2019/4/25/40
Thanks.
-Fenghua
On Wed, 26 Jun 2019, Fenghua Yu wrote:
> On Wed, Jun 26, 2019 at 10:20:05PM +0200, Thomas Gleixner wrote:
> > On Tue, 18 Jun 2019, Fenghua Yu wrote:
> > > +
> > > +static atomic_t split_lock_debug;
> > > +
> > > +void split_lock_disable(void)
> > > +{
> > > + /* Disable split lock detection on this CPU */
> > > + this_cpu_and(msr_test_ctl_cached, ~MSR_TEST_CTL_SPLIT_LOCK_DETECT);
> > > + wrmsrl(MSR_TEST_CTL, this_cpu_read(msr_test_ctl_cached));
> > > +
> > > + /*
> > > + * Use the atomic variable split_lock_debug to ensure only the
> > > + * first CPU hitting split lock issue prints one single complete
> > > + * warning. This also solves the race if the split-lock #AC fault
> > > + * is re-triggered by NMI of perf context interrupting one
> > > + * split-lock warning execution while the original WARN_ONCE() is
> > > + * executing.
> > > + */
> > > + if (atomic_cmpxchg(&split_lock_debug, 0, 1) == 0) {
> > > + WARN_ONCE(1, "split lock operation detected\n");
> > > + atomic_set(&split_lock_debug, 0);
> >
> > What's the purpose of this atomic_set()?
>
> atomic_set() releases the split_lock_debug flag after WARN_ONCE() is done.
> The same split_lock_debug flag will be used in sysfs write for atomic
> operation as well, as proposed by Ingo in https://lkml.org/lkml/2019/4/25/48
Your comment above lacks any useful information about that whole thing.
> So that's why the flag needs to be cleared, right?
Errm. No.
CPU 0 CPU 1
hits AC hits AC
if (atomic_cmpxchg() == success) if (atomic_cmpxchg() == success)
warn() warn()
So only one of the CPUs will win the cmpxchg race, set the variable to 1 and
warn, the other and any subsequent AC on any other CPU will not warn
either. So you don't need WARN_ONCE() at all. It's redundant and confusing
along with the atomic_set().
Without reading that link [1], what Ingo proposed was surely not the
trainwreck which you decided to put into that debugfs thing.
Thanks,
tglx
[1] lkml.org sucks. We have https://lkml.kernel.org/r/$MESSAGEID for
that. That actually works.
On Wed, Jun 26, 2019 at 11:47:40PM +0200, Thomas Gleixner wrote:
> So only one of the CPUs will win the cmpxchg race, set te variable to 1 and
> warn, the other and any subsequent AC on any other CPU will not warn
> either. So you don't need WARN_ONCE() at all. It's redundant and confusing
> along with the atomic_set().
>
> Whithout reading that link [1], what Ingo proposed was surely not the
> trainwreck which you decided to put into that debugfs thing.
We're trying to sort out the trainwreck, but there's an additional wrinkle
that I'd like your input on.
We overlooked the fact that MSR_TEST_CTRL is per-core, i.e. shared by
sibling hyperthreads. This is especially problematic for KVM, as loading
MSR_TEST_CTRL during VM-Enter could cause spurious #AC faults in the kernel
and bounce MSR_TEST_CTRL.split_lock.
E.g. if CPU0 and CPU1 are siblings and CPU1 is running a KVM guest with
MSR_TEST_CTRL.split_lock=1, hitting an #AC on CPU0 in the host kernel will
lead to spurious #AC faults and constant toggling of the MSR.
CPU0 CPU1
split_lock=enabled
#AC -> disabled
VM-Enter -> enabled
#AC -> disabled
VM-Enter -> enabled
#AC -> disabled
My thought to handle this:
- Remove the per-cpu cache.
- Rework the atomic variable to differentiate between "disabled globally"
and "disabled by kernel (on some CPUs)".
- Modify the #AC handler to test/set the same atomic variable as the
sysfs knob. This is the "disabled by kernel" flow.
- Modify the debugfs/sysfs knob to only allow disabling split-lock
detection. This is the "disabled globally" path, i.e. sends IPIs to
clear MSR_TEST_CTRL.split_lock on all online CPUs.
- Modify the resume/init flow to clear MSR_TEST_CTRL.split_lock if it's
been disabled on *any* CPU via #AC or via the knob.
- Modify the debugs/sysfs read function to either print the raw atomic
variable, or differentiate between "enabled", "disabled globally" and
"disabled by kernel".
- Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
guest can do WRMSR and handle its own #AC faults, but KVM doesn't
change the value in hardware.
* Allowing guest to enable split-lock detection can induce #AC on
the host after it has been explicitly turned off, e.g. the sibling
hyperthread hits an #AC in the host kernel, or worse, causes a
different process in the host to SIGBUS.
* Allowing guest to disable split-lock detection opens up the host
to DoS attacks.
- KVM advertises split-lock detection to guest/userspace if and only if
split_lock_detect_disabled is zero.
- Add a pr_warn_once() in KVM that triggers if split locks are disabled
after support has been advertised to a guest.
Does this sound sane?
The question at the forefront of my mind is: why not have the #AC handler
send a fire-and-forget IPI to online CPUs to disable split-lock detection
on all CPUs? Would the IPI be problematic? Globally disabling split-lock
on any #AC would (marginally) simplify the code and would eliminate the
oddity of userspace process (and KVM guest) #AC behavior varying based on
the physical CPU it's running on.
Something like:
#define SPLIT_LOCK_DISABLED_IN_KERNEL BIT(0)
#define SPLIT_LOCK_DISABLED_GLOBALLY BIT(1)
static atomic_t split_lock_detect_disabled = ATOMIC_INIT(0);
void split_lock_detect_ac(void)
{
lockdep_assert_irqs_disabled();
/* Disable split lock detection on this CPU to avoid reentrant #AC. */
wrmsrl(MSR_TEST_CTRL,
rdmsrl(MSR_TEST_CTRL) & ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT);
/*
* If split-lock detection has not been disabled, either by the kernel
* or globally, record that it has been disabled by the kernel and
* WARN. Guarding WARN with the atomic ensures only the first #AC due
* to split-lock is logged, e.g. if multiple CPUs encounter #AC or if
* #AC is retriggered by a perf context NMI that interrupts the
* original WARN.
*/
if (atomic_cmpxchg(&split_lock_detect_disabled, 0,
SPLIT_LOCK_DISABLED_IN_KERNEL) == 0)
WARN(1, "split lock operation detected\n");
}
static ssize_t split_lock_detect_wr(struct file *f, const char __user *user_buf,
size_t count, loff_t *ppos)
{
int old;
<parse or ignore input value?>
old = atomic_fetch_or(SPLIT_LOCK_DISABLED_GLOBALLY,
&split_lock_detect_disabled);
/* Update MSR_TEST_CTRL unless split-lock was already disabled. */
if (!(old & SPLIT_LOCK_DISABLED_GLOBALLY))
on_each_cpu(split_lock_update, NULL, 1);
return count;
}
On 9/26/2019 2:09 AM, Sean Christopherson wrote:
> On Wed, Jun 26, 2019 at 11:47:40PM +0200, Thomas Gleixner wrote:
>> So only one of the CPUs will win the cmpxchg race, set the variable to 1 and
>> warn, the other and any subsequent AC on any other CPU will not warn
>> either. So you don't need WARN_ONCE() at all. It's redundant and confusing
>> along with the atomic_set().
>>
>> Without reading that link [1], what Ingo proposed was surely not the
>> trainwreck which you decided to put into that debugfs thing.
>
> We're trying to sort out the trainwreck, but there's an additional wrinkle
> that I'd like your input on.
>
> We overlooked the fact that MSR_TEST_CTRL is per-core, i.e. shared by
> sibling hyperthreads. This is especially problematic for KVM, as loading
> MSR_TEST_CTRL during VM-Enter could cause spurious #AC faults in the kernel
> and bounce MSR_TEST_CTRL.split_lock.
>
> E.g. if CPU0 and CPU1 are siblings and CPU1 is running a KVM guest with
> MSR_TEST_CTRL.split_lock=1, hitting an #AC on CPU0 in the host kernel will
> lead to spurious #AC faults and constant toggling of the MSR.
>
> CPU0 CPU1
>
> split_lock=enabled
>
> #AC -> disabled
>
> VM-Enter -> enabled
>
> #AC -> disabled
>
> VM-Enter -> enabled
>
> #AC -> disabled
>
>
>
> My thought to handle this:
>
> - Remove the per-cpu cache.
>
> - Rework the atomic variable to differentiate between "disabled globally"
> and "disabled by kernel (on some CPUs)".
>
> - Modify the #AC handler to test/set the same atomic variable as the
> sysfs knob. This is the "disabled by kernel" flow.
>
> - Modify the debugfs/sysfs knob to only allow disabling split-lock
> detection. This is the "disabled globally" path, i.e. sends IPIs to
> clear MSR_TEST_CTRL.split_lock on all online CPUs.
>
> - Modify the resume/init flow to clear MSR_TEST_CTRL.split_lock if it's
> been disabled on *any* CPU via #AC or via the knob.
>
> - Modify the debugs/sysfs read function to either print the raw atomic
> variable, or differentiate between "enabled", "disabled globally" and
> "disabled by kernel".
>
> - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> change the value in hardware.
>
> * Allowing guest to enable split-lock detection can induce #AC on
> the host after it has been explicitly turned off, e.g. the sibling
> hyperthread hits an #AC in the host kernel, or worse, causes a
> different process in the host to SIGBUS.
>
> * Allowing guest to disable split-lock detection opens up the host
> to DoS attacks.
>
> - KVM advertises split-lock detection to guest/userspace if and only if
> split_lock_detect_disabled is zero.
>
> - Add a pr_warn_once() in KVM that triggers if split locks are disabled
> after support has been advertised to a guest.
>
> Does this sound sane?
>
> The question at the forefront of my mind is: why not have the #AC handler
> send a fire-and-forget IPI to online CPUs to disable split-lock detection
> on all CPUs? Would the IPI be problematic? Globally disabling split-lock
> on any #AC would (marginally) simplify the code and would eliminate the
> oddity of userspace process (and KVM guest) #AC behavior varying based on
> the physical CPU it's running on.
>
>
> Something like:
>
> #define SPLIT_LOCK_DISABLED_IN_KERNEL BIT(0)
> #define SPLIT_LOCK_DISABLED_GLOBALLY BIT(1)
>
> static atomic_t split_lock_detect_disabled = ATOMIC_INIT(0);
>
> void split_lock_detect_ac(void)
> {
> lockdep_assert_irqs_disabled();
>
> /* Disable split lock detection on this CPU to avoid reentrant #AC. */
> wrmsrl(MSR_TEST_CTRL,
> rdmsrl(MSR_TEST_CTRL) & ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT);
>
> /*
> * If split-lock detection has not been disabled, either by the kernel
> * or globally, record that it has been disabled by the kernel and
> * WARN. Guarding WARN with the atomic ensures only the first #AC due
> * to split-lock is logged, e.g. if multiple CPUs encounter #AC or if
> * #AC is retriggered by a perf context NMI that interrupts the
> * original WARN.
> */
> if (atomic_cmpxchg(&split_lock_detect_disabled, 0,
> SPLIT_LOCK_DISABLED_IN_KERNEL) == 0)
> WARN(1, "split lock operation detected\n");
> }
>
> static ssize_t split_lock_detect_wr(struct file *f, const char __user *user_buf,
> size_t count, loff_t *ppos)
> {
> int old;
>
> <parse or ignore input value?>
>
> old = atomic_fetch_or(SPLIT_LOCK_DISABLED_GLOBALLY,
> &split_lock_detect_disabled);
>
> /* Update MSR_TEST_CTRL unless split-lock was already disabled. */
> if (!(old & SPLIT_LOCK_DISABLED_GLOBALLY))
> on_each_cpu(split_lock_update, NULL, 1);
>
> return count;
> }
>
Hi Thomas,
Could you please have a look at Sean's proposal and give your opinion.
Sean,
On Wed, 25 Sep 2019, Sean Christopherson wrote:
sorry for the late reply. This got lost in travel/conferencing/vacation
induced backlog.
> On Wed, Jun 26, 2019 at 11:47:40PM +0200, Thomas Gleixner wrote:
> > So only one of the CPUs will win the cmpxchg race, set the variable to 1 and
> > warn, the other and any subsequent AC on any other CPU will not warn
> > either. So you don't need WARN_ONCE() at all. It's redundant and confusing
> > along with the atomic_set().
> >
> > Without reading that link [1], what Ingo proposed was surely not the
> > trainwreck which you decided to put into that debugfs thing.
>
> We're trying to sort out the trainwreck, but there's an additional wrinkle
> that I'd like your input on.
>
> We overlooked the fact that MSR_TEST_CTRL is per-core, i.e. shared by
> sibling hyperthreads.
You must be kidding. It took 9 revisions of trainwreck engineering to
find that out.
> This is especially problematic for KVM, as loading MSR_TEST_CTRL during
> VM-Enter could cause spurious #AC faults in the kernel and bounce
> MSR_TEST_CTRL.split_lock.
>
> E.g. if CPU0 and CPU1 are siblings and CPU1 is running a KVM guest with
> MSR_TEST_CTRL.split_lock=1, hitting an #AC on CPU0 in the host kernel will
> lead to spurious #AC faults and constant toggling of the MSR.
>
> My thought to handle this:
>
> - Remove the per-cpu cache.
>
> - Rework the atomic variable to differentiate between "disabled globally"
> and "disabled by kernel (on some CPUs)".
Under the assumption that the kernel should never trigger #AC anyway, that
should be good enough.
> - Modify the #AC handler to test/set the same atomic variable as the
> sysfs knob. This is the "disabled by kernel" flow.
That's the #AC in kernel handler, right?
> - Modify the debugfs/sysfs knob to only allow disabling split-lock
> detection. This is the "disabled globally" path, i.e. sends IPIs to
> clear MSR_TEST_CTRL.split_lock on all online CPUs.
Why only disable? What's wrong with reenabling it? The shiny new driver you
are working on is triggering #AC. So in order to test the fix, you need to
reboot the machine instead of just unloading the module, reenabling #AC and
then loading the fixed one?
> - Modify the resume/init flow to clear MSR_TEST_CTRL.split_lock if it's
> been disabled on *any* CPU via #AC or via the knob.
Fine.
> - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> change the value in hardware.
>
> * Allowing guest to enable split-lock detection can induce #AC on
> the host after it has been explicitly turned off, e.g. the sibling
> hyperthread hits an #AC in the host kernel, or worse, causes a
> different process in the host to SIGBUS.
>
> * Allowing guest to disable split-lock detection opens up the host
> to DoS attacks.
Wasn't this discussed before and agreed on that if the host has AC enabled
that the guest should not be able to force disable it? I surely lost track
of this completely so my memory might trick me.
The real question is what you do when the host has #AC enabled and the
guest 'disabled' it and triggers #AC. Is that going to be silently ignored
or is the intention to kill the guest in the same way as we kill userspace?
The latter would be the right thing, but given the fact that the current
kernels easily trigger #AC today, that would cause a major wreckage in
hosting scenarios. So I fear we need to bite the bullet and have a knob
which defaults to 'handle silently' and allows to enable the kill mechanics
on purpose. 'Handle silently' needs some logging of course, at least a per
guest counter which can be queried and a tracepoint.
> - KVM advertises split-lock detection to guest/userspace if and only if
> split_lock_detect_disabled is zero.
Assuming that the host kernel is clean, fine. If the sysadmin disables it
after boot and after starting guests, it's his problem.
> - Add a pr_warn_once() in KVM that triggers if split locks are disabled
> after support has been advertised to a guest.
The pr_warn() is more or less redundant, but no strong opinion here.
> The question at the forefront of my mind is: why not have the #AC handler
> send a fire-and-forget IPI to online CPUs to disable split-lock detection
> on all CPUs? Would the IPI be problematic? Globally disabling split-lock
> on any #AC would (marginally) simplify the code and would eliminate the
> oddity of userspace process (and KVM guest) #AC behavior varying based on
> the physical CPU it's running on.
I'm fine with the IPI under the assumption that the kernel should never
trigger it at all in production.
Thanks,
tglx
On 25/09/19 20:09, Sean Christopherson wrote:
> We're trying to sort out the trainwreck, but there's an additional wrinkle
> that I'd like your input on.
That's not exactly a wrinkle...
> - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> change the value in hardware.
>
> * Allowing guest to enable split-lock detection can induce #AC on
> the host after it has been explicitly turned off, e.g. the sibling
> hyperthread hits an #AC in the host kernel, or worse, causes a
> different process in the host to SIGBUS.
>
> * Allowing guest to disable split-lock detection opens up the host
> to DoS attacks.
>
> - KVM advertises split-lock detection to guest/userspace if and only if
> split_lock_detect_disabled is zero.
>
> - Add a pr_warn_once() in KVM that triggers if split locks are disabled
> after support has been advertised to a guest.
>
> Does this sound sane?
Not really, unfortunately. Just never advertise split-lock detection to
guests. If the host has enabled split-lock detection, trap #AC and
forward it to the host handler---which would disable split lock
detection globally and reenter the guest.
Paolo
On Wed, 16 Oct 2019, Paolo Bonzini wrote:
> On 25/09/19 20:09, Sean Christopherson wrote:
> > - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> > actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> > guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> > change the value in hardware.
> >
> > * Allowing guest to enable split-lock detection can induce #AC on
> > the host after it has been explicitly turned off, e.g. the sibling
> > hyperthread hits an #AC in the host kernel, or worse, causes a
> > different process in the host to SIGBUS.
> >
> > * Allowing guest to disable split-lock detection opens up the host
> > to DoS attacks.
> >
> > - KVM advertises split-lock detection to guest/userspace if and only if
> > split_lock_detect_disabled is zero.
> >
> > - Add a pr_warn_once() in KVM that triggers if split locks are disabled
> > after support has been advertised to a guest.
> >
> > Does this sound sane?
>
> Not really, unfortunately. Just never advertise split-lock detection to
> guests. If the host has enabled split-lock detection, trap #AC and
> forward it to the host handler---which would disable split lock
> detection globally and reenter the guest.
Which completely defeats the purpose.
1) Sane guest
Guest kernel has #AC handler and you basically prevent it from detecting
malicious user space and killing it. You also prevent #AC detection in
the guest kernel which limits debugability.
2) Malicious guest
Trigger #AC to disable the host detection and then carry out the DoS
attack.
Try again.
Thanks,
tglx
On 16/10/19 11:47, Thomas Gleixner wrote:
> On Wed, 16 Oct 2019, Paolo Bonzini wrote:
>> Just never advertise split-lock
>> detection to guests. If the host has enabled split-lock detection,
>> trap #AC and forward it to the host handler---which would disable
>> split lock detection globally and reenter the guest.
>
> Which completely defeats the purpose.
Yes it does. But Sean's proposal, as I understand it, leads to the
guest receiving #AC when it wasn't expecting one. So for an old guest,
as soon as the guest kernel happens to do a split lock, it gets an
unexpected #AC and crashes and burns. And then, after much googling and
gnashing of teeth, people proceed to disable split lock detection.
(Old guests are the common case: you're a cloud provider and your
customers run old stuff; it's a workstation and you want to play that
game that requires an old version of Windows; etc.).
To save them the googling and gnashing of teeth, I guess we can do a
pr_warn_ratelimited on the first split lock encountered by a guest. (It
has to be ratelimited because userspace could create an arbitrary amount
of guests to spam the kernel logs). But the end result is the same,
split lock detection is disabled by the user.
The first alternative I thought of was:
- Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
guest can do WRMSR and handle its own #AC faults, but KVM doesn't
change the value in hardware.
- trap #AC if the guest encounters a split lock while detection is
disabled, and then disable split-lock detection in the host.
But I discarded it because it still doesn't do anything for malicious
guests, which can trigger #AC as they prefer. And it makes things
_worse_ for sane guests, because they think split-lock detection is
enabled but they become vulnerable as soon as there is only one
malicious guest on the same machine.
In all of these cases, the common final result is that split-lock
detection is disabled on the host. So might as well go with the
simplest one and not pretend to virtualize something that (without core
scheduling) is obviously not virtualizable.
Thanks,
Paolo
> 1) Sane guest
>
> Guest kernel has #AC handler and you basically prevent it from
> detecting malicious user space and killing it. You also prevent #AC
> detection in the guest kernel which limits debugability.
>
> 2) Malicious guest
>
> Trigger #AC to disable the host detection and then carry out the DoS
> attack.
On 10/16/2019 6:16 PM, Paolo Bonzini wrote:
> On 16/10/19 11:47, Thomas Gleixner wrote:
>> On Wed, 16 Oct 2019, Paolo Bonzini wrote:
>>> Just never advertise split-lock
>>> detection to guests. If the host has enabled split-lock detection,
>>> trap #AC and forward it to the host handler---which would disable
>>> split lock detection globally and reenter the guest.
>>
>> Which completely defeats the purpose.
>
> Yes it does. But Sean's proposal, as I understand it, leads to the
> guest receiving #AC when it wasn't expecting one. So for an old guest,
> as soon as the guest kernel happens to do a split lock, it gets an
> unexpected #AC and crashes and burns. And then, after much googling and
> gnashing of teeth, people proceed to disable split lock detection.
>
> (Old guests are the common case: you're a cloud provider and your
> customers run old stuff; it's a workstation and you want to play that
> game that requires an old version of Windows; etc.).
>
> To save them the googling and gnashing of teeth, I guess we can do a
> pr_warn_ratelimited on the first split lock encountered by a guest. (It
> has to be ratelimited because userspace could create an arbitrary amount
> of guests to spam the kernel logs). But the end result is the same,
> split lock detection is disabled by the user.
>
> The first alternative I thought of was:
>
> - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> change the value in hardware.
>
> - trap #AC if the guest encounters a split lock while detection is
> disabled, and then disable split-lock detection in the host.
>
> But I discarded it because it still doesn't do anything for malicious
> guests, which can trigger #AC as they prefer. And it makes things
> _worse_ for sane guests, because they think split-lock detection is
> enabled but they become vulnerable as soon as there is only one
> malicious guest on the same machine.
>
> In all of these cases, the common final result is that split-lock
> detection is disabled on the host. So might as well go with the
> simplest one and not pretend to virtualize something that (without core
> scheduling) is obviously not virtualizable.
Right, the nature of core-scope makes MSR_TEST_CTL impossible/hard to
virtualize.
- Making old guests survive needs to disable split-lock detection in
host(hardware).
- Defending malicious guests needs to enable split-lock detection in
host(hardware).
We cannot achieve them at the same time.
In my opinion, letting KVM disable the split-lock detection in the host is
not acceptable, because it just opens the door for malicious guests to
attack. I think we can use Sean's proposal like below.
KVM always traps #AC, and only advertises split-lock detection to guest
when the global variable split_lock_detection_enabled in host is true.
- If the guest enables #AC (CPL3 alignment check or split-lock detection
enabled), inject the #AC back into the guest, since it is supposed to be
capable of handling it.
- If guest doesn't enable #AC, KVM reports #AC to userspace (like other
unexpected exceptions), and we can print a hint in kernel, or let
userspace (e.g., QEMU) tell the user guest is killed because there is a
split-lock in guest.
In this way, malicious guests always get killed by userspace, and old
sane guests cannot survive either if they cause a split lock. If we do
want old sane guests to work, we have to disable split-lock detection
(through a boot parameter or debugfs) in the host, just the same as when
we want to run an old, split-lock-generating userspace binary.
But there is an issue: we advertise split-lock detection to the guest
based on the value of split_lock_detection_enabled being true in the host,
which can be turned into false dynamically when a split lock happens in
the host kernel. This causes the guest's capability to change at run
time, and I don't know if there is a better way to inform the guest.
Maybe we need a pv interface?
> Thanks,
>
> Paolo
>
>> 1) Sane guest
>>
>> Guest kernel has #AC handler and you basically prevent it from
>> detecting malicious user space and killing it. You also prevent #AC
>> detection in the guest kernel which limits debugability.
>>
>> 2) Malicious guest
>>
>> Trigger #AC to disable the host detection and then carry out the DoS
>> attack.
>
>
On 16/10/19 13:23, Xiaoyao Li wrote:
> KVM always traps #AC, and only advertises split-lock detection to guest
> when the global variable split_lock_detection_enabled in host is true.
>
> - If guest enables #AC (CPL3 alignment check or split-lock detection
> enabled), injecting #AC back into guest since it's supposed capable of
> handling it.
> - If guest doesn't enable #AC, KVM reports #AC to userspace (like other
> unexpected exceptions), and we can print a hint in kernel, or let
> userspace (e.g., QEMU) tell the user guest is killed because there is a
> split-lock in guest.
>
> In this way, malicious guests always get killed by userspace and old
> sane guests cannot survive as well if it causes split-lock. If we do
> want old sane guests work we have to disable the split-lock detection
> (through booting parameter or debugfs) in the host just the same as we
> want to run an old and split-lock generating userspace binary.
Old guests are prevalent enough that enabling split-lock detection by
default would be a big usability issue. And even ignoring that, you
would get the issue you describe below:
> But there is an issue that we advertise split-lock detection to guest
> based on the value of split_lock_detection_enabled to be true in host,
> which can be turned into false dynamically when split-lock happens in
> host kernel.
... which means that supposedly safe guests become unsafe, and that is bad.
> This causes guest's capability changes at run time and I
> don't if there is a better way to inform guest? Maybe we need a pv
> interface?
Even a PV interface would not change the basic fact that a supposedly
safe configuration becomes unsafe.
Paolo
On Wed, 16 Oct 2019, Paolo Bonzini wrote:
> On 16/10/19 11:47, Thomas Gleixner wrote:
> > On Wed, 16 Oct 2019, Paolo Bonzini wrote:
> >> Just never advertise split-lock
> >> detection to guests. If the host has enabled split-lock detection,
> >> trap #AC and forward it to the host handler---which would disable
> >> split lock detection globally and reenter the guest.
> >
> > Which completely defeats the purpose.
>
> Yes it does. But Sean's proposal, as I understand it, leads to the
> guest receiving #AC when it wasn't expecting one. So for an old guest,
> as soon as the guest kernel happens to do a split lock, it gets an
> unexpected #AC and crashes and burns. And then, after much googling and
> gnashing of teeth, people proceed to disable split lock detection.
I don't think that this was what he suggested/intended.
> In all of these cases, the common final result is that split-lock
> detection is disabled on the host. So might as well go with the
> simplest one and not pretend to virtualize something that (without core
> scheduling) is obviously not virtualizable.
You are completely ignoring any argument here and just leave it behind your
signature (instead of trimming your reply).
> > 1) Sane guest
> >
> > Guest kernel has #AC handler and you basically prevent it from
> > detecting malicious user space and killing it. You also prevent #AC
> > detection in the guest kernel which limits debugability.
That's a perfectly fine situation. Host has #AC enabled and exposes the
availability of #AC to the guest. Guest kernel has a proper handler and
does the right thing. So the host _CAN_ forward #AC to the guest and let it
deal with it. For that to work you need to expose the MSR so you know the
guest state in the host.
Your lazy 'solution' just renders #AC completely useless even for
debugging.
> > 2) Malicious guest
> >
> > Trigger #AC to disable the host detection and then carry out the DoS
> > attack.
With your proposal you render #AC useless even on hosts which have SMT
disabled, which is just wrong. There are enough good reasons to disable
SMT.
I agree that with SMT enabled the situation is truly bad, but we surely can
be smarter than just disabling it globally unconditionally and forever.
Plus we want a knob which treats guests triggering #AC in the same way as
we treat user space, i.e. kill them with SIGBUS.
Thanks,
tglx
On 16/10/19 13:49, Thomas Gleixner wrote:
> On Wed, 16 Oct 2019, Paolo Bonzini wrote:
>> Yes it does. But Sean's proposal, as I understand it, leads to the
>> guest receiving #AC when it wasn't expecting one. So for an old guest,
>> as soon as the guest kernel happens to do a split lock, it gets an
>> unexpected #AC and crashes and burns. And then, after much googling and
>> gnashing of teeth, people proceed to disable split lock detection.
>
> I don't think that this was what he suggested/intended.
Xiaoyao's reply suggests that he also understood it like that.
>> In all of these cases, the common final result is that split-lock
>> detection is disabled on the host. So might as well go with the
>> simplest one and not pretend to virtualize something that (without core
>> scheduling) is obviously not virtualizable.
>
> You are completely ignoring any argument here and just leave it behind your
> signature (instead of trimming your reply).
I am not ignoring them, I think there is no doubt that this is the
intended behavior. I disagree that Sean's patches achieve it, however.
>>> 1) Sane guest
>>>
>>> Guest kernel has #AC handler and you basically prevent it from
>>> detecting malicious user space and killing it. You also prevent #AC
>>> detection in the guest kernel which limits debugability.
>
> That's a perfectly fine situation. Host has #AC enabled and exposes the
> availability of #AC to the guest. Guest kernel has a proper handler and
> does the right thing. So the host _CAN_ forward #AC to the guest and let it
> deal with it. For that to work you need to expose the MSR so you know the
> guest state in the host.
>
> Your lazy 'solution' just renders #AC completely useless even for
> debugging.
>
>>> 2) Malicious guest
>>>
>>> Trigger #AC to disable the host detection and then carry out the DoS
>>> attack.
>
> With your proposal you render #AC useless even on hosts which have SMT
> disabled, which is just wrong. There are enough good reasons to disable
> SMT.
My lazy "solution" only applies to SMT enabled. When SMT is either not
supported, or disabled as in "nosmt=force", we can virtualize it like
the posted patches have done so far.
> I agree that with SMT enabled the situation is truly bad, but we surely can
> be smarter than just disabling it globally unconditionally and forever.
>
> Plus we want a knob which treats guests triggering #AC in the same way as
> we treat user space, i.e. kill them with SIGBUS.
Yes, that's a valid alternative. But if SMT is possible, I think the
only sane possibilities are global disable and SIGBUS. SIGBUS (or
better, a new KVM_RUN exit code) can be acceptable for debugging guests too.
Paolo
On 10/16/2019 7:26 PM, Paolo Bonzini wrote:
> On 16/10/19 13:23, Xiaoyao Li wrote:
>> KVM always traps #AC, and only advertises split-lock detection to guest
>> when the global variable split_lock_detection_enabled in host is true.
>>
>> - If guest enables #AC (CPL3 alignment check or split-lock detection
>> enabled), injecting #AC back into guest since it's supposed capable of
>> handling it.
>> - If guest doesn't enable #AC, KVM reports #AC to userspace (like other
>> unexpected exceptions), and we can print a hint in kernel, or let
>> userspace (e.g., QEMU) tell the user guest is killed because there is a
>> split-lock in guest.
>>
>> In this way, malicious guests always get killed by userspace and old
>> sane guests cannot survive as well if it causes split-lock. If we do
>> want old sane guests work we have to disable the split-lock detection
>> (through booting parameter or debugfs) in the host just the same as we
>> want to run an old and split-lock generating userspace binary.
>
> Old guests are prevalent enough that enabling split-lock detection by
> default would be a big usability issue. And even ignoring that, you
> would get the issue you describe below:
Right, the decision whether to enable split-lock detection is made by the
administrator. The administrator is supposed to know the consequence of
enabling it. Enabling it means no split lock is wanted in userspace; of
course, VMM software is under the administrator's control.
>> But there is an issue that we advertise split-lock detection to guest
>> based on the value of split_lock_detection_enabled to be true in host,
>> which can be turned into false dynamically when split-lock happens in
>> host kernel.
>
> ... which means that supposedly safe guests become unsafe, and that is bad.
>
>> This causes guest's capability changes at run time and I
>> don't if there is a better way to inform guest? Maybe we need a pv
>> interface?
>
> Even a PV interface would not change the basic fact that a supposedly
> safe configuration becomes unsafe.
I don't quite follow what you mean by unsafe.
If the host disables split-lock detection dynamically, then the
MSR_TEST_CTL.split_lock bit is cleared in the hardware, and we can use the
PV interface to notify the guest so that the guest knows it has lost the
capability of split-lock detection. In this case, I think safety is
meaningless for both host and guest.
> Paolo
>
On 10/16/2019 7:58 PM, Paolo Bonzini wrote:
> On 16/10/19 13:49, Thomas Gleixner wrote:
>> On Wed, 16 Oct 2019, Paolo Bonzini wrote:
>>> Yes it does. But Sean's proposal, as I understand it, leads to the
>>> guest receiving #AC when it wasn't expecting one. So for an old guest,
>>> as soon as the guest kernel happens to do a split lock, it gets an
>>> unexpected #AC and crashes and burns. And then, after much googling and
>>> gnashing of teeth, people proceed to disable split lock detection.
>>
>> I don't think that this was what he suggested/intended.
>
> Xiaoyao's reply suggests that he also understood it like that.
>
Actually, what I replied is a little different from what you stated
above: the guest won't receive an #AC when it wasn't expecting one;
instead, userspace receives this #AC.
>>> In all of these cases, the common final result is that split-lock
>>> detection is disabled on the host. So might as well go with the
>>> simplest one and not pretend to virtualize something that (without core
>>> scheduling) is obviously not virtualizable.
>>
>> You are completely ignoring any argument here and just leave it behind your
>> signature (instead of trimming your reply).
>
> I am not ignoring them, I think there is no doubt that this is the
> intended behavior. I disagree that Sean's patches achieve it, however.
>
>>>> 1) Sane guest
>>>>
>>>> Guest kernel has #AC handler and you basically prevent it from
>>>> detecting malicious user space and killing it. You also prevent #AC
>>>> detection in the guest kernel which limits debugability.
>>
>> That's a perfectly fine situation. Host has #AC enabled and exposes the
>> availability of #AC to the guest. Guest kernel has a proper handler and
>> does the right thing. So the host _CAN_ forward #AC to the guest and let it
>> deal with it. For that to work you need to expose the MSR so you know the
>> guest state in the host.
>>
>> Your lazy 'solution' just renders #AC completely useless even for
>> debugging.
>>
>>>> 2) Malicious guest
>>>>
>>>> Trigger #AC to disable the host detection and then carry out the DoS
>>>> attack.
>>
>> With your proposal you render #AC useless even on hosts which have SMT
>> disabled, which is just wrong. There are enough good reasons to disable
>> SMT.
>
> My lazy "solution" only applies to SMT enabled. When SMT is either not
> supported, or disabled as in "nosmt=force", we can virtualize it like
> the posted patches have done so far.
>
Do we really need to divide it into two cases of SMT enabled and SMT
disabled?
>> I agree that with SMT enabled the situation is truly bad, but we surely can
>> be smarter than just disabling it globally unconditionally and forever.
>>
>> Plus we want a knob which treats guests triggering #AC in the same way as
>> we treat user space, i.e. kill them with SIGBUS.
>
> Yes, that's a valid alternative. But if SMT is possible, I think the
> only sane possibilities are global disable and SIGBUS. SIGBUS (or
> better, a new KVM_RUN exit code) can be acceptable for debugging guests too.
If SIGBUS, why do we need to disable it globally?
When there is an #AC due to a split lock in the guest, KVM has only the
two choices below:
1) Inject it back into the guest.
   - If KVM advertises this feature to the guest, the guest kernel is
recent enough, and the guest kernel has enabled it too, it's the happy
case: the guest can handle it for its own purposes.
   - In any other case, the guest gets an unexpected #AC and crashes.
2) Report it to userspace (I think this is the same as a SIGBUS).
So for simplicity, we can do what Paolo suggested: don't advertise
this feature, and report the #AC to userspace when an #AC due to a split
lock occurs in the guest, *but* never disable the host's split-lock
detection because of a guest's split lock.
> Paolo
>
On 16/10/19 15:51, Xiaoyao Li wrote:
> On 10/16/2019 7:58 PM, Paolo Bonzini wrote:
>> On 16/10/19 13:49, Thomas Gleixner wrote:
>>> On Wed, 16 Oct 2019, Paolo Bonzini wrote:
>>>> Yes it does. But Sean's proposal, as I understand it, leads to the
>>>> guest receiving #AC when it wasn't expecting one. So for an old guest,
>>>> as soon as the guest kernel happens to do a split lock, it gets an
>>>> unexpected #AC and crashes and burns. And then, after much googling
>>>> and
>>>> gnashing of teeth, people proceed to disable split lock detection.
>>>
>>> I don't think that this was what he suggested/intended.
>>
>> Xiaoyao's reply suggests that he also understood it like that.
>
> Actually, what I replied is a little different from what you stated
> above that guest won't receive #AC when it wasn't expecting one but the
> userspace receives this #AC.
Okay---but userspace has no choice but to crash the guest, which is okay
for debugging but, most likely, undesirable behavior in production.
>>> With your proposal you render #AC useless even on hosts which have SMT
>>> disabled, which is just wrong. There are enough good reasons to disable
>>> SMT.
>>
>> My lazy "solution" only applies to SMT enabled. When SMT is either not
>> supported, or disabled as in "nosmt=force", we can virtualize it like
>> the posted patches have done so far.
>
> Do we really need to divide it into two cases of SMT enabled and SMT
> disabled?
Yes, absolutely. Because in one case MSR_TEST_CTRL behaves sanely, in
the other it doesn't.
>> Yes, that's a valid alternative. But if SMT is possible, I think the
>> only sane possibilities are global disable and SIGBUS. SIGBUS (or
>> better, a new KVM_RUN exit code) can be acceptable for debugging
>> guests too.
>
> If SIGBUS, why need to globally disable?
SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
better, but that's the idea) is for when you're debugging guests.
Global disable (or alternatively, disable SMT) is for production use.
> When there is an #AC due to split-lock in guest, KVM only has below two
> choices:
> 1) inject back into guest.
> - If kvm advertise this feature to guest, and guest kernel is latest,
> and guest kernel must enable it too. It's the happy case that guest can
> handler it on its own purpose.
> - Any other cases, guest get an unexpected #AC and crash.
> 2) report to userspace (I think the same like a SIGBUS)
>
> So for simplicity, we can do what Paolo suggested that don't advertise
> this feature and report #AC to userspace when an #AC due to split-lock
> in guest *but* we never disable the host's split-lock detection due to
> guest's split-lock.
This is one possibility, but it must be opt-in. Either you make split
lock detection opt-in in the host (and then a userspace exit is okay),
or you make split lock detection opt-in for KVM (and then #AC causes a
global disable of split-lock detection on the host).
Breaking all old guests with the default options is not a valid choice.
Paolo
For the SMT case, can you make #AC enable a property of the process,
and then disable it on the core if either SMT sibling's process requires
it to be disabled?
This would mean that in a 'mixed environment' not all split accesses
would actually generate #AC - but enough would to detect broken code
that doesn't have #AC explicitly disabled.
I'm not sure you'd want a guest to flip #AC enable based on the process
it is scheduling, but it might work for the base metal scheduler.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Wed, 16 Oct 2019, Xiaoyao Li wrote:
> On 10/16/2019 7:26 PM, Paolo Bonzini wrote:
> > Old guests are prevalent enough that enabling split-lock detection by
> > default would be a big usability issue. And even ignoring that, you
> > would get the issue you describe below:
>
> Right, whether enabling split-lock detection is made by the administrator. The
> administrator is supposed to know the consequence of enabling it. Enabling it
> means don't want any split-lock happens in userspace, of course VMM softwares
> are under control.
I have no idea what you are talking about, but the whole thing is trivial
enough to describe in a decision matrix:
N | #AC | #AC enabled | SMT | Ctrl | Guest | Action
R | available | on host | | exposed | #AC |
--|-----------|-------------|-----|---------|-------|---------------------
| | | | | |
0 | N | x | x | N | x | None
| | | | | |
1 | Y | N | x | N | x | None
| | | | | |
2 | Y | Y | x | Y | Y | Forward to guest
| | | | | |
3 | Y | Y | N | Y | N | A) Store in vCPU and
| | | | | | toggle on VMENTER/EXIT
| | | | | |
| | | | | | B) SIGBUS or KVM exit code
| | | | | |
4 | Y | Y | Y | Y | N | A) Disable globally on
| | | | | | host. Store in vCPU/guest
| | | | | | state and evtl. reenable
| | | | | | when guest goes away.
| | | | | |
| | | | | | B) SIGBUS or KVM exit code
[234] need proper accounting and tracepoints in KVM
[34] need a policy decision in KVM
Now there are a two possible state transitions:
#AC enabled on host during runtime
Existing guests are not notified. Nothing changes.
#AC disabled on host during runtime
That only affects state #2 from the above table and there are two
possible solutions:
1) Do nothing.
2) Issue a notification to the guest. This would be doable at least
for Linux guests because any guest kernel which handles #AC is
at least the same generation as the host which added #AC.
Whether it's worth it, I don't know, but it makes sense at least
for consistency reasons.
For a first step I'd go for 'Do nothing'
SMT state transitions could be handled in a similar way, but I don't think
it's worth the trouble. The above should cover everything at least on a
best effort basis.
Thanks,
tglx
On Wed, 16 Oct 2019, Xiaoyao Li wrote:
> On 10/16/2019 7:58 PM, Paolo Bonzini wrote:
> > > With your proposal you render #AC useless even on hosts which have SMT
> > > disabled, which is just wrong. There are enough good reasons to disable
> > > SMT.
> >
> > My lazy "solution" only applies to SMT enabled. When SMT is either not
> > supported, or disabled as in "nosmt=force", we can virtualize it like
> > the posted patches have done so far.
> >
>
> Do we really need to divide it into two cases of SMT enabled and SMT disabled?
Yes. See the matrix I just sent.
> > > I agree that with SMT enabled the situation is truly bad, but we surely
> > > can
> > > be smarter than just disabling it globally unconditionally and forever.
> > >
> > > Plus we want a knob which treats guests triggering #AC in the same way as
> > > we treat user space, i.e. kill them with SIGBUS.
> >
> > Yes, that's a valid alternative. But if SMT is possible, I think the
> > only sane possibilities are global disable and SIGBUS. SIGBUS (or
> > better, a new KVM_RUN exit code) can be acceptable for debugging guests too.
>
> If SIGBUS, why need to globally disable?
See the matrix I just sent.
> When there is an #AC due to split-lock in guest, KVM only has below two
> choices:
> 1) inject back into guest.
> - If kvm advertise this feature to guest, and guest kernel is latest, and
> guest kernel must enable it too. It's the happy case that guest can handler it
> on its own purpose.
> - Any other cases, guest get an unexpected #AC and crash.
That's just wrong for obvious reasons.
> 2) report to userspace (I think the same like a SIGBUS)
No. What guarantees that userspace qemu handles the SIGBUS sanely?
> So for simplicity, we can do what Paolo suggested that don't advertise this
> feature and report #AC to userspace when an #AC due to split-lock in guest
> *but* we never disable the host's split-lock detection due to guest's
> split-lock.
No, you can't.
Guess what happens when you just boot some existing guest on a #AC enabled
host without having updated qemu to handle the exit code/SIGBUS.
It simply will crash and burn in nonsensical ways. Same as reinjecting it
into the guest and letting it crash.
Thanks,
tglx
On Wed, 16 Oct 2019, David Laight wrote:
> For the smt case, can you make #AC enable a property of the process?
> Then disable it on the core if either smt process requires it be disabled?
That would be feasible if the logic of the TEST_CTRL_MSR would be AND, but
it's OR.
Thread0 #AC-EN Thread1 #AC-EN #AC enabled on core
0 0 0
1 0 1
0 1 1
1 1 1
So in order to do flips on VMENTER you'd need to IPI the other thread and
handle all the interesting corner cases.
The 'Rescue SMT' mitigation stuff on top of core scheduling is ugly enough
already, but there the state can be transitionally 'unmitigated' while with
#AC you run into trouble immediately if the transitional state is ON at the
wrong point.
Thanks,
tglx
On 16/10/19 16:43, Thomas Gleixner wrote:
>
> N | #AC | #AC enabled | SMT | Ctrl | Guest | Action
> R | available | on host | | exposed | #AC |
> --|-----------|-------------|-----|---------|-------|---------------------
> | | | | | |
> 0 | N | x | x | N | x | None
> | | | | | |
> 1 | Y | N | x | N | x | None
So far so good.
> 2 | Y | Y | x | Y | Y | Forward to guest
>
> 3 | Y | Y | N | Y | N | A) Store in vCPU and
> | | | | | | toggle on VMENTER/EXIT
> | | | | | |
> | | | | | | B) SIGBUS or KVM exit code
(2) is problematic for the SMT=y case, because of what happens when #AC
is disabled on the host---safe guests can start to be susceptible to
DoS.
For (3), which is the SMT=n case, the behavior is the same independent of
guest #AC.
So I would change these two lines to:
2 | Y | Y | Y | N | x | On first guest #AC,
| | | | | | disable globally on host.
| | | | | |
3 | Y | Y | N | Y | x | Switch MSR_TEST_CTRL on
| | | | | | enter/exit, plus:
| | | | | | A) #AC forwarded to guest.
| | | | | | B) SIGBUS or KVM exit code
> 4 | Y | Y | Y | Y | N | A) Disable globally on
> | | | | | | host. Store in vCPU/guest
> | | | | | | state and evtl. reenable
> | | | | | | when guest goes away.
> | | | | | |
> | | | | | | B) SIGBUS or KVM exit code
Also okay. And finally:
5 | Y | Y | Y | Y | Y | Forward to guest
> Now there are a two possible state transitions:
> #AC enabled on host during runtime
>
> Existing guests are not notified. Nothing changes.
Switches from (1) to (2) or (4) and (5). Ugly for (2) and (4A), in that
split-lock detection might end up being forcibly disabled on the host, but
guests do not notice anything. Okay for (4B) and (5).
> #AC disabled on host during runtime
Switches from (2), (4) and (5) to (1). Bad for (4A) and (5), in that
guests might miss #ACs from userspace. No problem for (2), okay for (4B)
since the host admin decision affects KVM userspace but not KVM guests.
Because (4A) and (5) are problematic, and (4B) can cause guests to halt
irrecoverably on guest #AC, I'd prefer the control not to be
exposed by default. In KVM API terms:
- KVM_GET_SUPPORTED_CPUID should *not* return the new CPUID bit and
KVM_GET_MSR_INDEX_LIST should not return MSR_TEST_CTRL. A separate
capability can be queried with KVM_CHECK_EXTENSION to determine whether
split-lock detection is available. The default behavior will
be (2).
- we only need to pick one of (3A)/(4A) and (3B)/(4B). (3A) should definitely
be the default, probably (4A) too. But if both are implemented, the
aforementioned capability can be used with KVM_ENABLE_CAP to switch from
one behavior to the other.
Paolo
On Wed, Oct 16, 2019 at 04:08:14PM +0200, Paolo Bonzini wrote:
> SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
> better, but that's the idea) is for when you're debugging guests.
> Global disable (or alternatively, disable SMT) is for production use.
Alternatively, for guests without split-lock #AC enabled, what if KVM were
to emulate the faulting instruction with split-lock detection temporarily
disabled?
The emulator can presumably handle all such lock instructions, and an
unhandled instruction would naturally exit to userspace.
The latency of VM-Enter+VM-Exit should be enough to guard against DoS from
a malicious guest. KVM could also artificially rate-limit a guest that is
generating copious amounts of split-lock #ACs.
On 16/10/19 17:41, Sean Christopherson wrote:
> On Wed, Oct 16, 2019 at 04:08:14PM +0200, Paolo Bonzini wrote:
>> SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
>> better, but that's the idea) is for when you're debugging guests.
>> Global disable (or alternatively, disable SMT) is for production use.
>
> Alternatively, for guests without split-lock #AC enabled, what if KVM were
> to emulate the faulting instruction with split-lock detection temporarily
> disabled?
Yes we can get fancy, but remember that KVM is not yet supporting
emulation of locked instructions. Adding it is possible but shouldn't
be in the critical path for the whole feature.
How would you disable split-lock detection temporarily? Just tweak
MSR_TEST_CTRL for the time of running the one instruction, and cross
fingers that the sibling doesn't notice?
Thanks,
Paolo
> The emulator can presumably handle all such lock instructions, and an
> unhandled instruction would naturally exit to userspace.
>
> The latency of VM-Enter+VM-Exit should be enough to guard against DoS from
> a malicious guest. KVM could also artificially rate-limit a guest that is
> generating copious amounts of split-lock #ACs.
>
On Wed, Oct 16, 2019 at 11:29:00AM +0200, Thomas Gleixner wrote:
> > - Modify the #AC handler to test/set the same atomic variable as the
> > sysfs knob. This is the "disabled by kernel" flow.
>
> That's the #AC in kernel handler, right?
Yes.
> > - Modify the debugfs/sysfs knob to only allow disabling split-lock
> > detection. This is the "disabled globally" path, i.e. sends IPIs to
> > clear MSR_TEST_CTRL.split_lock on all online CPUs.
>
> Why only disable? What's wrong with reenabling it? The shiny new driver you
> are working on is triggering #AC. So in order to test the fix, you need to
> reboot the machine instead of just unloading the module, reenabling #AC and
> then loading the fixed one?
A re-enabling path adds complexity (though not much) and is undesirable
for a production environment as a split-lock issue in the kernel isn't
going to magically disappear. And I thought that disable-only was also
your preferred implementation based on a previous comment[*], but that
comment may have been purely in the scope of userspace applications.
Anyways, my personal preference would be to keep things simple and not
support a re-enabling path. But then again, I do 99.9% of my development
in VMs so my vote probably shouldn't count regarding the module issue.
[*] https://lkml.kernel.org/r/[email protected]
> > - Modify the resume/init flow to clear MSR_TEST_CTRL.split_lock if it's
> > been disabled on *any* CPU via #AC or via the knob.
>
> Fine.
>
> > - Remove KVM loading of MSR_TEST_CTRL, i.e. KVM *never* writes the CPU's
> > actual MSR_TEST_CTRL. KVM still emulates MSR_TEST_CTRL so that the
> > guest can do WRMSR and handle its own #AC faults, but KVM doesn't
> > change the value in hardware.
> >
> > * Allowing guest to enable split-lock detection can induce #AC on
> > the host after it has been explicitly turned off, e.g. the sibling
> > hyperthread hits an #AC in the host kernel, or worse, causes a
> > different process in the host to SIGBUS.
> >
> > * Allowing guest to disable split-lock detection opens up the host
> > to DoS attacks.
>
> Wasn't this discussed before and agreed on that if the host has AC enabled
> that the guest should not be able to force disable it? I surely lost track
> of this completely so my memory might trick me.
Yes, I was restating that point, or at least attempting to.
> The real question is what you do when the host has #AC enabled and the
> guest 'disabled' it and triggers #AC. Is that going to be silently ignored
> or is the intention to kill the guest in the same way as we kill userspace?
>
> The latter would be the right thing, but given the fact that the current
> kernels easily trigger #AC today, that would cause a major wreckage in
> hosting scenarios. So I fear we need to bite the bullet and have a knob
> which defaults to 'handle silently' and allows to enable the kill mechanics
> on purpose. 'Handle silently' needs some logging of course, at least a per
> guest counter which can be queried and a tracepoint.
On Wed, Oct 16, 2019 at 05:43:53PM +0200, Paolo Bonzini wrote:
> On 16/10/19 17:41, Sean Christopherson wrote:
> > On Wed, Oct 16, 2019 at 04:08:14PM +0200, Paolo Bonzini wrote:
> >> SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
> >> better, but that's the idea) is for when you're debugging guests.
> >> Global disable (or alternatively, disable SMT) is for production use.
> >
> > Alternatively, for guests without split-lock #AC enabled, what if KVM were
> > to emulate the faulting instruction with split-lock detection temporarily
> > disabled?
>
> Yes we can get fancy, but remember that KVM is not yet supporting
> emulation of locked instructions. Adding it is possible but shouldn't
> be in the critical path for the whole feature.
Ah, didn't realize that. I'm surprised emulating all locks with cmpxchg
doesn't cause problems (or am I misreading the code?). Assuming I'm
reading the code correctly, the #AC path could kick all other vCPUS on
emulation failure and then retry emulation to "guarantee" success. Though
that's starting to build quite the house of cards.
> How would you disable split-lock detection temporarily? Just tweak
> MSR_TEST_CTRL for the time of running the one instruction, and cross
> fingers that the sibling doesn't notice?
Tweak MSR_TEST_CTRL, with logic to handle the scenario where split-lock
detection is globally disabled during emulation (so KVM doesn't
inadvertently re-enable it).
There isn't much for the sibling to notice. The kernel would temporarily
allow split-locks on the sibling, but that's a performance issue and isn't
directly fatal. A missed #AC in the host kernel would only delay the
inevitable global disabling of split-lock. A missed #AC in userspace would
again just delay the inevitable SIGBUS.
On 10/16/2019 11:37 PM, Paolo Bonzini wrote:
> On 16/10/19 16:43, Thomas Gleixner wrote:
>>
>> N | #AC | #AC enabled | SMT | Ctrl | Guest | Action
>> R | available | on host | | exposed | #AC |
>> --|-----------|-------------|-----|---------|-------|---------------------
>> | | | | | |
>> 0 | N | x | x | N | x | None
>> | | | | | |
>> 1 | Y | N | x | N | x | None
>
> So far so good.
>
>> 2 | Y | Y | x | Y | Y | Forward to guest
>>
>> 3 | Y | Y | N | Y | N | A) Store in vCPU and
>> | | | | | | toggle on VMENTER/EXIT
>> | | | | | |
>> | | | | | | B) SIGBUS or KVM exit code
>
> (2) is problematic for the SMT=y case, because of what happens when #AC
> is disabled on the host---safe guests can start to be susceptible to
> DoS.
>
> For (3), which is the SMT=n case,, the behavior is the same independent of
> guest #AC.
>
> So I would change these two lines to:
>
> 2 | Y | Y | Y | N | x | On first guest #AC,
> | | | | | | disable globally on host.
> | | | | | |
> 3 | Y | Y | N | Y | x | Switch MSR_TEST_CTRL on
> | | | | | | enter/exit, plus:
> | | | | | | A) #AC forwarded to guest.
> | | | | | | B) SIGBUS or KVM exit code
>
I just want to get confirmed that in (3), we should split into 2 case:
a) if host has it enabled, still apply the constraint that guest is
forcibly enabled? so we don't switch MSR_TEST_CTL.
b) if host has it disabled, we can switch MSR_TEST_CTL on enter/exit.
On 16/10/19 18:25, Xiaoyao Li wrote:
>>
>> 3 | Y | Y | N | Y | x | Switch
>> MSR_TEST_CTRL on
>> | | | | | | enter/exit,
>> plus:
>> | | | | | | A) #AC
>> forwarded to guest.
>> | | | | | | B) SIGBUS or
>> KVM exit code
>>
>
> I just want to get confirmed that in (3), we should split into 2 case:
>
> a) if host has it enabled, still apply the constraint that guest is
> forcibly enabled? so we don't switch MSR_TEST_CTL.
>
> b) if host has it disabled, we can switch MSR_TEST_CTL on enter/exit.
That's doable, yes.
Paolo
On Wed, Oct 16, 2019 at 09:23:37AM -0700, Sean Christopherson wrote:
> On Wed, Oct 16, 2019 at 05:43:53PM +0200, Paolo Bonzini wrote:
> > On 16/10/19 17:41, Sean Christopherson wrote:
> > > On Wed, Oct 16, 2019 at 04:08:14PM +0200, Paolo Bonzini wrote:
> > >> SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
> > >> better, but that's the idea) is for when you're debugging guests.
> > >> Global disable (or alternatively, disable SMT) is for production use.
> > >
> > > Alternatively, for guests without split-lock #AC enabled, what if KVM were
> > > to emulate the faulting instruction with split-lock detection temporarily
> > > disabled?
> >
> > Yes we can get fancy, but remember that KVM is not yet supporting
> > emulation of locked instructions. Adding it is possible but shouldn't
> > be in the critical path for the whole feature.
>
> Ah, didn't realize that. I'm surprised emulating all locks with cmpxchg
> doesn't cause problems (or am I misreading the code?). Assuming I'm
> reading the code correctly, the #AC path could kick all other vCPUS on
> emulation failure and then retry emulation to "guarantee" success. Though
> that's starting to build quite the house of cards.
Ugh, doesn't the existing emulation behavior create another KVM issue?
KVM uses a locked cmpxchg in emulator_cmpxchg_emulated() and the address
is guest controlled, e.g. a guest could coerce the host into disabling
split-lock detection via the host's #AC handler by triggering emulation
and inducing an #AC in the emulator.
> > How would you disable split-lock detection temporarily? Just tweak
> > MSR_TEST_CTRL for the time of running the one instruction, and cross
> > fingers that the sibling doesn't notice?
>
> Tweak MSR_TEST_CTRL, with logic to handle the scenario where split-lock
> detection is globally disable during emulation (so KVM doesn't
> inadvertantly re-enable it).
>
> There isn't much for the sibling to notice. The kernel would temporarily
> allow split-locks on the sibling, but that's a performance issue and isn't
> directly fatal. A missed #AC in the host kernel would only delay the
> inevitable global disabling of split-lock. A missed #AC in userspace would
> again just delay the inevitable SIGBUS.
On 10/17/2019 1:42 AM, Sean Christopherson wrote:
> On Wed, Oct 16, 2019 at 09:23:37AM -0700, Sean Christopherson wrote:
>> On Wed, Oct 16, 2019 at 05:43:53PM +0200, Paolo Bonzini wrote:
>>> On 16/10/19 17:41, Sean Christopherson wrote:
>>>> On Wed, Oct 16, 2019 at 04:08:14PM +0200, Paolo Bonzini wrote:
>>>>> SIGBUS (actually a new KVM_EXIT_INTERNAL_ERROR result from KVM_RUN is
>>>>> better, but that's the idea) is for when you're debugging guests.
>>>>> Global disable (or alternatively, disable SMT) is for production use.
>>>>
>>>> Alternatively, for guests without split-lock #AC enabled, what if KVM were
>>>> to emulate the faulting instruction with split-lock detection temporarily
>>>> disabled?
>>>
>>> Yes we can get fancy, but remember that KVM is not yet supporting
>>> emulation of locked instructions. Adding it is possible but shouldn't
>>> be in the critical path for the whole feature.
>>
>> Ah, didn't realize that. I'm surprised emulating all locks with cmpxchg
>> doesn't cause problems (or am I misreading the code?). Assuming I'm
>> reading the code correctly, the #AC path could kick all other vCPUS on
>> emulation failure and then retry emulation to "guarantee" success. Though
>> that's starting to build quite the house of cards.
>
> Ugh, doesn't the existing emulation behavior create another KVM issue?
> KVM uses a locked cmpxchg in emulator_cmpxchg_emulated() and the address
> is guest controlled, e.g. a guest could coerce the host into disabling
> split-lock detection via the host's #AC handler by triggering emulation
> and inducing an #AC in the emulator.
>
Exactly right.
I have tested with force_emulation_prefix. It did go into the #AC
handler and disable the split-lock detection in host.
However, without force_emulation_prefix enabled, I'm not sure whether
malicious guest can create the case causing the emulation with a lock
prefix and going to the emulator_cmpxchg_emulated().
I found it impossible without force_emulation_prefix enabled and I'm not
familiar with emulation at all. If I missed something, please let me know.
>>> How would you disable split-lock detection temporarily? Just tweak
>>> MSR_TEST_CTRL for the time of running the one instruction, and cross
>>> fingers that the sibling doesn't notice?
>>
>> Tweak MSR_TEST_CTRL, with logic to handle the scenario where split-lock
>> detection is globally disable during emulation (so KVM doesn't
>> inadvertantly re-enable it).
>>
>> There isn't much for the sibling to notice. The kernel would temporarily
>> allow split-locks on the sibling, but that's a performance issue and isn't
>> directly fatal. A missed #AC in the host kernel would only delay the
>> inevitable global disabling of split-lock. A missed #AC in userspace would
>> again just delay the inevitable SIGBUS.
The more I look at this trainwreck, the less interested I am in merging any
of this at all.
The fact that it took Intel more than a year to figure out that the MSR is
per core and not per thread is yet another proof that this industry just
works by pure chance.
There is a simple way out of this misery:
Intel issues a microcode update which does:
1) Convert the OR logic of the AC enable bit in the TEST_CTRL MSR to
AND logic, i.e. when one thread disables AC it's automatically
disabled on the core.
Alternatively it suppresses the #AC when the current thread has it
disabled.
2) Provide a separate bit which indicates that the AC enable logic is
actually AND based or that #AC is suppressed when the current thread
has it disabled.
Which way I don't really care as long as it makes sense.
If that's not going to happen, then we just bury the whole thing and put it
on hold until a sane implementation of that functionality surfaces in
silicon some day in the not so foreseeable future.
Seriously, this makes only sense when it's by default enabled and not
rendered useless by VIRT. Otherwise we never get any reports and none of
the issues are going to be fixed.
Thanks,
tglx
On Thu, Oct 17, 2019 at 02:29:45PM +0200, Thomas Gleixner wrote:
> The more I look at this trainwreck, the less interested I am in merging any
> of this at all.
>
> The fact that it took Intel more than a year to figure out that the MSR is
> per core and not per thread is yet another proof that this industry just
> works by pure chance.
>
> There is a simple way out of this misery:
>
> Intel issues a microcode update which does:
>
> 1) Convert the OR logic of the AC enable bit in the TEST_CTRL MSR to
> AND logic, i.e. when one thread disables AC it's automatically
> disabled on the core.
>
> Alternatively it supresses the #AC when the current thread has it
> disabled.
>
> 2) Provide a separate bit which indicates that the AC enable logic is
> actually AND based or that #AC is supressed when the current thread
> has it disabled.
>
> Which way I don't really care as long as it makes sense.
The #AC bit doesn't use OR-logic, it's straight up shared, i.e. writes on
one CPU are immediately visible on its sibling CPU. It doesn't magically
solve the problem, but I don't think we need IPIs to coordinate between
siblings, e.g. wouldn't something like this work? The per-cpu things
being pointers that are shared by siblings.
void split_lock_disable(void)
{
spinlock_t *ac_lock = this_cpu_ptr(split_lock_ac_lock);
spin_lock(ac_lock);
if (this_cpu_inc_return(*split_lock_ac_disabled) == 1)
WRMSR(RDMSR() & ~bit);
spin_unlock(ac_lock);
}
void split_lock_enable(void)
{
spinlock_t *ac_lock = this_cpu_ptr(split_lock_ac_lock);
spin_lock(ac_lock);
if (this_cpu_dec_return(*split_lock_ac_disabled) == 0)
WRMSR(RDMSR() | bit);
spin_unlock(ac_lock);
}
To avoid the spin_lock and WRMSR latency on every VM-Enter and VM-Exit,
actions (3a) and (4a) from your matrix (copied below) could be changed to
only do split_lock_disable() if the guest actually generates an #AC, and
then do split_lock_enable() on the next VM-Exit. Assuming even legacy
guests are somewhat sane and rarely do split-locks, lazily disabling the
control would eliminate most of the overhead and would also reduce the
time that the sibling CPU is running in the host without #AC protection.
N | #AC | #AC enabled | SMT | Ctrl | Guest | Action
R | available | on host | | exposed | #AC |
--|-----------|-------------|-----|---------|-------|---------------------
| | | | | |
0 | N | x | x | N | x | None
| | | | | |
1 | Y | N | x | N | x | None
| | | | | |
2 | Y | Y | x | Y | Y | Forward to guest
| | | | | |
3 | Y | Y | N | Y | N | A) Store in vCPU and
| | | | | | toggle on VMENTER/EXIT
| | | | | |
| | | | | | B) SIGBUS or KVM exit code
| | | | | |
4 | Y | Y | Y | Y | N | A) Disable globally on
| | | | | | host. Store in vCPU/guest
| | | | | | state and evtl. reenable
| | | | | | when guest goes away.
| | | | | |
| | | | | | B) SIGBUS or KVM exit code
> If that's not going to happen, then we just bury the whole thing and put it
> on hold until a sane implementation of that functionality surfaces in
> silicon some day in the not so foreseeable future.
>
> Seriously, this makes only sense when it's by default enabled and not
> rendered useless by VIRT. Otherwise we never get any reports and none of
> the issues are going to be fixed.
>
> Thanks,
>
> tglx
On Thu, 17 Oct 2019, Sean Christopherson wrote:
> On Thu, Oct 17, 2019 at 02:29:45PM +0200, Thomas Gleixner wrote:
> > The more I look at this trainwreck, the less interested I am in merging any
> > of this at all.
> >
> > The fact that it took Intel more than a year to figure out that the MSR is
> > per core and not per thread is yet another proof that this industry just
> > works by pure chance.
> >
> > There is a simple way out of this misery:
> >
> > Intel issues a microcode update which does:
> >
> > 1) Convert the OR logic of the AC enable bit in the TEST_CTRL MSR to
> > AND logic, i.e. when one thread disables AC it's automatically
> > disabled on the core.
> >
> > Alternatively it supresses the #AC when the current thread has it
> > disabled.
> >
> > 2) Provide a separate bit which indicates that the AC enable logic is
> > actually AND based or that #AC is supressed when the current thread
> > has it disabled.
> >
> > Which way I don't really care as long as it makes sense.
>
> The #AC bit doesn't use OR-logic, it's straight up shared, i.e. writes on
> one CPU are immediately visible on its sibling CPU.
That's less horrible than I read out of your initial explanation.
Thankfully all of this is meticulously documented in the SDM ...
Though it changes the picture radically. The truly shared MSR allows
regular software synchronization without IPIs and without an insane amount
of corner case handling.
So as you pointed out we need a per core state, which is influenced by:
1) The global enablement switch
2) Host induced #AC
3) Guest induced #AC
A) Guest has #AC handling
B) Guest has no #AC handling
#1:
- OFF: #AC is globally disabled
- ON: #AC is globally enabled
- FORCE: same as ON but #AC is enforced on guests
#2:
If the host triggers an #AC then the #AC has to be force disabled on the
affected core independent of the state of #1. Nothing we can do about
that, and once the initial wave of #AC issues is fixed this should not
happen on production systems. That disables #3 even for the #3.A case
for simplicity's sake.
#3:
A) Guest has #AC handling
#AC is forwarded to the guest. No further action required aside of
accounting
B) Guest has no #AC handling
If #AC triggers the resulting action depends on the state of #1:
- FORCE: Guest is killed with SIGBUS or whatever the virt crowd
thinks is the appropriate solution
- ON: #AC triggered state is recorded per vCPU and the MSR is
toggled on VMENTER/VMEXIT in software from that point on.
So the only interesting case is #3.B and #1.state == ON. There you need
serialization of the state and the MSR write between the cores, but only
when the vCPU triggered an #AC. Until then, nothing to do.
vmenter()
{
if (vcpu->ac_disable)
this_core_disable_ac();
}
vmexit()
{
if (vcpu->ac_disable) {
this_core_enable_ac();
}
this_core_dis/enable_ac() takes the global state into account and has the
necessary serialization in place.
Thanks,
tglx
> If that's not going to happen, then we just bury the whole thing and put it
> on hold until a sane implementation of that functionality surfaces in
> silicon some day in the not so foreseeable future.
We will drop the patches to flip the MSR bits to enable checking.
But we can fix the split lock issues that have already been found in the kernel.
Two strategies:
1) Adjust alignments of arrays passed to set_bit() et. al.
2) Fix set_bit() et. al. to not issue atomic operations that cross boundaries.
Fenghua had been pursuing option #1 in previous iterations. He found a few
more places with the help of the "grep" patterns suggested by David Laight.
So that path is up to ~8 patches now that do one of:
+ Change from u32 to u64
+ Force alignment with a union with a u64
+ Change to non-atomic (places that didn't need atomic)
Downside of strategy #1 is that people will add new misaligned cases in the
future. So this process has no defined end point.
Strategy #2 begun when I looked at the split-lock issue I saw that with a
constant bit argument set_bit() just does a "ORB" on the affected byte (i.e.
no split lock). Similar for clear_bit() and change_bit(). Changing code to also
do that for the variable bit case is easy.
test_and_clr_bit() needs more care, but luckily, we had Peter Anvin nearby
to give us a neat solution.
So strategy #2 is being tried now (and Fenghua will post some patches
soon).
Strategy #2 does increase code size when the bit number argument isn't
a constant. But that isn't the common case (Fenghua is counting and will
give numbers when patches are ready).
So take a look at the option #2 patches when they are posted. If the code
size increase is unacceptable, we can go back to fixing each of the callers
to get alignment right.
-Tony
On Thu, Oct 17, 2019 at 11:31:15PM +0200, Thomas Gleixner wrote:
> On Thu, 17 Oct 2019, Sean Christopherson wrote:
> > On Thu, Oct 17, 2019 at 02:29:45PM +0200, Thomas Gleixner wrote:
> > > The more I look at this trainwreck, the less interested I am in merging any
> > > of this at all.
> > >
> > > The fact that it took Intel more than a year to figure out that the MSR is
> > > per core and not per thread is yet another proof that this industry just
> > > works by pure chance.
> > >
> > > There is a simple way out of this misery:
> > >
> > > Intel issues a microcode update which does:
> > >
> > > 1) Convert the OR logic of the AC enable bit in the TEST_CTRL MSR to
> > > AND logic, i.e. when one thread disables AC it's automatically
> > > disabled on the core.
> > >
> > > Alternatively it supresses the #AC when the current thread has it
> > > disabled.
> > >
> > > 2) Provide a separate bit which indicates that the AC enable logic is
> > > actually AND based or that #AC is supressed when the current thread
> > > has it disabled.
> > >
> > > Which way I don't really care as long as it makes sense.
> >
> > The #AC bit doesn't use OR-logic, it's straight up shared, i.e. writes on
> > one CPU are immediately visible on its sibling CPU.
>
> That's less horrible than I read out of your initial explanation.
>
> Thankfully all of this is meticulously documented in the SDM ...
Preaching to the choir on this one...
> Though it changes the picture radically. The truly shared MSR allows
> regular software synchronization without IPIs and without an insane amount
> of corner case handling.
>
> So as you pointed out we need a per core state, which is influenced by:
>
> 1) The global enablement switch
>
> 2) Host induced #AC
>
> 3) Guest induced #AC
>
> A) Guest has #AC handling
>
> B) Guest has no #AC handling
>
> #1:
>
> - OFF: #AC is globally disabled
>
> - ON: #AC is globally enabled
>
> - FORCE: same as ON but #AC is enforced on guests
>
> #2:
>
> If the host triggers an #AC then the #AC has to be force disabled on the
> affected core independent of the state of #1. Nothing we can do about
> that and once the initial wave of #AC issues is fixed this should not
> happen on production systems. That disables #3 even for the #3.A case
> for simplicity sake.
>
> #3:
>
> A) Guest has #AC handling
>
> #AC is forwarded to the guest. No further action required aside of
> accounting
>
> B) Guest has no #AC handling
>
> If #AC triggers the resulting action depends on the state of #1:
>
> - FORCE: Guest is killed with SIGBUS or whatever the virt crowd
> thinks is the appropriate solution
> - ON: #AC triggered state is recorded per vCPU and the MSR is
> toggled on VMENTER/VMEXIT in software from that point on.
>
> So the only interesting case is #3.B and #1.state == ON. There you need
> serialization of the state and the MSR write between the cores, but only
> when the vCPU triggered an #AC. Until then, nothing to do.
And "vCPU triggered an #AC" should include an explicit check in KVM's
emulator.
> vmenter()
> {
> if (vcpu->ac_disable)
> this_core_disable_ac();
> }
>
> vmexit()
> {
> if (vcpu->ac_disable) {
> this_core_enable_ac();
> }
>
> this_core_dis/enable_ac() takes the global state into account and has the
> necessary serialization in place.
Overall, looks good to me. Although Tony's mail makes it obvious we need
to sync internally...
On 10/17/2019 8:29 PM, Thomas Gleixner wrote:
> The more I look at this trainwreck, the less interested I am in merging any
> of this at all.
>
> The fact that it took Intel more than a year to figure out that the MSR is
> per core and not per thread is yet another proof that this industry just
> works by pure chance.
>
Whether it's per-core or per-thread doesn't affect much how we implement
for host/native.
And also, no matter it's per-core or per-thread, we always can do
something in VIRT.
Maybe what matters is below.
> Seriously, this makes only sense when it's by default enabled and not
> rendered useless by VIRT. Otherwise we never get any reports and none of
> the issues are going to be fixed.
>
For VIRT, it doesn't want old guests to be killed due to #AC. But for
native, it doesn't want VIRT to disable the #AC detection.
I think it's just about the default behavior that whether to disable the
host's #AC detection or kill the guest (SIGBUS or something else) once
there is an split-lock #AC in guest.
So we can provide CONFIG option to set the default behavior and module
parameter to let KVM set/change the default behavior.
On Fri, 18 Oct 2019, Xiaoyao Li wrote:
> On 10/17/2019 8:29 PM, Thomas Gleixner wrote:
> > The more I look at this trainwreck, the less interested I am in merging any
> > of this at all.
> >
> > The fact that it took Intel more than a year to figure out that the MSR is
> > per core and not per thread is yet another proof that this industry just
> > works by pure chance.
> >
>
> Whether it's per-core or per-thread doesn't affect much how we implement for
> host/native.
How useful.
> And also, no matter it's per-core or per-thread, we always can do something in
> VIRT.
It matters a lot. If it would be per thread then we would not have this
discussion at all.
> Maybe what matters is below.
>
> > Seriously, this makes only sense when it's by default enabled and not
> > rendered useless by VIRT. Otherwise we never get any reports and none of
> > the issues are going to be fixed.
> >
>
> For VIRT, it doesn't want old guest to be killed due to #AC. But for native,
> it doesn't want VIRT to disable the #AC detection
>
> I think it's just about the default behavior that whether to disable the
> host's #AC detection or kill the guest (SIGBUS or something else) once there
> is an split-lock #AC in guest.
>
> So we can provide CONFIG option to set the default behavior and module
> parameter to let KVM set/change the default behavior.
Care to read through the whole discussion and figure out WHY it's not that
simple?
Thanks,
tglx
On Fri, Oct 18, 2019 at 06:20:44PM +0800, Xiaoyao Li wrote:
> We enable #AC on all cores/threads to detect split lock.
> -If user space causes #AC, sending SIGBUS to it.
> -If kernel causes #AC, we globally disable #AC on all cores/threads,
> letting kernel go on working and WARN. (only disabling #AC on the thread
> generates it just doesn't help, since the buggy kernel code is possible to
> run on any threads and thus disabling #AC on all of them)
>
> As described above, either enabled globally or disabled globally, so whether
> it's per-core or per-thread really doesn't matter
Go back and read the friggin' thread already. A big clue: virt ruins it
(like it tends to do).
On 10/18/2019 5:02 PM, Thomas Gleixner wrote:
> On Fri, 18 Oct 2019, Xiaoyao Li wrote:
>> On 10/17/2019 8:29 PM, Thomas Gleixner wrote:
>>> The more I look at this trainwreck, the less interested I am in merging any
>>> of this at all.
>>>
>>> The fact that it took Intel more than a year to figure out that the MSR is
>>> per core and not per thread is yet another proof that this industry just
>>> works by pure chance.
>>>
>>
>> Whether it's per-core or per-thread doesn't affect much how we implement for
>> host/native.
>
> How useful.
OK. IIUC. We can agree on the use model of native like below:
We enable #AC on all cores/threads to detect split lock.
-If user space causes #AC, sending SIGBUS to it.
-If kernel causes #AC, we globally disable #AC on all cores/threads,
letting the kernel go on working and WARN. (only disabling #AC on the
thread that generates it just doesn't help, since the buggy kernel code
may run on any thread, and thus we disable #AC on all of them)
As described above, either enabled globally or disabled globally, so
whether it's per-core or per-thread really doesn't matter
>> And also, no matter whether it's per-core or per-thread, we can always do something in
>> VIRT.
>
> It matters a lot. If it would be per thread then we would not have this
> discussion at all.
Indeed, it's the fact that the control MSR bit is per-core that causes this
discussion. But the per-core scope makes this feature difficult or even
impossible to virtualize.
We could make the decision not to expose it to the guest to avoid the really
bad outcome. However, even if we don't expose this feature to the guest and
don't virtualize it, the problem below is still there.
If you think it's not a problem and acceptable to add an option to let
KVM disable host's #AC detection, we can just make it this way. And then
we can design the virtualization part without any change to the native design
at all.
>> Maybe what matters is below.
>>
>>> Seriously, this makes only sense when it's by default enabled and not
>>> rendered useless by VIRT. Otherwise we never get any reports and none of
>>> the issues are going to be fixed.
>>>
>>
>> For VIRT, it doesn't want old guest to be killed due to #AC. But for native,
>> it doesn't want VIRT to disable the #AC detection
>>
>> I think it's just about the default behavior that whether to disable the
>> host's #AC detection or kill the guest (SIGBUS or something else) once there
>> is a split-lock #AC in the guest.
>>
>> So we can provide CONFIG option to set the default behavior and module
>> parameter to let KVM set/change the default behavior.
>
> Care to read through the whole discussion and figure out WHY it's not that
> simple?
>
> Thanks,
>
> tglx
>
From: Luck, Tony
> Sent: 18 October 2019 00:28
...
> 2) Fix set_bit() et. al. to not issue atomic operations that cross boundaries.
>
> Fenghua had been pursuing option #1 in previous iterations. He found a few
> more places with the help of the "grep" patterns suggested by David Laight.
> So that path is up to ~8 patches now that do one of:
> + Change from u32 to u64
> + Force alignment with a union with a u64
> + Change to non-atomic (places that didn't need atomic)
>
> Downside of strategy #1 is that people will add new misaligned cases in the
> future. So this process has no defined end point.
>
> Strategy #2 began when, looking at the split-lock issue, I saw that with a
> constant bit argument set_bit() just does a "ORB" on the affected byte (i.e.
> no split lock). Similar for clear_bit() and change_bit(). Changing code to also
> do that for the variable bit case is easy.
>
> test_and_clr_bit() needs more care, but luckily, we had Peter Anvin nearby
> to give us a neat solution.
Changing the x86-64 bitops to use 32bit memory cycles is trivial
(provided you are willing to accept a limit of 2G bits).
OTOH this only works because x86 is LE.
On any BE systems passing an 'int []' to any of the bit-functions is so terribly
wrong it is unbelievable.
So changing the x86-64 bitops is largely papering over a crack.
In essence any code that casts the argument to any of the bitops functions
is almost certainly badly broken on BE systems.
The x86 cpu features code is always LE.
It probably ought to have a typedef for a union of long [] and int [].
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On October 18, 2019 3:45:14 AM PDT, David Laight <[email protected]> wrote:
>From: Luck, Tony
>> Sent: 18 October 2019 00:28
>...
>> 2) Fix set_bit() et. al. to not issue atomic operations that cross
>boundaries.
>>
>> Fenghua had been pursuing option #1 in previous iterations. He found
>a few
>> more places with the help of the "grep" patterns suggested by David
>Laight.
>> So that path is up to ~8 patches now that do one of:
>> + Change from u32 to u64
>> + Force alignment with a union with a u64
>> + Change to non-atomic (places that didn't need atomic)
>>
>> Downside of strategy #1 is that people will add new misaligned cases
>in the
>> future. So this process has no defined end point.
>>
>> Strategy #2 began when, looking at the split-lock issue, I saw that
>with a
>> constant bit argument set_bit() just does a "ORB" on the affected
>byte (i.e.
>> no split lock). Similar for clear_bit() and change_bit(). Changing
>code to also
>> do that for the variable bit case is easy.
>>
>> test_and_clr_bit() needs more care, but luckily, we had Peter Anvin
>nearby
>> to give us a neat solution.
>
>Changing the x86-64 bitops to use 32bit memory cycles is trivial
>(provided you are willing to accept a limit of 2G bits).
>
>OTOH this only works because x86 is LE.
>On any BE systems passing an 'int []' to any of the bit-functions is so
>terribly
>wrong it is unbelievable.
>
>So changing the x86-64 bitops is largely papering over a crack.
>
>In essence any code that casts the argument to any of the bitops
>functions
>is almost certainly badly broken on BE systems.
>
>The x86 cpu features code is always LE.
>It probably ought to have a typedef for a union of long [] and int [].
>
> David
>
>-
>Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes,
>MK1 1PT, UK
>Registration No: 1397386 (Wales)
One thing I suggested is that we should actually expose the violations at compile time, either by wrapping them in macros using __alignof__ and/or by making the kernel compile with -Wcast-align.
On x86 the btsl/btcl/btrl instructions can be used without limiting to 2G bits if the address is computed, the way one does for plain and, or, etc. However, if the real types for the arguments are exposed then it is possible to do better.
Finally, as far as bigendian is concerned: the problem Linux has on bigendian machines is that it tries to use littleendian bitmaps on bigendian machines: on bigendian machines, bit 0 is naturally the MSB. If your reaction is "but that is absurd", then you have just grokked why bigendian is fundamentally broken.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On 16/10/19 18:23, Sean Christopherson wrote:
>> Yes we can get fancy, but remember that KVM is not yet supporting
>> emulation of locked instructions. Adding it is possible but shouldn't
>> be in the critical path for the whole feature.
> Ah, didn't realize that. I'm surprised emulating all locks with cmpxchg
> doesn't cause problems (or am I misreading the code?).
It would cause problems if something was trying to do crazy stuff such
as locked operations on MMIO registers, or more plausibly (sort of...)
SMP in big real mode on pre-Westmere processors. I've personally never
seen X86EMUL_CMPXCHG_FAILED happen in the real world.
Paolo
> reading the code correctly, the #AC path could kick all other vCPUs on
> emulation failure and then retry emulation to "guarantee" success. Though
> that's starting to build quite the house of cards.
>
On 16/10/19 19:42, Sean Christopherson wrote:
> KVM uses a locked cmpxchg in emulator_cmpxchg_emulated() and the address
> is guest controlled, e.g. a guest could coerce the host into disabling
> split-lock detection via the host's #AC handler by triggering emulation
> and inducing an #AC in the emulator.
Yes, that's a possible issue.
Paolo
On 17/10/19 03:23, Xiaoyao Li wrote:
> However, without force_emulation_prefix enabled, I'm not sure whether
> malicious guest can create the case causing the emulation with a lock
> prefix and going to the emulator_cmpxchg_emulated().
> I found it impossible without force_emulation_prefix enabled and I'm not
> familiar with emulation at all. If I missed something, please let me know.
It's always possible to invoke the emulator on arbitrary instructions
without FEP:
1) use big real mode on processors without unrestricted mode
2) set up two processors racing between executing an MMIO access, and
rewriting it so that the emulator sees a different instruction
3) a variant of (2) where you rewrite the page tables so that the
processor's iTLB lookup uses a stale translation. Then the stale
translation can point to an MMIO access, while the emulator sees the
instruction pointed by the current contents of the page tables.
FEP was introduced just to keep the test code clean.
Paolo