ENQCMD requires the IA32_PASID MSR has a valid PASID value which was
allocated to the process during bind. The MSR could be populated eagerly
by an IPI after the PASID is allocated in bind. But the method was
disabled in commit 9bfecd058339 ("x86/cpufeatures: Force disable
X86_FEATURE_ENQCMD and remove update_pasid()")' due to locking and other
issues.
Since the MSR was cleared in fork()/clone(), the first ENQCMD will
generate a #GP fault. The #GP fault handler will initialize the MSR
if a PASID has been allocated for this process.
The lazy enabling of the PASID MSR in the #GP handler is not an elegant
solution. But it has the least complexity that fits with h/w behavior.
Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
---
arch/x86/include/asm/fpu/api.h | 6 ++++
arch/x86/include/asm/iommu.h | 2 ++
arch/x86/kernel/fpu/xstate.c | 59 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/traps.c | 12 +++++++
drivers/iommu/intel/svm.c | 32 ++++++++++++++++++
5 files changed, 111 insertions(+)
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index ca4d0dee1ecd..f146849e5c8c 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -106,4 +106,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
*/
#define PASID_DISABLED 0
+#ifdef CONFIG_INTEL_IOMMU_SVM
+void fpu__pasid_write(u32 pasid);
+#else
+static inline void fpu__pasid_write(u32 pasid) { }
+#endif
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index bf1ed2ddc74b..9c4bf9b0702f 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -26,4 +26,6 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
return -EINVAL;
}
+bool __fixup_pasid_exception(void);
+
#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c8def1b7f8fb..8a89b2cecd77 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1289,3 +1289,62 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+/**
+ * fpu__pasid_write - Write the current task's PASID state/MSR.
+ * @pasid: the PASID
+ *
+ * The PASID is written to the IA32_PASID MSR directly if the MSR is active.
+ * Otherwise it's written to the PASID. The IA32_PASID MSR should contain
+ * the PASID after returning to the user.
+ *
+ * This is called only when ENQCMD is enabled.
+ */
+void fpu__pasid_write(u32 pasid)
+{
+ struct xregs_state *xsave = ¤t->thread.fpu.state.xsave;
+ u64 msr_val = pasid | MSR_IA32_PASID_VALID;
+ struct fpu *fpu = ¤t->thread.fpu;
+
+ /*
+ * ENQCMD always uses the compacted XSAVE format. Ensure the buffer
+ * has space for the PASID.
+ */
+ BUG_ON(!(xsave->header.xcomp_bv & XFEATURE_MASK_PASID));
+
+ fpregs_lock();
+
+ /*
+ * If the task's FPU doesn't need to be loaded or is valid, directly
+ * write the IA32_PASID MSR. Otherwise, write the PASID state and
+ * the MSR will be loaded from the PASID state before returning to
+ * the user.
+ */
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
+ fpregs_state_valid(fpu, smp_processor_id())) {
+ wrmsrl(MSR_IA32_PASID, msr_val);
+ } else {
+ struct ia32_pasid_state *ppasid_state;
+ /*
+ * Mark XFEATURE_PASID as non-init in the XSAVE buffer.
+ * This ensures that a subsequent XRSTOR will see the new
+ * value instead of writing the init value to the MSR.
+ */
+ xsave->header.xfeatures |= XFEATURE_MASK_PASID;
+ ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
+ /*
+ * ppasid_state shouldn't be NULL because XFEATURE_PASID
+ * was set just now.
+ *
+ * Please note that the following operation is a "write only"
+ * operation on the PASID state and it writes the *ENTIRE*
+ * state component. Please don't blindly copy this code to
+ * modify other XSAVE states.
+ */
+ ppasid_state->pasid = msr_val;
+ }
+
+ fpregs_unlock();
+}
+#endif /* CONFIG_INTEL_IOMMU_SVM */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a58800973aed..a25d738ae839 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -61,6 +61,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/iommu.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -526,6 +527,14 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
return GP_CANONICAL;
}
+static bool fixup_pasid_exception(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ return __fixup_pasid_exception();
+}
+
#define GPFSTR "general protection fault"
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
@@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
cond_local_irq_enable(regs);
+ if (user_mode(regs) && fixup_pasid_exception())
+ goto exit;
+
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
goto exit;
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 5b5d69b04fcc..ab65020019b6 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -1179,3 +1179,35 @@ int intel_svm_page_response(struct device *dev,
mutex_unlock(&pasid_mutex);
return ret;
}
+
+/*
+ * Try to figure out if there is a PASID MSR value to propagate to the
+ * thread taking the #GP.
+ */
+bool __fixup_pasid_exception(void)
+{
+ u32 pasid;
+
+ /*
+ * This function is called only when this #GP was triggered from user
+ * space. So the mm cannot be NULL.
+ */
+ pasid = current->mm->pasid;
+
+ /* If no PASID is allocated, there is nothing to propagate. */
+ if (pasid == PASID_DISABLED)
+ return false;
+
+ /*
+ * If the current task already has a valid PASID MSR, then the #GP
+ * fault must be for some non-ENQCMD related reason.
+ */
+ if (current->has_valid_pasid)
+ return false;
+
+ /* Fix up the MSR by the PASID in the mm. */
+ fpu__pasid_write(pasid);
+ current->has_valid_pasid = 1;
+
+ return true;
+}
--
2.33.0
On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index a58800973aed..a25d738ae839 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/vdso.h>
> +#include <asm/iommu.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -526,6 +527,14 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
> return GP_CANONICAL;
> }
>
> +static bool fixup_pasid_exception(void)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> + return false;
> +
> + return __fixup_pasid_exception();
> +}
> +
> #define GPFSTR "general protection fault"
>
> DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> @@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
>
> cond_local_irq_enable(regs);
>
> + if (user_mode(regs) && fixup_pasid_exception())
> + goto exit;
> +
> if (static_cpu_has(X86_FEATURE_UMIP)) {
> if (user_mode(regs) && fixup_umip_exception(regs))
> goto exit;
So you're eating any random #GP that might or might not be PASID
related. And all that witout a comment... Enlighten?
On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> > diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> > index a58800973aed..a25d738ae839 100644
> > --- a/arch/x86/kernel/traps.c
> > +++ b/arch/x86/kernel/traps.c
> > @@ -61,6 +61,7 @@
> > #include <asm/insn.h>
> > #include <asm/insn-eval.h>
> > #include <asm/vdso.h>
> > +#include <asm/iommu.h>
> >
> > #ifdef CONFIG_X86_64
> > #include <asm/x86_init.h>
> > @@ -526,6 +527,14 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
> > return GP_CANONICAL;
> > }
> >
> > +static bool fixup_pasid_exception(void)
> > +{
> > + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> > + return false;
> > +
> > + return __fixup_pasid_exception();
> > +}
That is, shouldn't the above at the very least decode the instruction
causing the #GP and check it's this ENQCMD thing?
> > #define GPFSTR "general protection fault"
> >
> > DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> > @@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> >
> > cond_local_irq_enable(regs);
> >
> > + if (user_mode(regs) && fixup_pasid_exception())
> > + goto exit;
> > +
> > if (static_cpu_has(X86_FEATURE_UMIP)) {
> > if (user_mode(regs) && fixup_umip_exception(regs))
> > goto exit;
>
> So you're eating any random #GP that might or might not be PASID
> related. And all that witout a comment... Enlighten?
>> > +static bool fixup_pasid_exception(void)
>> > +{
>> > + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
>> > + return false;
>> > +
>> > + return __fixup_pasid_exception();
>> > +}
>
> That is, shouldn't the above at the very least decode the instruction
> causing the #GP and check it's this ENQCMD thing?
It can't reliably do that because some other thread in the process may
have re-written the memory that regs->ip points at (bizarre case, but
I think Dave Hansen brought it up).
So it would just add extra code, and still only be a hint.
Without the check this sequence is possible:
1) Process binds an accelerator (so mm->pasid is set)
2) Task in the process executes something other than ENQCMD that gets a #GP
3) Kernel says "Oh, mm->pasid is set, I'll initialize the IA32_PASID MSR to see if that fixes it"
4) Nope. Re-executing the instruction at step #2 just gives another #GP
5) Kernel says "I already set IA32_PASID, so this must be something else ... do regular #GP actions"
Now if the task catches the signal that results from step #5 and avoids termination, it will have
IA32_PASID set ... but to the right value should it go on to actually execute ENQCMD at some
future point.
So the corner case from not knowing whether this #GP was from ENQCMD or not is harmless.
-Tony
On 9/22/21 2:11 PM, Peter Zijlstra wrote:
>>> +static bool fixup_pasid_exception(void)
>>> +{
>>> + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
>>> + return false;
>>> +
>>> + return __fixup_pasid_exception();
>>> +}
> That is, shouldn't the above at the very least decode the instruction
> causing the #GP and check it's this ENQCMD thing?
To reiterate: on systems with no X86_FEATURE_ENQCMD, there is basically
no additional overhead. It isn't worth doing decoding there.
On systems with X86_FEATURE_ENQCMD, but where it is unused, the #GP
handler gets some new overhead on every #GP. Basically:
> + pasid = current->mm->pasid;
> + if (pasid == PASID_DISABLED)
> + return false;
That's still pretty cheap. Probably not worth doing decoding there either.
So, that leaves us with if you are:
1. On system with X86_FEATURE_ENQCMD
2. In a process/mm that has an allocated pasid
3. Your *task* does not have the MSR set
4. You get a #GP for some other reason
Then, you'll do this double-#GP dance.
So, instruction decoding could absolutely be added between steps 3 and
4. It would absolutely save doing the double-#GP in cases where 1/2/3
are met. But, I wouldn't move it up above and of the 1/2/3 checks
because they're way cheaper than instruction decoding.
In the end, it didn't seem worth it to me to be optimizing a relatively
rare path which 99% of the time ends up in a crash.
If you want instruction decoding in here, though, just say the word. :)
Hi, Peter,
On Wed, Sep 22, 2021 at 11:11:45PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> > On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> > > +static bool fixup_pasid_exception(void)
> > > +{
> > > + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> > > + return false;
> > > +
> > > + return __fixup_pasid_exception();
> > > +}
>
> That is, shouldn't the above at the very least decode the instruction
> causing the #GP and check it's this ENQCMD thing?
There were comments on a previous version when we used #GP fixup method:
https://lore.kernel.org/linux-iommu/[email protected]/
There are three reasons for not decoding the instruction:
1. Parsing the instruction sets bad architectural precedent and is ugly.
2. The instruction could be modified (e.g. JVM) while decoding the
instruction. It's.
3. Decoding is more complex than this patch and doesn't worth it.
Thanks.
-Fenghua
Hi, Peter,
On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> >
> > + if (user_mode(regs) && fixup_pasid_exception())
> > + goto exit;
> > +
> > if (static_cpu_has(X86_FEATURE_UMIP)) {
> > if (user_mode(regs) && fixup_umip_exception(regs))
> > goto exit;
>
> So you're eating any random #GP that might or might not be PASID
> related. And all that witout a comment... Enlighten?
I will add a comment here.
Thanks.
-Fenghua
On Wed, Sep 22, 2021 at 09:26:10PM +0000, Luck, Tony wrote:
> >> > +static bool fixup_pasid_exception(void)
> >> > +{
> >> > + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> >> > + return false;
> >> > +
> >> > + return __fixup_pasid_exception();
> >> > +}
> >
> > That is, shouldn't the above at the very least decode the instruction
> > causing the #GP and check it's this ENQCMD thing?
>
> It can't reliably do that because some other thread in the process may
> have re-written the memory that regs->ip points at (bizarre case, but
> I think Dave Hansen brought it up).
I don't buy that argument, any cross modifying code gets to keep the
pieces in that case.
> So it would just add extra code, and still only be a hint.
>
> Without the check this sequence is possible:
>
> 1) Process binds an accelerator (so mm->pasid is set)
> 2) Task in the process executes something other than ENQCMD that gets a #GP
> 3) Kernel says "Oh, mm->pasid is set, I'll initialize the IA32_PASID MSR to see if that fixes it"
> 4) Nope. Re-executing the instruction at step #2 just gives another #GP
> 5) Kernel says "I already set IA32_PASID, so this must be something else ... do regular #GP actions"
>
> Now if the task catches the signal that results from step #5 and avoids termination, it will have
> IA32_PASID set ... but to the right value should it go on to actually execute ENQCMD at some
> future point.
>
> So the corner case from not knowing whether this #GP was from ENQCMD or not is harmless.
And all that *really* should be a in a comment near there, because I'm
100% sure I'll get confused and wonder about that very same thing the
next time I see that code.
On Wed, Sep 22, 2021 at 02:33:09PM -0700, Dave Hansen wrote:
> On 9/22/21 2:11 PM, Peter Zijlstra wrote:
> >>> +static bool fixup_pasid_exception(void)
> >>> +{
> >>> + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> >>> + return false;
> >>> +
> >>> + return __fixup_pasid_exception();
> >>> +}
> > That is, shouldn't the above at the very least decode the instruction
> > causing the #GP and check it's this ENQCMD thing?
>
> To reiterate: on systems with no X86_FEATURE_ENQCMD, there is basically
> no additional overhead. It isn't worth doing decoding there.
Well, they won't get past the X86_FEATURE check anyway, so who cares.
> On systems with X86_FEATURE_ENQCMD, but where it is unused, the #GP
> handler gets some new overhead on every #GP. Basically:
>
> > + pasid = current->mm->pasid;
> > + if (pasid == PASID_DISABLED)
> > + return false;
>
> That's still pretty cheap. Probably not worth doing decoding there either.
>
> So, that leaves us with if you are:
> 1. On system with X86_FEATURE_ENQCMD
> 2. In a process/mm that has an allocated pasid
> 3. Your *task* does not have the MSR set
> 4. You get a #GP for some other reason
>
> Then, you'll do this double-#GP dance.
>
> So, instruction decoding could absolutely be added between steps 3 and
> 4. It would absolutely save doing the double-#GP in cases where 1/2/3
> are met. But, I wouldn't move it up above and of the 1/2/3 checks
> because they're way cheaper than instruction decoding.
>
> In the end, it didn't seem worth it to me to be optimizing a relatively
> rare path which 99% of the time ends up in a crash.
>
> If you want instruction decoding in here, though, just say the word. :)
Instruction deoding makes it obvious you only consume your own #GP, the
alternative is a comment that explains this reasoning. Having neither
gets you confusion as per this thread.
On Mon, Sep 20 2021 at 19:23, Fenghua Yu wrote:
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index c8def1b7f8fb..8a89b2cecd77 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -1289,3 +1289,62 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
> return 0;
> }
> #endif /* CONFIG_PROC_PID_ARCH_STATUS */
> +
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> +/**
> + * fpu__pasid_write - Write the current task's PASID state/MSR.
> + * @pasid: the PASID
> + *
> + * The PASID is written to the IA32_PASID MSR directly if the MSR is active.
> + * Otherwise it's written to the PASID. The IA32_PASID MSR should contain
written to the PASID? What's 'the PASID' ?
> + * the PASID after returning to the user.
> + *
> + * This is called only when ENQCMD is enabled.
Well, yes, but it does not explain why it is called and from which context.
> + */
> +void fpu__pasid_write(u32 pasid)
> +{
> + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave;
> + u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> + struct fpu *fpu = ¤t->thread.fpu;
> +
> + /*
> + * ENQCMD always uses the compacted XSAVE format. Ensure the buffer
> + * has space for the PASID.
> + */
> + BUG_ON(!(xsave->header.xcomp_bv & XFEATURE_MASK_PASID));
> +
> + fpregs_lock();
> +
> + /*
> + * If the task's FPU doesn't need to be loaded or is valid, directly
> + * write the IA32_PASID MSR. Otherwise, write the PASID state and
> + * the MSR will be loaded from the PASID state before returning to
> + * the user.
> + */
> + if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
> + fpregs_state_valid(fpu, smp_processor_id())) {
> + wrmsrl(MSR_IA32_PASID, msr_val);
> + } else {
> + struct ia32_pasid_state *ppasid_state;
> + /*
> + * Mark XFEATURE_PASID as non-init in the XSAVE buffer.
> + * This ensures that a subsequent XRSTOR will see the new
> + * value instead of writing the init value to the MSR.
> + */
This and the above wrmsrl() assume that @pasid is valid which might be
correct, but I don't see any information about pasid lifetime. I assume
that there is no way to drop a PASID, right?
> + xsave->header.xfeatures |= XFEATURE_MASK_PASID;
> + ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
> + /*
> + * ppasid_state shouldn't be NULL because XFEATURE_PASID
> + * was set just now.
> + *
> + * Please note that the following operation is a "write only"
> + * operation on the PASID state and it writes the *ENTIRE*
> + * state component. Please don't blindly copy this code to
> + * modify other XSAVE states.
> + */
> + ppasid_state->pasid = msr_val;
> + }
> +
> + fpregs_unlock();
> +}
> +#endif /* CONFIG_INTEL_IOMMU_SVM */
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index a58800973aed..a25d738ae839 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
>
> +static bool fixup_pasid_exception(void)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> + return false;
> +
> + return __fixup_pasid_exception();
> +}
Ok, so here is the hook into #GP which then calls out into:
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -1179,3 +1179,35 @@ int intel_svm_page_response(struct device *dev,
> mutex_unlock(&pasid_mutex);
> return ret;
> }
> +
> +/*
> + * Try to figure out if there is a PASID MSR value to propagate to the
> + * thread taking the #GP.
> + */
> +bool __fixup_pasid_exception(void)
> +{
> + u32 pasid;
> +
> + /*
> + * This function is called only when this #GP was triggered from user
> + * space. So the mm cannot be NULL.
> + */
> + pasid = current->mm->pasid;
> +
> + /* If no PASID is allocated, there is nothing to propagate. */
> + if (pasid == PASID_DISABLED)
> + return false;
> +
> + /*
> + * If the current task already has a valid PASID MSR, then the #GP
> + * fault must be for some non-ENQCMD related reason.
> + */
> + if (current->has_valid_pasid)
> + return false;
> +
> + /* Fix up the MSR by the PASID in the mm. */
> + fpu__pasid_write(pasid);
> + current->has_valid_pasid = 1;
> +
> + return true;
> +}
What is INTEL SVM specific on this? Nothing at all.
If there is a valid PASID in current->mm and the MSR has not been
updated yet, write it. Otherwise bail.
Thanks,
tglx
On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> > @@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> >
> > cond_local_irq_enable(regs);
> >
> > + if (user_mode(regs) && fixup_pasid_exception())
> > + goto exit;
> > +
> So you're eating any random #GP that might or might not be PASID
> related. And all that witout a comment... Enlighten?
This is moderately well commented inside the fixup_pasid_exception()
function. Another copy of the comments here at the call-site seems
overkill.
Would it help to change the name to try_fixup_pasid_exception()
to make it clearer that this is just a heuristic that may or may
not fix this particular #GP?
-Tony
On Mon, Sep 20, 2021, at 12:23 PM, Fenghua Yu wrote:
> ENQCMD requires the IA32_PASID MSR has a valid PASID value which was
> allocated to the process during bind. The MSR could be populated eagerly
> by an IPI after the PASID is allocated in bind. But the method was
> disabled in commit 9bfecd058339 ("x86/cpufeatures: Force disable
> X86_FEATURE_ENQCMD and remove update_pasid()")' due to locking and other
> issues.
>
> Since the MSR was cleared in fork()/clone(), the first ENQCMD will
> generate a #GP fault. The #GP fault handler will initialize the MSR
> if a PASID has been allocated for this process.
>
> The lazy enabling of the PASID MSR in the #GP handler is not an elegant
> solution. But it has the least complexity that fits with h/w behavior.
>
> Signed-off-by: Fenghua Yu <[email protected]>
> Reviewed-by: Tony Luck <[email protected]>
> ---
> arch/x86/include/asm/fpu/api.h | 6 ++++
> arch/x86/include/asm/iommu.h | 2 ++
> arch/x86/kernel/fpu/xstate.c | 59 ++++++++++++++++++++++++++++++++++
> arch/x86/kernel/traps.c | 12 +++++++
> drivers/iommu/intel/svm.c | 32 ++++++++++++++++++
> 5 files changed, 111 insertions(+)
>
> diff --git a/arch/x86/include/asm/fpu/api.h
> b/arch/x86/include/asm/fpu/api.h
> index ca4d0dee1ecd..f146849e5c8c 100644
> --- a/arch/x86/include/asm/fpu/api.h
> +++ b/arch/x86/include/asm/fpu/api.h
> @@ -106,4 +106,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask,
> const char **feature_name);
> */
> #define PASID_DISABLED 0
>
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> +void fpu__pasid_write(u32 pasid);
> +#else
> +static inline void fpu__pasid_write(u32 pasid) { }
> +#endif
> +
> #endif /* _ASM_X86_FPU_API_H */
> diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
> index bf1ed2ddc74b..9c4bf9b0702f 100644
> --- a/arch/x86/include/asm/iommu.h
> +++ b/arch/x86/include/asm/iommu.h
> @@ -26,4 +26,6 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
> return -EINVAL;
> }
>
> +bool __fixup_pasid_exception(void);
> +
> #endif /* _ASM_X86_IOMMU_H */
> diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> index c8def1b7f8fb..8a89b2cecd77 100644
> --- a/arch/x86/kernel/fpu/xstate.c
> +++ b/arch/x86/kernel/fpu/xstate.c
> @@ -1289,3 +1289,62 @@ int proc_pid_arch_status(struct seq_file *m,
> struct pid_namespace *ns,
> return 0;
> }
> #endif /* CONFIG_PROC_PID_ARCH_STATUS */
> +
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> +/**
> + * fpu__pasid_write - Write the current task's PASID state/MSR.
> + * @pasid: the PASID
> + *
> + * The PASID is written to the IA32_PASID MSR directly if the MSR is
> active.
> + * Otherwise it's written to the PASID. The IA32_PASID MSR should
> contain
> + * the PASID after returning to the user.
> + *
> + * This is called only when ENQCMD is enabled.
> + */
> +void fpu__pasid_write(u32 pasid)
> +{
> + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave;
> + u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> + struct fpu *fpu = ¤t->thread.fpu;
> +
> + /*
> + * ENQCMD always uses the compacted XSAVE format. Ensure the buffer
> + * has space for the PASID.
> + */
> + BUG_ON(!(xsave->header.xcomp_bv & XFEATURE_MASK_PASID));
> +
> + fpregs_lock();
> +
> + /*
> + * If the task's FPU doesn't need to be loaded or is valid, directly
> + * write the IA32_PASID MSR. Otherwise, write the PASID state and
> + * the MSR will be loaded from the PASID state before returning to
> + * the user.
> + */
> + if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
> + fpregs_state_valid(fpu, smp_processor_id())) {
> + wrmsrl(MSR_IA32_PASID, msr_val);
Let me try to decode this.
If the current task's FPU state is live or if the state is in memory but the CPU regs just happen to match the copy in memory, then write the MSR. Else write the value to memory.
This is wrong. If !TIF_NEED_FPU_LOAD && fpregs_state_valid, you MUST NOT MODIFY FPU STATE. This is not negotiable -- you will break coherence between CPU regs and the memory image. The way you modify the current task's state is either you modify it in CPU regs (if the kernel knows that the CPU regs are the one and only source of truth) OR you modify it in memory and invalidate any preserved copies (by zapping last_cpu).
In any event, that particular bit of logic really doesn't belong in here -- it belongs in some helper that gets it right, once.
> +
> +/*
> + * Try to figure out if there is a PASID MSR value to propagate to the
> + * thread taking the #GP.
> + */
> +bool __fixup_pasid_exception(void)
> +{
> + u32 pasid;
> +
> + /*
> + * This function is called only when this #GP was triggered from user
> + * space. So the mm cannot be NULL.
> + */
> + pasid = current->mm->pasid;
> +
> + /* If no PASID is allocated, there is nothing to propagate. */
> + if (pasid == PASID_DISABLED)
> + return false;
> +
> + /*
> + * If the current task already has a valid PASID MSR, then the #GP
> + * fault must be for some non-ENQCMD related reason.
> + */
> + if (current->has_valid_pasid)
> + return false;
IMO "has_valid_pasid" is a poor name. An mm can have a PASID. A task has noticed or it hasn't.
If you really need an in-memory cache, call it "pasid_msr_is_loaded". Or just read the field out of FPU state, but that might be painfully slow.
Hi, Andy,
On Thu, Sep 23, 2021 at 04:17:05PM -0700, Andy Lutomirski wrote:
> On Mon, Sep 20, 2021, at 12:23 PM, Fenghua Yu wrote:
> > ENQCMD requires the IA32_PASID MSR has a valid PASID value which was
> > allocated to the process during bind. The MSR could be populated eagerly
> > by an IPI after the PASID is allocated in bind. But the method was
> > disabled in commit 9bfecd058339 ("x86/cpufeatures: Force disable
> > X86_FEATURE_ENQCMD and remove update_pasid()")' due to locking and other
> > issues.
> >
> > Since the MSR was cleared in fork()/clone(), the first ENQCMD will
> > generate a #GP fault. The #GP fault handler will initialize the MSR
> > if a PASID has been allocated for this process.
> >
> > The lazy enabling of the PASID MSR in the #GP handler is not an elegant
> > solution. But it has the least complexity that fits with h/w behavior.
> >
> > Signed-off-by: Fenghua Yu <[email protected]>
> > Reviewed-by: Tony Luck <[email protected]>
> > ---
> > arch/x86/include/asm/fpu/api.h | 6 ++++
> > arch/x86/include/asm/iommu.h | 2 ++
> > arch/x86/kernel/fpu/xstate.c | 59 ++++++++++++++++++++++++++++++++++
> > arch/x86/kernel/traps.c | 12 +++++++
> > drivers/iommu/intel/svm.c | 32 ++++++++++++++++++
> > 5 files changed, 111 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/fpu/api.h
> > b/arch/x86/include/asm/fpu/api.h
> > index ca4d0dee1ecd..f146849e5c8c 100644
> > --- a/arch/x86/include/asm/fpu/api.h
> > +++ b/arch/x86/include/asm/fpu/api.h
> > @@ -106,4 +106,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask,
> > const char **feature_name);
> > */
> > #define PASID_DISABLED 0
> >
> > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +void fpu__pasid_write(u32 pasid);
> > +#else
> > +static inline void fpu__pasid_write(u32 pasid) { }
> > +#endif
> > +
> > #endif /* _ASM_X86_FPU_API_H */
> > diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
> > index bf1ed2ddc74b..9c4bf9b0702f 100644
> > --- a/arch/x86/include/asm/iommu.h
> > +++ b/arch/x86/include/asm/iommu.h
> > @@ -26,4 +26,6 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
> > return -EINVAL;
> > }
> >
> > +bool __fixup_pasid_exception(void);
> > +
> > #endif /* _ASM_X86_IOMMU_H */
> > diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
> > index c8def1b7f8fb..8a89b2cecd77 100644
> > --- a/arch/x86/kernel/fpu/xstate.c
> > +++ b/arch/x86/kernel/fpu/xstate.c
> > @@ -1289,3 +1289,62 @@ int proc_pid_arch_status(struct seq_file *m,
> > struct pid_namespace *ns,
> > return 0;
> > }
> > #endif /* CONFIG_PROC_PID_ARCH_STATUS */
> > +
> > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +/**
> > + * fpu__pasid_write - Write the current task's PASID state/MSR.
> > + * @pasid: the PASID
> > + *
> > + * The PASID is written to the IA32_PASID MSR directly if the MSR is
> > active.
> > + * Otherwise it's written to the PASID. The IA32_PASID MSR should
> > contain
> > + * the PASID after returning to the user.
> > + *
> > + * This is called only when ENQCMD is enabled.
> > + */
> > +void fpu__pasid_write(u32 pasid)
> > +{
> > + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave;
> > + u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> > + struct fpu *fpu = ¤t->thread.fpu;
> > +
> > + /*
> > + * ENQCMD always uses the compacted XSAVE format. Ensure the buffer
> > + * has space for the PASID.
> > + */
> > + BUG_ON(!(xsave->header.xcomp_bv & XFEATURE_MASK_PASID));
> > +
> > + fpregs_lock();
> > +
> > + /*
> > + * If the task's FPU doesn't need to be loaded or is valid, directly
> > + * write the IA32_PASID MSR. Otherwise, write the PASID state and
> > + * the MSR will be loaded from the PASID state before returning to
> > + * the user.
> > + */
> > + if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
> > + fpregs_state_valid(fpu, smp_processor_id())) {
> > + wrmsrl(MSR_IA32_PASID, msr_val);
>
> Let me try to decode this.
>
> If the current task's FPU state is live or if the state is in memory but the CPU regs just happen to match the copy in memory, then write the MSR. Else write the value to memory.
>
> This is wrong. If !TIF_NEED_FPU_LOAD && fpregs_state_valid, you MUST NOT MODIFY FPU STATE.
But the FPU state is not modified if !TIF_NEED_FPU_LOAD && fpregs_state_valid.
The FPU state is modified only if TIF_NEED_FPU_LOAD && !fpregs_state_valid.
In this case, the FPU state will be restored to the IA32_PASID MSR when
exiting to the user. In all other cases, the FPU state will be not be
restored on exiting to the user and thus the IA32_PASID MSR is directly
written here.
Is it right?
> This is not negotiable -- you will break coherence between CPU regs and the memory image.
fpregs_assert_state_consistent() warns on !TIF_NEED_FPU_LOAD &&
!fpregs_state_valid. Is that breaking coherence you are talking?
> The way you modify the current task's state is either you modify it in CPU regs (if the kernel knows that the CPU regs are the one and only source of truth) OR you modify it in memory and invalidate any preserved copies (by zapping last_cpu).
>
> In any event, that particular bit of logic really doesn't belong in here -- it belongs in some helper that gets it right, once.
>
> > +
> > +/*
> > + * Try to figure out if there is a PASID MSR value to propagate to the
> > + * thread taking the #GP.
> > + */
> > +bool __fixup_pasid_exception(void)
> > +{
> > + u32 pasid;
> > +
> > + /*
> > + * This function is called only when this #GP was triggered from user
> > + * space. So the mm cannot be NULL.
> > + */
> > + pasid = current->mm->pasid;
> > +
> > + /* If no PASID is allocated, there is nothing to propagate. */
> > + if (pasid == PASID_DISABLED)
> > + return false;
> > +
> > + /*
> > + * If the current task already has a valid PASID MSR, then the #GP
> > + * fault must be for some non-ENQCMD related reason.
> > + */
> > + if (current->has_valid_pasid)
> > + return false;
>
> IMO "has_valid_pasid" is a poor name. An mm can have a PASID. A task has noticed or it hasn't.
>
> If you really need an in-memory cache, call it "pasid_msr_is_loaded". Or just read the field out of FPU state, but that might be painfully slow.
Agree. Thomas wants to change "has_valid_pasid" to "holds_pasid_ref" to
represents if the task takes a reference to the PASID.
Thanks.
-Fenghua
On Thu, Sep 23, 2021, at 7:56 PM, Fenghua Yu wrote:
> Hi, Andy,
>
> On Thu, Sep 23, 2021 at 04:17:05PM -0700, Andy Lutomirski wrote:
>> On Mon, Sep 20, 2021, at 12:23 PM, Fenghua Yu wrote:
>> > ENQCMD requires the IA32_PASID MSR has a valid PASID value which was
>> > allocated to the process during bind. The MSR could be populated eagerly
>> > by an IPI after the PASID is allocated in bind. But the method was
>> > disabled in commit 9bfecd058339 ("x86/cpufeatures: Force disable
>> > X86_FEATURE_ENQCMD and remove update_pasid()")' due to locking and other
>> > issues.
>> >
>> > Since the MSR was cleared in fork()/clone(), the first ENQCMD will
>> > generate a #GP fault. The #GP fault handler will initialize the MSR
>> > if a PASID has been allocated for this process.
>> >
>> > The lazy enabling of the PASID MSR in the #GP handler is not an elegant
>> > solution. But it has the least complexity that fits with h/w behavior.
>> >
>> > Signed-off-by: Fenghua Yu <[email protected]>
>> > Reviewed-by: Tony Luck <[email protected]>
>> > ---
>> > arch/x86/include/asm/fpu/api.h | 6 ++++
>> > arch/x86/include/asm/iommu.h | 2 ++
>> > arch/x86/kernel/fpu/xstate.c | 59 ++++++++++++++++++++++++++++++++++
>> > arch/x86/kernel/traps.c | 12 +++++++
>> > drivers/iommu/intel/svm.c | 32 ++++++++++++++++++
>> > 5 files changed, 111 insertions(+)
>> >
>> > diff --git a/arch/x86/include/asm/fpu/api.h
>> > b/arch/x86/include/asm/fpu/api.h
>> > index ca4d0dee1ecd..f146849e5c8c 100644
>> > --- a/arch/x86/include/asm/fpu/api.h
>> > +++ b/arch/x86/include/asm/fpu/api.h
>> > @@ -106,4 +106,10 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask,
>> > const char **feature_name);
>> > */
>> > #define PASID_DISABLED 0
>> >
>> > +#ifdef CONFIG_INTEL_IOMMU_SVM
>> > +void fpu__pasid_write(u32 pasid);
>> > +#else
>> > +static inline void fpu__pasid_write(u32 pasid) { }
>> > +#endif
>> > +
>> > #endif /* _ASM_X86_FPU_API_H */
>> > diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
>> > index bf1ed2ddc74b..9c4bf9b0702f 100644
>> > --- a/arch/x86/include/asm/iommu.h
>> > +++ b/arch/x86/include/asm/iommu.h
>> > @@ -26,4 +26,6 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
>> > return -EINVAL;
>> > }
>> >
>> > +bool __fixup_pasid_exception(void);
>> > +
>> > #endif /* _ASM_X86_IOMMU_H */
>> > diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
>> > index c8def1b7f8fb..8a89b2cecd77 100644
>> > --- a/arch/x86/kernel/fpu/xstate.c
>> > +++ b/arch/x86/kernel/fpu/xstate.c
>> > @@ -1289,3 +1289,62 @@ int proc_pid_arch_status(struct seq_file *m,
>> > struct pid_namespace *ns,
>> > return 0;
>> > }
>> > #endif /* CONFIG_PROC_PID_ARCH_STATUS */
>> > +
>> > +#ifdef CONFIG_INTEL_IOMMU_SVM
>> > +/**
>> > + * fpu__pasid_write - Write the current task's PASID state/MSR.
>> > + * @pasid: the PASID
>> > + *
>> > + * The PASID is written to the IA32_PASID MSR directly if the MSR is
>> > active.
>> > + * Otherwise it's written to the PASID. The IA32_PASID MSR should
>> > contain
>> > + * the PASID after returning to the user.
>> > + *
>> > + * This is called only when ENQCMD is enabled.
>> > + */
>> > +void fpu__pasid_write(u32 pasid)
>> > +{
>> > + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave;
>> > + u64 msr_val = pasid | MSR_IA32_PASID_VALID;
>> > + struct fpu *fpu = ¤t->thread.fpu;
>> > +
>> > + /*
>> > + * ENQCMD always uses the compacted XSAVE format. Ensure the buffer
>> > + * has space for the PASID.
>> > + */
>> > + BUG_ON(!(xsave->header.xcomp_bv & XFEATURE_MASK_PASID));
>> > +
>> > + fpregs_lock();
>> > +
>> > + /*
>> > + * If the task's FPU doesn't need to be loaded or is valid, directly
>> > + * write the IA32_PASID MSR. Otherwise, write the PASID state and
>> > + * the MSR will be loaded from the PASID state before returning to
>> > + * the user.
>> > + */
>> > + if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
>> > + fpregs_state_valid(fpu, smp_processor_id())) {
>> > + wrmsrl(MSR_IA32_PASID, msr_val);
>>
>> Let me try to decode this.
>>
>> If the current task's FPU state is live or if the state is in memory but the CPU regs just happen to match the copy in memory, then write the MSR. Else write the value to memory.
>>
>> This is wrong. If !TIF_NEED_FPU_LOAD && fpregs_state_valid, you MUST NOT MODIFY FPU STATE.
Sorry, I typoed that. I meant TIF_NEED_FPU_LOAD && fpregs_state_valid, which is in the case that does wrmsr.
>
> But the FPU state is not modified if !TIF_NEED_FPU_LOAD && fpregs_state_valid.
>
> The FPU state is modified only if TIF_NEED_FPU_LOAD && !fpregs_state_valid.
The MSR is FPU state. If TIF_NEED_FPU_LOAD && fpregs_state_valid, then the authoritative copy of the FPU state is in memory, but the CPU regs are known to match memory. You MUST NOT modify the in-memory state or the regs.
> In this case, the FPU state will be restored to the IA32_PASID MSR when
> exiting to the user. In all other cases, the FPU state will be not be
> restored on exiting to the user and thus the IA32_PASID MSR is directly
> written here.
>
> Is it right?
>
>> This is not negotiable -- you will break coherence between CPU regs and the memory image.
>
> fpregs_assert_state_consistent() warns on !TIF_NEED_FPU_LOAD &&
> !fpregs_state_valid. Is that breaking coherence you are talking?
No. I mean that you broke coherence between memory and registers. If the task resumes on the current CPU without scheduling, the MSR write will take effect. If the task resumes on a different CPU or after something else takes over the current CPU's regs, the MSR write will be lost.
>
>> The way you modify the current task's state is either you modify it in CPU regs (if the kernel knows that the CPU regs are the one and only source of truth) OR you modify it in memory and invalidate any preserved copies (by zapping last_cpu).
>>
>> In any event, that particular bit of logic really doesn't belong in here -- it belongs in some helper that gets it right, once.
>>
>> > +
>> > +/*
>> > + * Try to figure out if there is a PASID MSR value to propagate to the
>> > + * thread taking the #GP.
>> > + */
>> > +bool __fixup_pasid_exception(void)
>> > +{
>> > + u32 pasid;
>> > +
>> > + /*
>> > + * This function is called only when this #GP was triggered from user
>> > + * space. So the mm cannot be NULL.
>> > + */
>> > + pasid = current->mm->pasid;
>> > +
>> > + /* If no PASID is allocated, there is nothing to propagate. */
>> > + if (pasid == PASID_DISABLED)
>> > + return false;
>> > +
>> > + /*
>> > + * If the current task already has a valid PASID MSR, then the #GP
>> > + * fault must be for some non-ENQCMD related reason.
>> > + */
>> > + if (current->has_valid_pasid)
>> > + return false;
>>
>> IMO "has_valid_pasid" is a poor name. An mm can have a PASID. A task has noticed or it hasn't.
>>
>> If you really need an in-memory cache, call it "pasid_msr_is_loaded". Or just read the field out of FPU state, but that might be painfully slow.
>
> Agree. Thomas wants to change "has_valid_pasid" to "holds_pasid_ref" to
> represents if the task takes a reference to the PASID.
That name will stop making sense when tasks stop holding references.
On Fri, Sep 24, 2021 at 03:37:22PM +0200, Peter Zijlstra wrote:
> On Thu, Sep 23, 2021 at 10:14:42AM -0700, Luck, Tony wrote:
> > On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> > > On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> > > > @@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> > > >
> > > > cond_local_irq_enable(regs);
> > > >
> > > > + if (user_mode(regs) && fixup_pasid_exception())
> > > > + goto exit;
> > > > +
> >
> > > So you're eating any random #GP that might or might not be PASID
> > > related. And all that witout a comment... Enlighten?
> >
> > This is moderately well commented inside the fixup_pasid_exception()
> > function. Another copy of the comments here at the call-site seems
> > overkill.
>
> +static bool fixup_pasid_exception(void)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
> + return false;
> +
> + return __fixup_pasid_exception();
> +}
>
> /me goes looking for comments in that function, lemme get out the
> electron microscope, because I can't seem to spot them with the naked
> eye.
If you have ctags installed then a ctrl-] on that
__fixup_pasid_exception() will take you to the function with
the comments. No electron microscope needed.
+
+/*
+ * Try to figure out if there is a PASID MSR value to propagate to the
+ * thread taking the #GP.
+ */
+bool __fixup_pasid_exception(void)
+{
+ u32 pasid;
+
+ /*
+ * This function is called only when this #GP was triggered from user
+ * space. So the mm cannot be NULL.
+ */
+ pasid = current->mm->pasid;
+
+ /* If no PASID is allocated, there is nothing to propagate. */
+ if (pasid == PASID_DISABLED)
+ return false;
+
+ /*
+ * If the current task already has a valid PASID MSR, then the #GP
+ * fault must be for some non-ENQCMD related reason.
+ */
+ if (current->has_valid_pasid)
+ return false;
+
+ /* Fix up the MSR by the PASID in the mm. */
+ fpu__pasid_write(pasid);
+ current->has_valid_pasid = 1;
+
+ return true;
+}
-Tony
On Thu, Sep 23, 2021 at 10:14:42AM -0700, Luck, Tony wrote:
> On Wed, Sep 22, 2021 at 11:07:22PM +0200, Peter Zijlstra wrote:
> > On Mon, Sep 20, 2021 at 07:23:45PM +0000, Fenghua Yu wrote:
> > > @@ -538,6 +547,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
> > >
> > > cond_local_irq_enable(regs);
> > >
> > > + if (user_mode(regs) && fixup_pasid_exception())
> > > + goto exit;
> > > +
>
> > So you're eating any random #GP that might or might not be PASID
> > related. And all that witout a comment... Enlighten?
>
> This is moderately well commented inside the fixup_pasid_exception()
> function. Another copy of the comments here at the call-site seems
> overkill.
+static bool fixup_pasid_exception(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ return __fixup_pasid_exception();
+}
/me goes looking for comments in that function, lemme get out the
electron microscope, because I can't seem to spot them with the naked
eye.
On Thu, Sep 23, 2021 at 04:17:05PM -0700, Andy Lutomirski wrote:
> > + fpregs_lock();
> > +
> > + /*
> > + * If the task's FPU doesn't need to be loaded or is valid, directly
> > + * write the IA32_PASID MSR. Otherwise, write the PASID state and
> > + * the MSR will be loaded from the PASID state before returning to
> > + * the user.
> > + */
> > + if (!test_thread_flag(TIF_NEED_FPU_LOAD) ||
> > + fpregs_state_valid(fpu, smp_processor_id())) {
> > + wrmsrl(MSR_IA32_PASID, msr_val);
>
> Let me try to decode this.
>
> If the current task's FPU state is live or if the state is in memory but the CPU regs just happen to match the copy in memory, then write the MSR. Else write the value to memory.
>
> This is wrong. If !TIF_NEED_FPU_LOAD && fpregs_state_valid, you MUST NOT MODIFY FPU STATE. This is not negotiable -- you will break coherence between CPU regs and the memory image. The way you modify the current task's state is either you modify it in CPU regs (if the kernel knows that the CPU regs are the one and only source of truth) OR you modify it in memory and invalidate any preserved copies (by zapping last_cpu).
>
> In any event, that particular bit of logic really doesn't belong in here -- it belongs in some helper that gets it right, once.
Andy,
A helper sounds like a good idea. Can you flesh out what
you would like that to look like?
Is it just the "where is the live register state?" so the
above could be written:
if (xsave_state_in_memory(args ...))
update pasid bit of xsave state in memory
else
wrmsrl(MSR_IA32_PASID, msr_val);
Or are you thinking of a helper that does both the check
and the update ... so the code here could be:
update_one_xsave_feature(XFEATURE_PASID, &msr_val, sizeof(msr_val));
With the helper being something like:
void update_one_xsave_feature(enum xfeature xfeature, void *data, size_t size)
{
if (xsave_state_in_memory(args ...)) {
addr = get_xsave_addr(xsave, xfeature);
memcpy(addr, data, size);
xsave->header.xfeatures |= (1 << xfeature);
return;
}
switch (xfeature) {
case XFEATURE_PASID:
wrmsrl(MSR_IA32_PASID, *(u64 *)data);
break;
case each_of_the_other_XFEATURE_enums:
code to update registers for that XFEATURE
}
}
either way needs the definitive correct coding for xsave_state_in_memory()
-Tony
On 9/27/21 2:02 PM, Luck, Tony wrote:
> Or are you thinking of a helper that does both the check
> and the update ... so the code here could be:
>
> update_one_xsave_feature(XFEATURE_PASID, &msr_val, sizeof(msr_val));
>
> With the helper being something like:
>
> void update_one_xsave_feature(enum xfeature xfeature, void *data, size_t size)
> {
> if (xsave_state_in_memory(args ...)) {
> addr = get_xsave_addr(xsave, xfeature);
> memcpy(addr, data, size);
> xsave->header.xfeatures |= (1 << xfeature);
> return;
> }
>
> switch (xfeature) {
> case XFEATURE_PASID:
> wrmsrl(MSR_IA32_PASID, *(u64 *)data);
> break;
>
> case each_of_the_other_XFEATURE_enums:
> code to update registers for that XFEATURE
> }
> }
>
> either way needs the definitive correct coding for xsave_state_in_memory()
That's close to what we want.
The size should probably be implicit. If it isn't implicit, it needs to
at least be double-checked against the state sizes.
Not to get too fancy, but I think we also want to have a "replace"
operation which is separate from the "update". Think of a case where
you are trying to set a bit:
struct pkru_state *pk = start_update_xstate(tsk, XSTATE_PKRU);
pk->pkru |= 0x100;
finish_update_xstate(tsk, XSTATE_PKRU, pk);
versus setting a whole new value:
struct pkru_state *pk = start_replace_xstate(tsk, XSTATE_PKRU);
memset(pkru, sizeof(*pk), 0);
pk->pkru = 0x1234;
finish_replace_xstate(tsk, XSTATE_PKRU, pk);
They look similar, but they actually might have very different costs for
big states like AMX. We can also do some very different debugging for
them. In normal operation, these _can_ just return pointers directly to
the fpu...->xstate in some case. But, if we're debugging, we could
kmalloc() a buffer and do sanity checking before it goes into the task
buffer.
We don't have to do any of that fancy stuff now. We don't even need to
do an "update" if all we need for now for XFEATURE_PASID is a "replace".
1. Hide whether we need to write to real registers
2. Hide whether we need to update the in-memory image
3. Hide other FPU infrastructure like the TIF flag.
4. Make the users deal with a *whole* state in the replace API
On Mon, Sep 27, 2021 at 04:51:25PM -0700, Dave Hansen wrote:
> That's close to what we want.
>
> The size should probably be implicit. If it isn't implicit, it needs to
> at least be double-checked against the state sizes.
>
> Not to get too fancy, but I think we also want to have a "replace"
> operation which is separate from the "update". Think of a case where
> you are trying to set a bit:
>
> struct pkru_state *pk = start_update_xstate(tsk, XSTATE_PKRU);
> pk->pkru |= 0x100;
> finish_update_xstate(tsk, XSTATE_PKRU, pk);
>
> versus setting a whole new value:
>
> struct pkru_state *pk = start_replace_xstate(tsk, XSTATE_PKRU);
> memset(pkru, sizeof(*pk), 0);
> pk->pkru = 0x1234;
> finish_replace_xstate(tsk, XSTATE_PKRU, pk);
>
> They look similar, but they actually might have very different costs for
> big states like AMX. We can also do some very different debugging for
> them. In normal operation, these _can_ just return pointers directly to
> the fpu...->xstate in some case. But, if we're debugging, we could
> kmalloc() a buffer and do sanity checking before it goes into the task
> buffer.
>
> We don't have to do any of that fancy stuff now. We don't even need to
> do an "update" if all we need for now for XFEATURE_PASID is a "replace".
>
> 1. Hide whether we need to write to real registers
> 2. Hide whether we need to update the in-memory image
> 3. Hide other FPU infrastructure like the TIF flag.
> 4. Make the users deal with a *whole* state in the replace API
Is that difference just whether you need to save the
state from registers to memory (for the "update" case)
or not (for the "replace" case ... where you can ignore
the current register, overwrite the whole per-feature
xsave area and mark it to be restored to registers).
If so, just a "bool full" argument might do the trick?
Also - you have a "tsk" argument in your pseudo code. Is
this needed? Are there places where we need to perform
these operations on something other than "current"?
pseudo-code:
void *begin_update_one_xsave_feature(enum xfeature xfeature, bool full)
{
void *addr;
BUG_ON(!(xsave->header.xcomp_bv & xfeature));
addr = __raw_xsave_addr(xsave, xfeature);
fpregs_lock();
if (full)
return addr;
if (xfeature registers are "live")
xsaves(xstate, 1 << xfeature);
return addr;
}
void finish_update_one_xsave_feature(enum xfeature xfeature)
{
mark feature modified
set TIF bit
fpregs_unlock();
}
-Tony
On 9/28/21 11:50 AM, Luck, Tony wrote:
> On Mon, Sep 27, 2021 at 04:51:25PM -0700, Dave Hansen wrote:
...
>> 1. Hide whether we need to write to real registers
>> 2. Hide whether we need to update the in-memory image
>> 3. Hide other FPU infrastructure like the TIF flag.
>> 4. Make the users deal with a *whole* state in the replace API
>
> Is that difference just whether you need to save the
> state from registers to memory (for the "update" case)
> or not (for the "replace" case ... where you can ignore
> the current register, overwrite the whole per-feature
> xsave area and mark it to be restored to registers).
>
> If so, just a "bool full" argument might do the trick?
I want to be able to hide the complexity of where the old state comes
from. It might be in registers or it might be in memory or it might be
*neither*. It's possible we're running with stale register state and a
current->...->xsave buffer that has XFEATURES&XFEATURE_FOO 0.
In that case, the "old" copy might be memcpy'd out of the init_task.
Or, for pkeys, we might build it ourselves with init_pkru_val.
> Also - you have a "tsk" argument in your pseudo code. Is
> this needed? Are there places where we need to perform
> these operations on something other than "current"?
Two cases come to mind:
1. Fork/clone where we are doing things to our child's XSAVE buffer
2. ptrace() where we are poking into another task's state
ptrace() goes for the *whole* buffer now. I'm not sure it would need
this per-feature API. I just call it out as something that we might
need in the future.
> pseudo-code:
>
> void *begin_update_one_xsave_feature(enum xfeature xfeature, bool full)
> {
> void *addr;
>
> BUG_ON(!(xsave->header.xcomp_bv & xfeature));
>
> addr = __raw_xsave_addr(xsave, xfeature);
>
> fpregs_lock();
>
> if (full)
> return addr;
If the feature is marked as in the init state in the buffer
(XSTATE_BV[feature]==0), this addr *could* contain total garbage. So,
we'd want to make sure that the memory contents have the init state
written before handing them back to the caller. That's not strictly
required if the user is writing the whole thing, but it's the nice thing
to do.
> if (xfeature registers are "live")
> xsaves(xstate, 1 << xfeature);
One little note: I don't think we would necessarily need to do an XSAVES
here. For PKRU, for instance, we could just do a rdpkru.
> return addr;
> }
>
> void finish_update_one_xsave_feature(enum xfeature xfeature)
> {
> mark feature modified
I think we'd want to do this at the "begin" time. Also, do you mean we
should set XSTATE_BV[feature]?
> set TIF bit
Since the XSAVE buffer was updated, it now contains the canonical FPU
state. It may have diverged from the register state, thus we need to
set TIF_NEED_FPU_LOAD.
It's also worth noting that we *could*:
xrstors(xstate, 1<<xfeature);
as well. That would bring the registers back up to day and we could
keep TIF_NEED_FPU_LOAD==0.
> fpregs_unlock();
> }
>
> -Tony
>
On Tue, Sep 28, 2021 at 12:19:22PM -0700, Dave Hansen wrote:
> On 9/28/21 11:50 AM, Luck, Tony wrote:
> > On Mon, Sep 27, 2021 at 04:51:25PM -0700, Dave Hansen wrote:
> ...
> >> 1. Hide whether we need to write to real registers
> >> 2. Hide whether we need to update the in-memory image
> >> 3. Hide other FPU infrastructure like the TIF flag.
> >> 4. Make the users deal with a *whole* state in the replace API
> >
> > Is that difference just whether you need to save the
> > state from registers to memory (for the "update" case)
> > or not (for the "replace" case ... where you can ignore
> > the current register, overwrite the whole per-feature
> > xsave area and mark it to be restored to registers).
> >
> > If so, just a "bool full" argument might do the trick?
>
> I want to be able to hide the complexity of where the old state comes
> from. It might be in registers or it might be in memory or it might be
> *neither*. It's possible we're running with stale register state and a
> current->...->xsave buffer that has XFEATURES&XFEATURE_FOO 0.
>
> In that case, the "old" copy might be memcpy'd out of the init_task.
> Or, for pkeys, we might build it ourselves with init_pkru_val.
So should there be an error case if there isn't an "old" state, and
the user calls:
p = begin_update_one_xsave_feature(XFEATURE_something, false);
Maybe instead of an error, just fill it in with the init state for the feature?
> > Also - you have a "tsk" argument in your pseudo code. Is
> > this needed? Are there places where we need to perform
> > these operations on something other than "current"?
>
> Two cases come to mind:
> 1. Fork/clone where we are doing things to our child's XSAVE buffer
> 2. ptrace() where we are poking into another task's state
>
> ptrace() goes for the *whole* buffer now. I'm not sure it would need
> this per-feature API. I just call it out as something that we might
> need in the future.
Ok - those seem ok ... it is up to the caller to make sure that the
target task is in some "not running, and can't suddenly start running"
state before calling these functions.
>
> > pseudo-code:
> >
> > void *begin_update_one_xsave_feature(enum xfeature xfeature, bool full)
> > {
> > void *addr;
> >
> > BUG_ON(!(xsave->header.xcomp_bv & xfeature));
> >
> > addr = __raw_xsave_addr(xsave, xfeature);
> >
> > fpregs_lock();
> >
> > if (full)
> > return addr;
>
> If the feature is marked as in the init state in the buffer
> (XSTATE_BV[feature]==0), this addr *could* contain total garbage. So,
> we'd want to make sure that the memory contents have the init state
> written before handing them back to the caller. That's not strictly
> required if the user is writing the whole thing, but it's the nice thing
> to do.
Nice guys waste CPU cycles writing to memory that is just going to get
written again.
>
> > if (xfeature registers are "live")
> > xsaves(xstate, 1 << xfeature);
>
> One little note: I don't think we would necessarily need to do an XSAVES
> here. For PKRU, for instance, we could just do a rdpkru.
Like this?
if (tsk == current) {
switch (xfeature) {
case XFEATURE_PKRU:
*(u32 *)addr = rdpkru();
break;
case XFEATURE_PASID:
rdmsrl(MSR_IA32_PASID, msr);
*(u64 *)addr = msr;
break;
... any other "easy" states ...
default:
xsaves(xstate, 1 << xfeature);
break;
}
}
>
> > return addr;
> > }
> >
> > void finish_update_one_xsave_feature(enum xfeature xfeature)
> > {
> > mark feature modified
>
> I think we'd want to do this at the "begin" time. Also, do you mean we
> should set XSTATE_BV[feature]?
Begin? End? It's all inside fpregs_lock(). But whatever seems best.
Yes, I think that this means set XSTATE_BV[feature] ... but I'm
relying on you as the xsave expert to help get the subtle bits right so
the Andy Lutomirski can smile at this code.
> > set TIF bit
>
> Since the XSAVE buffer was updated, it now contains the canonical FPU
> state. It may have diverged from the register state, thus we need to
> set TIF_NEED_FPU_LOAD.
Yes, that's the TIF bit my pseudo-code intended.
> It's also worth noting that we *could*:
>
> xrstors(xstate, 1<<xfeature);
>
> as well. That would bring the registers back up to day and we could
> keep TIF_NEED_FPU_LOAD==0.
Only makes sense if "tsk == current". But does this help. The work seems
to be the same whether we do it now, or later. We don't know for sure
that we will directly return to the task. We might context switch to
another task, so loading the state into registers now would just be
wasted time.
>
> > fpregs_unlock();
> > }
-Tony
On 9/28/21 1:28 PM, Luck, Tony wrote:
> On Tue, Sep 28, 2021 at 12:19:22PM -0700, Dave Hansen wrote:
>> On 9/28/21 11:50 AM, Luck, Tony wrote:
>>> On Mon, Sep 27, 2021 at 04:51:25PM -0700, Dave Hansen wrote:
>> ...
>>>> 1. Hide whether we need to write to real registers
>>>> 2. Hide whether we need to update the in-memory image
>>>> 3. Hide other FPU infrastructure like the TIF flag.
>>>> 4. Make the users deal with a *whole* state in the replace API
>>>
>>> Is that difference just whether you need to save the
>>> state from registers to memory (for the "update" case)
>>> or not (for the "replace" case ... where you can ignore
>>> the current register, overwrite the whole per-feature
>>> xsave area and mark it to be restored to registers).
>>>
>>> If so, just a "bool full" argument might do the trick?
>>
>> I want to be able to hide the complexity of where the old state comes
>> from. It might be in registers or it might be in memory or it might be
>> *neither*. It's possible we're running with stale register state and a
>> current->...->xsave buffer that has XFEATURES&XFEATURE_FOO 0.
>>
>> In that case, the "old" copy might be memcpy'd out of the init_task.
>> Or, for pkeys, we might build it ourselves with init_pkru_val.
>
> So should there be an error case if there isn't an "old" state, and
> the user calls:
>
> p = begin_update_one_xsave_feature(XFEATURE_something, false);
>
> Maybe instead of an error, just fill it in with the init state for the feature?
Yes, please. Let's not generate an error. Let's populate the init
state and let them move on with life.
>>> pseudo-code:
>>>
>>> void *begin_update_one_xsave_feature(enum xfeature xfeature, bool full)
>>> {
>>> void *addr;
>>>
>>> BUG_ON(!(xsave->header.xcomp_bv & xfeature));
>>>
>>> addr = __raw_xsave_addr(xsave, xfeature);
>>>
>>> fpregs_lock();
>>>
>>> if (full)
>>> return addr;
>>
>> If the feature is marked as in the init state in the buffer
>> (XSTATE_BV[feature]==0), this addr *could* contain total garbage. So,
>> we'd want to make sure that the memory contents have the init state
>> written before handing them back to the caller. That's not strictly
>> required if the user is writing the whole thing, but it's the nice thing
>> to do.
>
> Nice guys waste CPU cycles writing to memory that is just going to get
> written again.
Let's skip the "replace" operation for now and focus on "update". A
full replace *can* be faster because it doesn't require the state to be
written out. But, we don't need that for now.
Let's just do the "update" thing, and let's ensure that we reflect the
init state into the buffer that gets returned if the feature was in its
init state.
Sound good?
>>> if (xfeature registers are "live")
>>> xsaves(xstate, 1 << xfeature);
>>
>> One little note: I don't think we would necessarily need to do an XSAVES
>> here. For PKRU, for instance, we could just do a rdpkru.
>
> Like this?
>
> if (tsk == current) {
> switch (xfeature) {
> case XFEATURE_PKRU:
> *(u32 *)addr = rdpkru();
> break;
> case XFEATURE_PASID:
> rdmsrl(MSR_IA32_PASID, msr);
> *(u64 *)addr = msr;
> break;
> ... any other "easy" states ...
> default:
> xsaves(xstate, 1 << xfeature);
> break;
> }
> }
Yep.
>>> return addr;
>>> }
>>>
>>> void finish_update_one_xsave_feature(enum xfeature xfeature)
>>> {
>>> mark feature modified
>>
>> I think we'd want to do this at the "begin" time. Also, do you mean we
>> should set XSTATE_BV[feature]?
>
> Begin? End? It's all inside fpregs_lock(). But whatever seems best.
I'm actually waffling about it.
Does XSTATE_BV[feature] mean that state *is* there or that state *may*
be there? Either way works.
>> It's also worth noting that we *could*:
>>
>> xrstors(xstate, 1<<xfeature);
>>
>> as well. That would bring the registers back up to day and we could
>> keep TIF_NEED_FPU_LOAD==0.
>
> Only makes sense if "tsk == current". But does this help. The work seems
> to be the same whether we do it now, or later. We don't know for sure
> that we will directly return to the task. We might context switch to
> another task, so loading the state into registers now would just be
> wasted time.
True, but the other side of the coin is that setting TIF_NEED_FPU_LOAD
subjects us to an XRSTOR on the way out to userspace. That XRSTOR might
just be updating one state component in practice.
Either way, sorry for the distraction. We (me) should really be
focusing on getting something that is functional but slow.
Moving beyond pseudo-code and into compiles-but-probably-broken-code.
The intent of the functions below is that Fenghua should be able to
do:
void fpu__pasid_write(u32 pasid)
{
u64 msr_val = pasid | MSR_IA32_PASID_VALID;
struct ia32_pasid_state *addr;
addr = begin_update_one_xsave_feature(current, XFEATURE_PASID, true);
addr->pasid = msr_val;
finish_update_one_xsave_feature(current);
}
So here's the two new functions that would be added to
arch/x86/kernel/fpu/xstate.c
----
void *begin_update_one_xsave_feature(struct task_struct *tsk,
enum xfeature xfeature, bool full)
{
struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
struct xregs_state *xinit = &init_fpstate.xsave;
u64 fmask = 1ull << xfeature;
void *addr;
BUG_ON(!(xsave->header.xcomp_bv & fmask));
fpregs_lock();
addr = __raw_xsave_addr(xsave, xfeature);
if (full || tsk != current) {
memcpy(addr, __raw_xsave_addr(xinit, xfeature), xstate_sizes[xfeature]);
goto out;
}
/* could optimize some cases where xsaves() isn't fastest option */
if (!(xsave->header.xfeatures & fmask))
xsaves(xsave, fmask);
out:
xsave->header.xfeatures |= fmask;
return addr;
}
void finish_update_one_xsave_feature(struct task_struct *tsk)
{
set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_FPU_LOAD);
fpregs_unlock();
}
----
-Tony
Hi, Tony,
On Tue, Sep 28, 2021 at 04:10:39PM -0700, Luck, Tony wrote:
> Moving beyond pseudo-code and into compiles-but-probably-broken-code.
>
>
> The intent of the functions below is that Fenghua should be able to
> do:
>
> void fpu__pasid_write(u32 pasid)
> {
> u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> struct ia32_pasid_state *addr;
>
> addr = begin_update_one_xsave_feature(current, XFEATURE_PASID, true);
> addr->pasid = msr_val;
> finish_update_one_xsave_feature(current);
> }
>
> So here's the two new functions that would be added to
> arch/x86/kernel/fpu/xstate.c
>
> ----
>
> void *begin_update_one_xsave_feature(struct task_struct *tsk,
> enum xfeature xfeature, bool full)
> {
> struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
> struct xregs_state *xinit = &init_fpstate.xsave;
> u64 fmask = 1ull << xfeature;
> void *addr;
>
> BUG_ON(!(xsave->header.xcomp_bv & fmask));
>
> fpregs_lock();
>
> addr = __raw_xsave_addr(xsave, xfeature);
>
> if (full || tsk != current) {
> memcpy(addr, __raw_xsave_addr(xinit, xfeature), xstate_sizes[xfeature]);
> goto out;
> }
>
> /* could optimize some cases where xsaves() isn't fastest option */
> if (!(xsave->header.xfeatures & fmask))
> xsaves(xsave, fmask);
If xfeatures's feature bit is 0, xsaves will not write its init value to the
memory due to init optimization. So the xsaves will do nothing and the
state is not initialized and may have random data.
>
> out:
> xsave->header.xfeatures |= fmask;
> return addr;
> }
>
> void finish_update_one_xsave_feature(struct task_struct *tsk)
> {
> set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_FPU_LOAD);
Setting TIF_NEED_FPU_LOAD cannot guaranteed to execute XRSTORS on exiting
to user. In fpregs_restore_userregs():
if (!fpregs_state_valid(fpu, cpu)) {
...
__restore_fpregs_from_fpstate(&fpu->state, mask);
...
}
fpregs state should be invalid to get the XRSTROS executed.
So setting TIF_NEED_FPU_LOAD may get the FPU register unchanged on exiting
to user.
> fpregs_unlock();
> }
Thanks.
-Fenghua
On Tue, Sep 28, 2021 at 11:50:37PM +0000, Fenghua Yu wrote:
> If xfeatures's feature bit is 0, xsaves will not write its init value to the
> memory due to init optimization. So the xsaves will do nothing and the
> state is not initialized and may have random data.
> Setting TIF_NEED_FPU_LOAD cannot guaranteed to execute XRSTORS on exiting
> to user. In fpregs_restore_userregs():
> if (!fpregs_state_valid(fpu, cpu)) {
> ...
> __restore_fpregs_from_fpstate(&fpu->state, mask);
> ...
> }
>
> fpregs state should be invalid to get the XRSTROS executed.
>
> So setting TIF_NEED_FPU_LOAD may get the FPU register unchanged on exiting
> to user.
Does this help?
Changed lines marked with //<<<<<
-Tony
void *begin_update_one_xsave_feature(struct task_struct *tsk,
enum xfeature xfeature, bool full)
{
struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
struct xregs_state *xinit = &init_fpstate.xsave;
u64 fmask = 1ull << xfeature;
void *addr;
BUG_ON(!(xsave->header.xcomp_bv & fmask));
fpregs_lock();
addr = __raw_xsave_addr(xsave, xfeature);
if (full || tsk != current) {
memcpy(addr, __raw_xsave_addr(xinit, xfeature), xstate_sizes[xfeature]);
goto out;
}
if (!(xsave->header.xfeatures & fmask)) {
xsave->header.xfeatures |= fmask; //<<<<<
xsaves(xsave, fmask);
}
out:
xsave->header.xfeatures |= fmask;
return addr;
}
void finish_update_one_xsave_feature(struct task_struct *tsk)
{
set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_FPU_LOAD);
if (tsk == current) //<<<<<
__cpu_invalidate_fpregs_state(); //<<<<<
fpregs_unlock();
}
Hi, Tony,
> void *begin_update_one_xsave_feature(struct task_struct *tsk,
> enum xfeature xfeature, bool full) {
> struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
> struct xregs_state *xinit = &init_fpstate.xsave;
> u64 fmask = 1ull << xfeature;
> void *addr;
>
> BUG_ON(!(xsave->header.xcomp_bv & fmask));
>
> fpregs_lock();
I'm afraid we may hit the same locking issue when we send IPI to notify another task to modify its
PASID state. Here the API is called to modify another running task's PASID state as well without a right lock.
fpregs_lock() is not enough to deal with this, I'm afraid.
Quote from Thomas in https://lore.kernel.org/linux-iommu/[email protected]/
"FPU state of a running task is protected by fregs_lock() which is
nothing else than a local_bh_disable(). As BH disabled regions run
usually with interrupts enabled the IPI can hit a code section which
modifies FPU state and there is absolutely no guarantee that any of the
assumptions which are made for the IPI case is true."
Maybe restrict the API's scope: don't modify another task's FPU state, just the current task's state?
> addr = __raw_xsave_addr(xsave, xfeature);
>
> if (full || tsk != current) {
> memcpy(addr, __raw_xsave_addr(xinit, xfeature),
> xstate_sizes[xfeature]);
> goto out;
> }
>
> if (!(xsave->header.xfeatures & fmask)) {
> xsave->header.xfeatures |= fmask; //<<<<<
> xsaves(xsave, fmask);
> }
>
> out:
> xsave->header.xfeatures |= fmask;
> return addr;
> }
>
> void finish_update_one_xsave_feature(struct task_struct *tsk) {
> set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_FPU_LOAD);
> if (tsk == current) //<<<<<
> __cpu_invalidate_fpregs_state(); //<<<<<
> fpregs_unlock();
> }
Thanks.
-Fenghua
>> fpregs_lock();
>
> I'm afraid we may hit the same locking issue when we send IPI to notify another task to modify its
> PASID state. Here the API is called to modify another running task's PASID state as well without a right lock.
> fpregs_lock() is not enough to deal with this, I'm afraid.
We don't send IPI any more to change PASID state. The only place that the
current patch series touches the PASID MSR is in the #GP fault handler.
-Tony
Hi, Tony,
On Tue, Sep 28, 2021 at 06:06:52PM -0700, Luck, Tony wrote:
> >> fpregs_lock();
> >
> > I'm afraid we may hit the same locking issue when we send IPI to notify another task to modify its
> > PASID state. Here the API is called to modify another running task's PASID state as well without a right lock.
> > fpregs_lock() is not enough to deal with this, I'm afraid.
>
> We don't send IPI any more to change PASID state. The only place that the
> current patch series touches the PASID MSR is in the #GP fault handler.
It's safe for the helpers to handle the PASID case (modifying the current task's
PASID state in #GP).
But the helpers seem to be generic. They take "task" as a parameter and
handle the task as non-current case. So the helpers are not for PASID only.
They may be used by others to modify a running task's FPU state. But
It's not safe to do so.
At least need some comments/restriction for the helpers to be used on
a running task?
Thanks.
-Fenghua
Hi, Tony,
> void *begin_update_one_xsave_feature(struct task_struct *tsk,
> enum xfeature xfeature, bool full) {
> struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
> struct xregs_state *xinit = &init_fpstate.xsave;
> u64 fmask = 1ull << xfeature;
> void *addr;
>
> BUG_ON(!(xsave->header.xcomp_bv & fmask));
>
> fpregs_lock();
>
> addr = __raw_xsave_addr(xsave, xfeature);
>
> if (full || tsk != current) {
> memcpy(addr, __raw_xsave_addr(xinit, xfeature),
> xstate_sizes[xfeature]);
> goto out;
> }
>
> if (!(xsave->header.xfeatures & fmask)) {
> xsave->header.xfeatures |= fmask; //<<<<<
> xsaves(xsave, fmask);
> }
I'm not sure why the FPU state is initialized here.
For updating the PASID state, it's unnecessary to init the PASID state.
Maybe it is necessary in other cases?
>
> out:
> xsave->header.xfeatures |= fmask;
Setting the xfeatures bit plus updating the PASID state is enough
to restore the PASID state to the IA32_PASID MSR.
> return addr;
> }
>
> void finish_update_one_xsave_feature(struct task_struct *tsk) {
> set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_FPU_LOAD);
> if (tsk == current) //<<<<<
> __cpu_invalidate_fpregs_state(); //<<<<<
> fpregs_unlock();
> }
Thanks.
-Fenghua
> But the helpers seem to be generic. They take "task" as a parameter and
> handle the task as non-current case. So the helpers are not for PASID only.
> They may be used by others to modify a running task's FPU state. But
> It's not safe to do so.
>
> At least need some comments/restriction for the helpers to be used on
> a running task?
Fenghua,
Correct. When I add some comments I'll make it very clear that it is the
responsibility of the caller to make sure that the task that is targeted
cannot run.
Earlier in this thread Dave suggested there are two cases where these
helpers might be useful:
1) Fork/clone - to set up some xsave state in the child ... but this would be
done before the child is allowed to run.
2) ptrace - this is a "maybe" because ptrace already has code to handle all
the xsave state as a single entity. Perhaps someone might want to change it
to only modify a single feature ... but this seems unlikely. In any case the
ptrace code already "stops" the target process while it is reading/writing state.
-Tony
>> if (!(xsave->header.xfeatures & fmask)) {
>> xsave->header.xfeatures |= fmask; //<<<<<
>> xsaves(xsave, fmask);
>> }
>
> I'm not sure why the FPU state is initialized here.
>
> For updating the PASID state, it's unnecessary to init the PASID state.
>
> Maybe it is necessary in other cases?
Dave had suggested initializing feature state when it is unknown (could
be garbage). This is my attempt to follow that guidance. I'm not confident
that my tests for "is the state in registers, in memory, or is garbage"
really capture all the cases.
-Tony
On Fri, Sep 24, 2021 at 08:39:24AM -0700, Luck, Tony wrote:
> If you have ctags installed then a ctrl-] on that
> __fixup_pasid_exception() will take you to the function with
> the comments. No electron microscope needed.
I to use ctags, but when reading the #GP handler, this is a whole
different file. Also, I don't find any of those comments explaining the
not-our-#GP-but-harmless-cycle issue.
The current->has_valid_pasid one comes close, but just misses it. But
really the place to put this is in the #GP handler itself so we don't
have to dig through every call there to figure out how it's supposed to
work.
> +
> +/*
> + * Try to figure out if there is a PASID MSR value to propagate to the
> + * thread taking the #GP.
> + */
> +bool __fixup_pasid_exception(void)
> +{
> + u32 pasid;
> +
> + /*
> + * This function is called only when this #GP was triggered from user
> + * space. So the mm cannot be NULL.
> + */
> + pasid = current->mm->pasid;
> +
> + /* If no PASID is allocated, there is nothing to propagate. */
> + if (pasid == PASID_DISABLED)
> + return false;
> +
> + /*
> + * If the current task already has a valid PASID MSR, then the #GP
> + * fault must be for some non-ENQCMD related reason.
> + */
> + if (current->has_valid_pasid)
> + return false;
> +
> + /* Fix up the MSR by the PASID in the mm. */
> + fpu__pasid_write(pasid);
> + current->has_valid_pasid = 1;
> +
> + return true;
> +}
>
> -Tony
On 9/28/21 16:10, Luck, Tony wrote:
> Moving beyond pseudo-code and into compiles-but-probably-broken-code.
>
>
> The intent of the functions below is that Fenghua should be able to
> do:
>
> void fpu__pasid_write(u32 pasid)
> {
> u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> struct ia32_pasid_state *addr;
>
> addr = begin_update_one_xsave_feature(current, XFEATURE_PASID, true);
> addr->pasid = msr_val;
> finish_update_one_xsave_feature(current);
> }
>
This gets gnarly because we would presumably like to optimize the case
where we can do the update directly in registers. I wonder if we can do
it with a bit of macro magic in a somewhat generic way:
typedef fpu__pasid_type u32;
static inline void fpu__set_pasid_in_register(const u32 *value)
{
wrmsr(...);
}
#define DEFINE_FPU_HELPER(name) \
static inline void fpu__set_##name(const fpu__##name##_type *val) \
{ \
fpregs_lock(); \
if (should write in memory) { \
->xfeatures |= XFEATURE_##name; \
ptr = get_xsave_addr(...); \
memcpy(ptr, val, sizeof(*val)); \
__fpu_invalidate_fpregs_state(...); \
} else { \
fpu__set_##name##_in_register(val); \
} \
}
On Wed, Sep 29, 2021 at 09:58:22AM -0700, Andy Lutomirski wrote:
> On 9/28/21 16:10, Luck, Tony wrote:
> > Moving beyond pseudo-code and into compiles-but-probably-broken-code.
> >
> >
> > The intent of the functions below is that Fenghua should be able to
> > do:
> >
> > void fpu__pasid_write(u32 pasid)
> > {
> > u64 msr_val = pasid | MSR_IA32_PASID_VALID;
> > struct ia32_pasid_state *addr;
> >
> > addr = begin_update_one_xsave_feature(current, XFEATURE_PASID, true);
> > addr->pasid = msr_val;
> > finish_update_one_xsave_feature(current);
> > }
> >
>
> This gets gnarly because we would presumably like to optimize the case where
> we can do the update directly in registers. I wonder if we can do it with a
> bit of macro magic in a somewhat generic way:
Can we defere the optimizations until there is a use case that
cares? The existing use case (fixing up the #GP fault by setting
the PASID MSR) isn't performance critical.
Let's just have something that is clear (or as clear as any xsave
code can be) and correct.
-Tony
On Wed, Sep 29, 2021, at 10:07 AM, Luck, Tony wrote:
> On Wed, Sep 29, 2021 at 09:58:22AM -0700, Andy Lutomirski wrote:
>> On 9/28/21 16:10, Luck, Tony wrote:
>> > Moving beyond pseudo-code and into compiles-but-probably-broken-code.
>> >
>> >
>> > The intent of the functions below is that Fenghua should be able to
>> > do:
>> >
>> > void fpu__pasid_write(u32 pasid)
>> > {
>> > u64 msr_val = pasid | MSR_IA32_PASID_VALID;
>> > struct ia32_pasid_state *addr;
>> >
>> > addr = begin_update_one_xsave_feature(current, XFEATURE_PASID, true);
>> > addr->pasid = msr_val;
>> > finish_update_one_xsave_feature(current);
>> > }
>> >
>>
>> This gets gnarly because we would presumably like to optimize the case where
>> we can do the update directly in registers. I wonder if we can do it with a
>> bit of macro magic in a somewhat generic way:
>
> Can we defere the optimizations until there is a use case that
> cares? The existing use case (fixing up the #GP fault by setting
> the PASID MSR) isn't performance critical.
>
> Let's just have something that is clear (or as clear as any xsave
> code can be) and correct.
>
>
The goal would be to use the same code for CET and PKRU, I think.