Provide a mechanism to flush the L1D cache on context switch. The goal
is to allow tasks that are paranoid due to the recent snoop assisted data
sampling vulnerabilites, to flush their L1D on being switched out.
This protects their data from being snooped or leaked via side channels
after the task has context switched out.
The core of the patches is patch 3, the rest largely refactor the code
so that common bits can be reused.
Changelog v3:
- Refactor the return value of what flush_l1d_cache_hw() returns
- Refactor the code, so that the generic setup bits come first
(patch 3 from previous posting is now patches 3 and 4)
- Move from arch_prctl() to the prctl() interface as recommend
in the reviews.
Changelog v2:
- Fix a miss of mutex_unlock (caught by Borislav Petkov <[email protected]>)
- Add documentation about the changes (Josh Poimboeuf
<[email protected]>)
Changelog:
- Refactor the code and reuse cond_ibpb() - code bits provided by tglx
- Merge mm state tracking for ibpb and l1d flush
- Rename TIF_L1D_FLUSH to TIF_SPEC_FLUSH_L1D
Changelog RFC:
- Reuse existing code for allocation and flush
- Simplify the goto logic in the actual l1d_flush function
- Optimize the code path with jump labels/static functions
The previous version of this patch posted at:
https://lore.kernel.org/lkml/[email protected]/
Balbir Singh (5):
arch/x86/kvm: Refactor l1d flush lifecycle management
arch/x86: Refactor tlbflush and l1d flush
arch/x86/mm: Refactor cond_ibpb() to support other use cases
arch/x86: Optionally flush L1D on context switch
arch/x86: Add L1D flushing Documentation
Documentation/admin-guide/hw-vuln/index.rst | 1 +
.../admin-guide/hw-vuln/l1d_flush.rst | 40 +++++++
arch/x86/include/asm/cacheflush.h | 6 +
arch/x86/include/asm/thread_info.h | 6 +-
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/l1d_flush.c | 85 ++++++++++++++
arch/x86/kvm/vmx/vmx.c | 56 ++-------
arch/x86/mm/tlb.c | 109 ++++++++++++++----
include/uapi/linux/prctl.h | 4 +
kernel/sys.c | 20 ++++
11 files changed, 259 insertions(+), 71 deletions(-)
create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst
create mode 100644 arch/x86/kernel/l1d_flush.c
--
2.17.1
Implement a mechanism to selectively flush the L1D cache. The goal is to
allow tasks that are paranoid due to the recent snoop assisted data sampling
vulnerabilites, to flush their L1D on being switched out. This protects
their data from being snooped or leaked via side channels after the task
has context switched out.
There are two scenarios we might want to protect against, a task leaving
the CPU with data still in L1D (which is the main concern of this patch),
the second scenario is a malicious task coming in (not so well trusted)
for which we want to clean up the cache before it starts. Only the case
for the former is addressed.
Add prctl()'s to opt-in to the L1D cache on context switch out, the
existing mechanisms of tracking prev_mm via cpu_tlbstate is
reused.
A new thread_info flag TIF_SPEC_FLUSH_L1D is added to track tasks which
opt-into L1D flushing. cpu_tlbstate.last_user_mm_spec is used to convert
the TIF flags into mm state (per cpu via last_user_mm_spec) in
cond_mitigation(), which then used to do decide when to call flush_l1d().
The current version benefited from discussions with Kees and Thomas.
Thomas suggested and provided the code snippet for refactoring the
existing cond_ibpb() code.
Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/thread_info.h | 6 ++-
arch/x86/mm/tlb.c | 69 +++++++++++++++++++++++++++++-
include/uapi/linux/prctl.h | 4 ++
kernel/sys.c | 20 +++++++++
4 files changed, 96 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8de8ceccb8bc..be25cc0c677d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -84,7 +84,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
+#define TIF_SPEC_FLUSH_L1D 10 /* Flush L1D on mm switches (processes) */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -96,6 +96,7 @@ struct thread_info {
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -132,6 +133,7 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
#define _TIF_FSCHECK (1 << TIF_FSCHECK)
+#define _TIF_SPEC_FLUSH_L1D (1 << TIF_SPEC_FLUSH_L1D)
/* Work to do before invoking the actual syscall. */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -239,6 +241,8 @@ extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
extern void arch_setup_new_exec(void);
+extern int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable);
+extern int arch_prctl_l1d_flush_get(struct task_struct *tsk);
#define arch_setup_new_exec arch_setup_new_exec
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index da5c94286c7d..85b8eec0ff07 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
+#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
@@ -33,11 +34,12 @@
*/
/*
- * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
* stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
-#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
+#define LAST_USER_MM_FLUSH_L1D 0x2UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_FLUSH_L1D)
/*
* We get here when we do something requiring a TLB invalidation
@@ -152,6 +154,64 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
+static void *l1d_flush_pages;
+static DEFINE_MUTEX(l1d_flush_mutex);
+
+int enable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ struct page *page;
+ int ret = 0;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
+ goto done;
+
+ page = READ_ONCE(l1d_flush_pages);
+ if (unlikely(!page)) {
+ mutex_lock(&l1d_flush_mutex);
+ if (!l1d_flush_pages) {
+ l1d_flush_pages = alloc_l1d_flush_pages();
+ if (!l1d_flush_pages) {
+ mutex_unlock(&l1d_flush_mutex);
+ return -ENOMEM;
+ }
+ }
+ mutex_unlock(&l1d_flush_mutex);
+ }
+done:
+ set_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+ return ret;
+}
+
+int disable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ clear_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+ return 0;
+}
+
+int arch_prctl_l1d_flush_get(struct task_struct *tsk)
+{
+ return test_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+}
+
+int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
+{
+ if (enable)
+ return enable_l1d_flush_for_task(tsk);
+ return disable_l1d_flush_for_task(tsk);
+}
+
+/*
+ * Flush the L1D cache for this CPU. We want to this at switch mm time,
+ * this is a pessimistic security measure and an opt-in for those tasks
+ * that host sensitive information.
+ */
+static void flush_l1d(void)
+{
+ if (flush_l1d_cache_hw() == 0)
+ return;
+ flush_l1d_cache_sw(l1d_flush_pages);
+}
+
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -195,6 +255,8 @@ static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
+ BUILD_BUG_ON(TIF_SPEC_FLUSH_L1D != TIF_SPEC_IB + 1);
+
return (unsigned long)next->mm | spec_bits;
}
@@ -268,6 +330,9 @@ static void cond_mitigation(struct task_struct *next)
indirect_branch_prediction_barrier();
}
+ if (prev_mm & LAST_USER_MM_FLUSH_L1D)
+ flush_l1d();
+
this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..42cb3038c81a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,8 @@ struct prctl_mm_map {
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58
+/* Flush L1D on context switch (mm) */
+#define PR_SET_L1D_FLUSH 59
+#define PR_GET_L1D_FLUSH 60
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..578aa8b6d87e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2262,6 +2262,16 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
+int __weak arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
+{
+ return -EINVAL;
+}
+
+int __weak arch_prctl_l1d_flush_get(struct task_struct *t)
+{
+ return -EINVAL;
+}
+
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_L1D_FLUSH:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_l1d_flush_set(me, arg2);
+ break;
+ case PR_GET_L1D_FLUSH:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_l1d_flush_get(me);
+ break;
default:
error = -EINVAL;
break;
--
2.17.1
Balbir Singh <[email protected]> writes:
> +static void *l1d_flush_pages;
> +static DEFINE_MUTEX(l1d_flush_mutex);
> +
> +int enable_l1d_flush_for_task(struct task_struct *tsk)
static ?
> +{
> + struct page *page;
> + int ret = 0;
> +
> + if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
> + goto done;
> +
> + page = READ_ONCE(l1d_flush_pages);
> + if (unlikely(!page)) {
> + mutex_lock(&l1d_flush_mutex);
> + if (!l1d_flush_pages) {
> + l1d_flush_pages = alloc_l1d_flush_pages();
> + if (!l1d_flush_pages) {
> + mutex_unlock(&l1d_flush_mutex);
> + return -ENOMEM;
> + }
> + }
> + mutex_unlock(&l1d_flush_mutex);
> + }
So this is +/- the mutex the same code as KVM has. Why is this not moved
into l1d_flush.c, i.e.
static void *l1d_flush_pages;
static DEFINE_MUTEX(l1d_flush_mutex);
int l1d_flush_init(void)
{
int ret;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
return 0;
mutex_lock(&l1d_flush_mutex);
if (!l1d_flush_pages)
l1d_flush_pages = l1d_flush_alloc_pages();
ret = l1d_flush_pages ? 0 : -ENOMEM;
mutex_unlock(&l1d_flush_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(l1d_flush_init);
which removes the export of l1d_flush_alloc_pages() and gets rid of the
cleanup counterpart. In a real world deployment unloading of VMX if used
once is unlikely and with the task based one you end up with these pages
'leaked' anyway if used once.
Can we please make this simple and consistent?
> +done:
> + set_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> + return ret;
> +}
> +
> +int disable_l1d_flush_for_task(struct task_struct *tsk)
static void?
> +{
> + clear_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> + return 0;
> +}
> +
> +int arch_prctl_l1d_flush_get(struct task_struct *tsk)
> +{
> + return test_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> +}
> +
> +int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
> +{
> + if (enable)
> + return enable_l1d_flush_for_task(tsk);
> + return disable_l1d_flush_for_task(tsk);
> +}
If any other architecture enables this, then it will have _ALL_ of this
code duplicated. So we should rather have:
- CONFIG_FLUSH_L1D which gets selected by features requesting
it, i.e. X86 + VMX
- CONFIG_FLUSH_L1D_PRCTL which gets selected by architectures
supporting it, i.e. X86. This selects CONFIG_FLUSH_L1D and enables
the prctl logic.
- arch/xxx/include/asm/l1d_flush.h which has for x86:
#include <linux/l1d_flush.h>
#include <asm/thread_info.h>
#define L1D_CACHE_ORDER 4
static inline bool arch_has_l1d_flush_hw(void)
{
return static_cpu_has(X86_FEATURE_FLUSH_L1D);
}
// This is to make the allocation function conditional or if an
// architecture knows upfront compile time optimized.
static inline void arch_l1d_flush(void *pages, unsigned long options)
{
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
...
return;
}
if (options & POPULATE_TLB)
l1d_flush_populate_tlb(pages);
l1d_flush_sw(pages);
}
// The option bits go into linux/l1d_flush.h and the asm header has
// exactly one file which includes it: lib/l1d_flush.c
//
// All other places (VMX, arch context switch code) include
// linux/l1d_flush.h which also contains the prototypes for
// l1d_flush_init() and l1d_flush().
- Have l1d_flush_init() and the alloc function in lib/l1d_flush.c
- The flush invocation in lib/l1d_flush.c:
void l1d_flush(unsigned long options)
{
arch_l1d_flush(l1d_flush_pages, options);
}
EXPORT_SYMBOL_GPL(l1d_flush);
- All architectures have to use TIF_SPEC_FLUSH_L1D if they want to
support the prctl.
So all of the above arch_prctl*() stuff becomes generic and sits in
lib/l1d_flush.c hidden behind CONFIG_FLUSH_L1D_PRCTL.
The only architecture specific bits are in the actual context switch
code.
Hmm?
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 07b4f8131e36..42cb3038c81a 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -238,4 +238,8 @@ struct prctl_mm_map {
> #define PR_SET_IO_FLUSHER 57
> #define PR_GET_IO_FLUSHER 58
>
> +/* Flush L1D on context switch (mm) */
> +#define PR_SET_L1D_FLUSH 59
> +#define PR_GET_L1D_FLUSH 60
> +
> #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/sys.c b/kernel/sys.c
> index d325f3ab624a..578aa8b6d87e 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2262,6 +2262,16 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
> return -EINVAL;
> }
>
> +int __weak arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
> +{
> + return -EINVAL;
> +}
> +
> +int __weak arch_prctl_l1d_flush_get(struct task_struct *t)
> +{
> + return -EINVAL;
> +}
> +
> #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
>
> SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> @@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>
> error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> break;
> + case PR_SET_L1D_FLUSH:
> + if (arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = arch_prctl_l1d_flush_set(me, arg2);
> + break;
> + case PR_GET_L1D_FLUSH:
> + if (arg2 || arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = arch_prctl_l1d_flush_get(me);
> + break;
> default:
> error = -EINVAL;
> break;
The prctl itself looks fine to me.
Thanks,
tglx
On Fri, 2020-04-17 at 16:41 +0200, Thomas Gleixner wrote:
>
> Balbir Singh <[email protected]> writes:
>
> > +static void *l1d_flush_pages;
> > +static DEFINE_MUTEX(l1d_flush_mutex);
> > +
> > +int enable_l1d_flush_for_task(struct task_struct *tsk)
>
> static ?
>
> > +{
> > + struct page *page;
> > + int ret = 0;
> > +
> > + if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
> > + goto done;
> > +
> > + page = READ_ONCE(l1d_flush_pages);
> > + if (unlikely(!page)) {
> > + mutex_lock(&l1d_flush_mutex);
> > + if (!l1d_flush_pages) {
> > + l1d_flush_pages = alloc_l1d_flush_pages();
> > + if (!l1d_flush_pages) {
> > + mutex_unlock(&l1d_flush_mutex);
> > + return -ENOMEM;
> > + }
> > + }
> > + mutex_unlock(&l1d_flush_mutex);
> > + }
>
> So this is +/- the mutex the same code as KVM has. Why is this not moved
> into l1d_flush.c, i.e.
>
> static void *l1d_flush_pages;
> static DEFINE_MUTEX(l1d_flush_mutex);
>
> int l1d_flush_init(void)
> {
> int ret;
>
> if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
> return 0;
>
> mutex_lock(&l1d_flush_mutex);
> if (!l1d_flush_pages)
> l1d_flush_pages = l1d_flush_alloc_pages();
> ret = l1d_flush_pages ? 0 : -ENOMEM;
> mutex_unlock(&l1d_flush_mutex);
> return ret;
> }
> EXPORT_SYMBOL_GPL(l1d_flush_init);
>
> which removes the export of l1d_flush_alloc_pages() and gets rid of the
> cleanup counterpart. In a real world deployment unloading of VMX if used
> once is unlikely and with the task based one you end up with these pages
> 'leaked' anyway if used once.
>
I don't want the patches to be enforce that one cannot unload the kvm module,
but I can refactor those bits a bit more
> Can we please make this simple and consistent?
>
> > +done:
> > + set_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> > + return ret;
> > +}
> > +
> > +int disable_l1d_flush_for_task(struct task_struct *tsk)
>
> static void?
>
> > +{
> > + clear_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> > + return 0;
> > +}
> > +
> > +int arch_prctl_l1d_flush_get(struct task_struct *tsk)
> > +{
> > + return test_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
> > +}
> > +
> > +int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long
> > enable)
> > +{
> > + if (enable)
> > + return enable_l1d_flush_for_task(tsk);
> > + return disable_l1d_flush_for_task(tsk);
> > +}
>
> If any other architecture enables this, then it will have _ALL_ of this
> code duplicated. So we should rather have:
But that is being a bit prescriptive to arch's to implement their L1D flushing
using TIF flags, arch's should be free to use bits in struct_mm for their arch
if they feel so.
>
> - CONFIG_FLUSH_L1D which gets selected by features requesting
> it, i.e. X86 + VMX
>
> - CONFIG_FLUSH_L1D_PRCTL which gets selected by architectures
> supporting it, i.e. X86. This selects CONFIG_FLUSH_L1D and enables
> the prctl logic.
>
> - arch/xxx/include/asm/l1d_flush.h which has for x86:
>
> #include <linux/l1d_flush.h>
> #include <asm/thread_info.h>
>
> #define L1D_CACHE_ORDER 4
>
> static inline bool arch_has_l1d_flush_hw(void)
> {
> return static_cpu_has(X86_FEATURE_FLUSH_L1D);
> }
>
> // This is to make the allocation function conditional or if an
> // architecture knows upfront compile time optimized.
>
> static inline void arch_l1d_flush(void *pages, unsigned long options)
> {
> if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> ...
> return;
> }
>
> if (options & POPULATE_TLB)
> l1d_flush_populate_tlb(pages);
> l1d_flush_sw(pages);
> }
>
> // The option bits go into linux/l1d_flush.h and the asm header has
> // exactly one file which includes it: lib/l1d_flush.c
> //
> // All other places (VMX, arch context switch code) include
> // linux/l1d_flush.h which also contains the prototypes for
> // l1d_flush_init() and l1d_flush().
>
> - Have l1d_flush_init() and the alloc function in lib/l1d_flush.c
>
> - The flush invocation in lib/l1d_flush.c:
>
> void l1d_flush(unsigned long options)
> {
> arch_l1d_flush(l1d_flush_pages, options);
> }
> EXPORT_SYMBOL_GPL(l1d_flush);
>
> - All architectures have to use TIF_SPEC_FLUSH_L1D if they want to
> support the prctl.
>
That is a concern (see above), should we enforce this?
> So all of the above arch_prctl*() stuff becomes generic and sits in
> lib/l1d_flush.c hidden behind CONFIG_FLUSH_L1D_PRCTL.
>
> The only architecture specific bits are in the actual context switch
> code.
>
> Hmm?
>
Let me revisit the code and see if I can refactor it.
Balbir
> > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> > index 07b4f8131e36..42cb3038c81a 100644
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -238,4 +238,8 @@ struct prctl_mm_map {
> > #define PR_SET_IO_FLUSHER 57
> > #define PR_GET_IO_FLUSHER 58
> >
> > +/* Flush L1D on context switch (mm) */
> > +#define PR_SET_L1D_FLUSH 59
> > +#define PR_GET_L1D_FLUSH 60
> > +
> > #endif /* _LINUX_PRCTL_H */
> > diff --git a/kernel/sys.c b/kernel/sys.c
> > index d325f3ab624a..578aa8b6d87e 100644
> > --- a/kernel/sys.c
> > +++ b/kernel/sys.c
> > @@ -2262,6 +2262,16 @@ int __weak arch_prctl_spec_ctrl_set(struct
> > task_struct *t, unsigned long which,
> > return -EINVAL;
> > }
> >
> > +int __weak arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned
> > long enable)
> > +{
> > + return -EINVAL;
> > +}
> > +
> > +int __weak arch_prctl_l1d_flush_get(struct task_struct *t)
> > +{
> > + return -EINVAL;
> > +}
> > +
> > #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
> >
> > SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long,
> > arg3,
> > @@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long,
> > arg2, unsigned long, arg3,
> >
> > error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> > break;
> > + case PR_SET_L1D_FLUSH:
> > + if (arg3 || arg4 || arg5)
> > + return -EINVAL;
> > + error = arch_prctl_l1d_flush_set(me, arg2);
> > + break;
> > + case PR_GET_L1D_FLUSH:
> > + if (arg2 || arg3 || arg4 || arg5)
> > + return -EINVAL;
> > + error = arch_prctl_l1d_flush_get(me);
> > + break;
> > default:
> > error = -EINVAL;
> > break;
>
> The prctl itself looks fine to me.
>
> Thanks,
>
> tglx
>
Thanks,
Balbir
"Singh, Balbir" <[email protected]> writes:
> On Fri, 2020-04-17 at 16:41 +0200, Thomas Gleixner wrote:
>> Balbir Singh <[email protected]> writes:
>> static void *l1d_flush_pages;
>> static DEFINE_MUTEX(l1d_flush_mutex);
>>
>> int l1d_flush_init(void)
>> {
>> int ret;
>>
>> if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
>> return 0;
>>
>> mutex_lock(&l1d_flush_mutex);
>> if (!l1d_flush_pages)
>> l1d_flush_pages = l1d_flush_alloc_pages();
>> ret = l1d_flush_pages ? 0 : -ENOMEM;
>> mutex_unlock(&l1d_flush_mutex);
>> return ret;
>> }
>> EXPORT_SYMBOL_GPL(l1d_flush_init);
>>
>> which removes the export of l1d_flush_alloc_pages() and gets rid of the
>> cleanup counterpart. In a real world deployment unloading of VMX if used
>> once is unlikely and with the task based one you end up with these pages
>> 'leaked' anyway if used once.
>>
> I don't want the patches to be enforce that one cannot unload the kvm module,
> but I can refactor those bits a bit more
Not freeing the l1d flush pages does not prevent unloading the kvm
module. It just keeps the around. It's the same problem with your L1D
flush for tasks. If one tasks uses it then the pages stay around until
the system reboots.
>> If any other architecture enables this, then it will have _ALL_ of this
>> code duplicated. So we should rather have:
>
> But that is being a bit prescriptive to arch's to implement their L1D flushing
> using TIF flags, arch's should be free to use bits in struct_mm for their arch
> if they feel so.
>> - All architectures have to use TIF_SPEC_FLUSH_L1D if they want to
>> support the prctl.
>>
>
> That is a concern (see above), should we enforce this?
Fair enough, but it's trivial enough to have:
static inline void arch_task_l1d_flush_update(bool enable)
static inline bool arch_task_l1d_flush_state(void)
and the rest of the logic is just identical.
Thanks,
tglx
On Sat, 2020-04-18 at 12:17 +0200, Thomas Gleixner wrote:
>
>
> "Singh, Balbir" <[email protected]> writes:
> > On Fri, 2020-04-17 at 16:41 +0200, Thomas Gleixner wrote:
> > > Balbir Singh <[email protected]> writes:
> > > static void *l1d_flush_pages;
> > > static DEFINE_MUTEX(l1d_flush_mutex);
> > >
> > > int l1d_flush_init(void)
> > > {
> > > int ret;
> > >
> > > if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
> > > return 0;
> > >
> > > mutex_lock(&l1d_flush_mutex);
> > > if (!l1d_flush_pages)
> > > l1d_flush_pages = l1d_flush_alloc_pages();
> > > ret = l1d_flush_pages ? 0 : -ENOMEM;
> > > mutex_unlock(&l1d_flush_mutex);
> > > return ret;
> > > }
> > > EXPORT_SYMBOL_GPL(l1d_flush_init);
> > >
> > > which removes the export of l1d_flush_alloc_pages() and gets rid of the
> > > cleanup counterpart. In a real world deployment unloading of VMX if used
> > > once is unlikely and with the task based one you end up with these pages
> > > 'leaked' anyway if used once.
> > >
> >
> > I don't want the patches to be enforce that one cannot unload the kvm
> > module,
> > but I can refactor those bits a bit more
>
> Not freeing the l1d flush pages does not prevent unloading the kvm
> module. It just keeps the around. It's the same problem with your L1D
> flush for tasks. If one tasks uses it then the pages stay around until
> the system reboots.
Yes, Fair enough, you also seem to suggest that the same set of pages can be
shared across VMX and L1D flushes (which is fine by me), I suspect at some
point we'd need to to per NUMA node allocations, but lets not prematurely
optimize.
>
> > > If any other architecture enables this, then it will have _ALL_ of this
> > > code duplicated. So we should rather have:
> >
> > But that is being a bit prescriptive to arch's to implement their L1D
> > flushing
> > using TIF flags, arch's should be free to use bits in struct_mm for their
> > arch
> > if they feel so.
> > > - All architectures have to use TIF_SPEC_FLUSH_L1D if they want to
> > > support the prctl.
> > >
> >
> > That is a concern (see above), should we enforce this?
>
> Fair enough, but it's trivial enough to have:
>
> static inline void arch_task_l1d_flush_update(bool enable)
> static inline bool arch_task_l1d_flush_state(void)
>
> and the rest of the logic is just identical.
>
OK, so you'd still like to see the logic move to lib/l1d_flush.c? Let me get
working on that and see what the changes look like
Thanks for the review,
Balbir Singh