Provide a mechanism to flush the L1D cache on context switch. The goal
is to allow tasks that are paranoid due to the recent snoop assisted data
sampling vulnerabilites, to flush their L1D on being switched out.
This protects their data from being snooped or leaked via side channels
after the task has context switched out.
Changelog v5:
- Based on Tom's recommendation, restrict the patches to Intel CPUs
only ([email protected])
- Update reviewed-by tags based on v4.
Changelog v4:
- Refactor the L1D flushing code even further, pages are now allocated
once and never freed. Simplify the exported functions.
- Change the name prefixs to be more consistent (l1d_flush_*)
- Refactoring of the code done in the spirit of the comments, prctl
still requires arch bits for get/set L1D flush and ofcourse in
the arch switch_mm bits flushing the L1D cache.
Changelog v3:
- Refactor the return value of what flush_l1d_cache_hw() returns
- Refactor the code, so that the generic setup bits come first
(patch 3 from previous posting is now patches 3 and 4)
- Move from arch_prctl() to the prctl() interface as recommend
in the reviews.
Changelog v2:
- Fix a miss of mutex_unlock (caught by Borislav Petkov <[email protected]>)
- Add documentation about the changes (Josh Poimboeuf
<[email protected]>)
Changelog:
- Refactor the code and reuse cond_ibpb() - code bits provided by tglx
- Merge mm state tracking for ibpb and l1d flush
- Rename TIF_L1D_FLUSH to TIF_SPEC_FLUSH_L1D
Changelog RFC:
- Reuse existing code for allocation and flush
- Simplify the goto logic in the actual l1d_flush function
- Optimize the code path with jump labels/static functions
The previous version of these patches are posted at:
https://lore.kernel.org/lkml/[email protected]/
Balbir Singh (6):
arch/x86/kvm: Refactor l1d flush lifecycle management
arch/x86/kvm: Refactor tlbflush and l1d flush
arch/x86/mm: Refactor cond_ibpb() to support other use cases
arch/x86/kvm: Refactor L1D flushing
Optionally flush L1D on context switch
Documentation: Add L1D flushing Documentation
Documentation/admin-guide/hw-vuln/index.rst | 1 +
.../admin-guide/hw-vuln/l1d_flush.rst | 40 ++++++
arch/x86/include/asm/cacheflush.h | 8 ++
arch/x86/include/asm/thread_info.h | 7 +-
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/l1d_flush.c | 120 ++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 62 +--------
arch/x86/mm/tlb.c | 83 +++++++++---
include/uapi/linux/prctl.h | 4 +
kernel/sys.c | 20 +++
11 files changed, 266 insertions(+), 82 deletions(-)
create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst
create mode 100644 arch/x86/kernel/l1d_flush.c
--
2.17.1
cond_ibpb() has the necessary bits required to track the
previous mm in switch_mm_irqs_off(). This can be reused for
other use cases like L1D flushing (on context switch out).
Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/mm/tlb.c | 43 +++++++++++++++++----------------
2 files changed, 23 insertions(+), 22 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e0b660..a927d40664df 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,7 +83,7 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};
u16 loaded_mm_asid;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index cf81902e6992..10056b8d8f01 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -43,10 +43,11 @@
*/
/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -345,19 +346,24 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
- return (unsigned long)next->mm | ibpb;
+ return (unsigned long)next->mm | spec_bits;
}
-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigation(struct task_struct *next)
{
+ unsigned long prev_mm, next_mm;
+
if (!next || !next->mm)
return;
+ next_mm = mm_mangle_tif_spec_bits(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
/*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
@@ -368,8 +374,6 @@ static void cond_ibpb(struct task_struct *next)
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -399,20 +403,14 @@ static void cond_ibpb(struct task_struct *next)
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -421,11 +419,12 @@ static void cond_ibpb(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}
#ifdef CONFIG_PERF_EVENTS
@@ -550,8 +549,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
+ * The hook can also be used for mitigations that rely
+ * on switch_mm for hooks.
*/
- cond_ibpb(tsk);
+ cond_mitigation(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -677,7 +678,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
--
2.17.1