2020-05-10 01:50:13

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 0/6] Optionally flush L1D on context switch

Provide a mechanism to flush the L1D cache on context switch. The goal
is to allow tasks that are paranoid due to the recent snoop assisted data
sampling vulnerabilities, to flush their L1D on being switched out.
This protects their data from being snooped or leaked via side channels
after the task has context switched out.


Changelog v6:
- Fix the complaint about variable shadowing (Reported-by: kbuild test
robot <[email protected]>)
Changelog v5:
- Based on Tom's recommendation, restrict the patches to Intel CPUs
only ([email protected])
- Update reviewed-by tags based on v4.
Changelog v4:
- Refactor the L1D flushing code even further, pages are now allocated
once and never freed. Simplify the exported functions.
- Change the name prefixes to be more consistent (l1d_flush_*)
- Refactoring of the code done in the spirit of the comments, prctl
still requires arch bits for get/set L1D flush and of course in
the arch switch_mm bits flushing the L1D cache.
Changelog v3:
- Refactor the return value of what flush_l1d_cache_hw() returns
- Refactor the code, so that the generic setup bits come first
(patch 3 from previous posting is now patches 3 and 4)
- Move from arch_prctl() to the prctl() interface as recommended
in the reviews.
Changelog v2:
- Fix a miss of mutex_unlock (caught by Borislav Petkov <[email protected]>)
- Add documentation about the changes (Josh Poimboeuf
<[email protected]>)

Changelog:
- Refactor the code and reuse cond_ibpb() - code bits provided by tglx
- Merge mm state tracking for ibpb and l1d flush
- Rename TIF_L1D_FLUSH to TIF_SPEC_FLUSH_L1D

Changelog RFC:
- Reuse existing code for allocation and flush
- Simplify the goto logic in the actual l1d_flush function
- Optimize the code path with jump labels/static functions

The previous version of these patches is posted at:

https://lore.kernel.org/lkml/[email protected]/

Balbir Singh (6):
arch/x86/kvm: Refactor l1d flush lifecycle management
arch/x86/kvm: Refactor tlbflush and l1d flush
arch/x86/mm: Refactor cond_ibpb() to support other use cases
arch/x86/kvm: Refactor L1D flushing
Optionally flush L1D on context switch
Documentation: Add L1D flushing Documentation

Documentation/admin-guide/hw-vuln/index.rst | 1 +
.../admin-guide/hw-vuln/l1d_flush.rst | 40 ++++++
arch/x86/include/asm/cacheflush.h | 8 ++
arch/x86/include/asm/thread_info.h | 7 +-
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/l1d_flush.c | 120 ++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 62 +--------
arch/x86/mm/tlb.c | 83 +++++++++---
include/uapi/linux/prctl.h | 4 +
kernel/sys.c | 20 +++
11 files changed, 266 insertions(+), 82 deletions(-)
create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst
create mode 100644 arch/x86/kernel/l1d_flush.c

--
2.17.1


2020-05-10 01:50:21

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 2/6] arch/x86/kvm: Refactor tlbflush and l1d flush

Refactor the existing assembly bits into smaller helper functions
and also abstract L1D_FLUSH into a helper function. Use these
functions in kvm for L1D flushing.

Reviewed-by: Kees Cook <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/cacheflush.h | 3 ++
arch/x86/kernel/l1d_flush.c | 54 +++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 29 ++---------------
3 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index bac56fcd9790..21cc3b28fa63 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -8,7 +8,10 @@

#define L1D_CACHE_ORDER 4
void clflush_cache_range(void *addr, unsigned int size);
+void l1d_flush_populate_tlb(void *l1d_flush_pages);
void *l1d_flush_alloc_pages(void);
void l1d_flush_cleanup_pages(void *l1d_flush_pages);
+void l1d_flush_sw(void *l1d_flush_pages);
+int l1d_flush_hw(void);

#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index d605878c8f28..5871794f890d 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -34,3 +34,57 @@ void l1d_flush_cleanup_pages(void *l1d_flush_pages)
free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
}
EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
+
+/*
+ * Not all users of l1d flush would want to populate the TLB first
+ * split out the function so that callers can optionally flush the L1D
+ * cache via sw without prefetching the TLB.
+ */
+void l1d_flush_populate_tlb(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(l1d_flush_populate_tlb);
+
+int l1d_flush_hw(void)
+{
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return 0;
+ }
+ return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(l1d_flush_hw);
+
+void l1d_flush_sw(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* Fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ecx");
+}
+EXPORT_SYMBOL_GPL(l1d_flush_sw);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f35654db904a..4f95927aad4c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6031,8 +6031,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
*/
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
/*
* This code is only executed when the the flush mode is 'cond' or
* 'always'
@@ -6061,32 +6059,11 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)

vcpu->stat.l1d_flush++;

- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ if (!l1d_flush_hw())
return;
- }

- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
+ l1d_flush_populate_tlb(vmx_l1d_flush_pages);
+ l1d_flush_sw(vmx_l1d_flush_pages);
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
--
2.17.1

2020-05-10 01:50:23

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 3/6] arch/x86/mm: Refactor cond_ibpb() to support other use cases

cond_ibpb() has the necessary bits required to track the
previous mm in switch_mm_irqs_off(). This can be reused for
other use cases like L1D flushing (on context switch out).

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/mm/tlb.c | 43 +++++++++++++++++----------------
2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e0b660..a927d40664df 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,7 +83,7 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};

u16 loaded_mm_asid;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index cf81902e6992..10056b8d8f01 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -43,10 +43,11 @@
*/

/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)

/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -345,19 +346,24 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}

-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

- return (unsigned long)next->mm | ibpb;
+ return (unsigned long)next->mm | spec_bits;
}

-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigation(struct task_struct *next)
{
+ unsigned long prev_mm, next_mm;
+
if (!next || !next->mm)
return;

+ next_mm = mm_mangle_tif_spec_bits(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
/*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
@@ -368,8 +374,6 @@ static void cond_ibpb(struct task_struct *next)
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -399,20 +403,14 @@ static void cond_ibpb(struct task_struct *next)
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}

if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -421,11 +419,12 @@ static void cond_ibpb(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}

#ifdef CONFIG_PERF_EVENTS
@@ -550,8 +549,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
+ * The hook can also be used for mitigations that rely
+ * on switch_mm for hooks.
*/
- cond_ibpb(tsk);
+ cond_mitigation(tsk);

if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -677,7 +678,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));

/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
--
2.17.1

2020-05-10 01:51:09

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 6/6] Documentation: Add L1D flushing Documentation

Add documentation of l1d flushing, explain the need for the
feature and how it can be used.

Signed-off-by: Balbir Singh <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
---
Documentation/admin-guide/hw-vuln/index.rst | 1 +
.../admin-guide/hw-vuln/l1d_flush.rst | 40 +++++++++++++++++++
2 files changed, 41 insertions(+)
create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst

diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 0795e3c2643f..35633b299d45 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -14,3 +14,4 @@ are configurable at compile, boot or run time.
mds
tsx_async_abort
multihit.rst
+ l1d_flush
diff --git a/Documentation/admin-guide/hw-vuln/l1d_flush.rst b/Documentation/admin-guide/hw-vuln/l1d_flush.rst
new file mode 100644
index 000000000000..7d515b8c29f1
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst
@@ -0,0 +1,40 @@
+L1D Flushing for the paranoid
+=============================
+
+With an increasing number of vulnerabilities being reported around data
+leaks from L1D, a new user space mechanism to flush the L1D cache on
+context switch is added to the kernel. This should help address
+CVE-2020-0550 and for paranoid applications, keep them safe from any
+yet to be discovered vulnerabilities, related to leaks from the L1D
+cache.
+
+Tasks can opt in to this mechanism by using a prctl (implemented only
+for x86 at the moment).
+
+Related CVEs
+------------
+At the present moment, the following CVEs can be addressed by this
+mechanism
+
+ ============= ======================== ==================
+ CVE-2020-0550 Improper Data Forwarding OS related aspects
+ ============= ======================== ==================
+
+Usage Guidelines
+----------------
+Applications can call ``prctl(2)`` with one of these two arguments
+
+1. PR_SET_L1D_FLUSH - flush the L1D cache on context switch (out)
+2. PR_GET_L1D_FLUSH - get the current state of the L1D cache flush, returns 1
+ if set and 0 if not set.
+
+**NOTE**: The feature is disabled by default, applications need to specifically
+opt into the feature to enable it.
+
+Mitigation
+----------
+When PR_SET_L1D_FLUSH is enabled for a task, on switching tasks (when
+the address space changes), a flush of the L1D cache is performed for
+the task when it leaves the CPU. If the underlying CPU supports L1D
+flushing in hardware, the hardware mechanism is used, otherwise a software
+fallback, similar to the mechanism used by L1TF is used.
--
2.17.1

2020-05-10 01:52:40

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 5/6] Optionally flush L1D on context switch

Implement a mechanism to selectively flush the L1D cache. The goal is to
allow tasks that are paranoid due to the recent snoop assisted data sampling
vulnerabilities, to flush their L1D on being switched out. This protects
their data from being snooped or leaked via side channels after the task
has context switched out.

There are two scenarios we might want to protect against, a task leaving
the CPU with data still in L1D (which is the main concern of this patch),
the second scenario is a malicious task coming in (not so well trusted)
for which we want to clean up the cache before it starts. Only the case
for the former is addressed.

A new thread_info flag TIF_SPEC_FLUSH_L1D is added to track tasks which
opt-into L1D flushing. cpu_tlbstate.last_user_mm_spec is used to convert
the TIF flags into mm state (per cpu via last_user_mm_spec) in
cond_mitigation(), which is then used to decide when to call flush_l1d().

Add prctl()'s to opt in to flushing the L1D cache on context switch out.
The existing mechanism of tracking prev_mm via cpu_tlbstate is
reused to track the state of the tasks and to flush the L1D cache.
The prctl interface is generic and can be ported over to other
architectures.

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
---
arch/x86/include/asm/thread_info.h | 7 ++++-
arch/x86/mm/tlb.c | 44 ++++++++++++++++++++++++++++--
include/uapi/linux/prctl.h | 4 +++
kernel/sys.c | 20 ++++++++++++++
4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8de8ceccb8bc..67de693d9ba1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -84,7 +84,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
+#define TIF_SPEC_FLUSH_L1D 10 /* Flush L1D on mm switches (processes) */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -96,6 +96,7 @@ struct thread_info {
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -132,6 +133,7 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
#define _TIF_FSCHECK (1 << TIF_FSCHECK)
+#define _TIF_SPEC_FLUSH_L1D (1 << TIF_SPEC_FLUSH_L1D)

/* Work to do before invoking the actual syscall. */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -235,6 +237,9 @@ static inline int arch_within_stack_frames(const void * const stack,
current_thread_info()->status & TS_COMPAT)
#endif

+extern int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable);
+extern int arch_prctl_l1d_flush_get(struct task_struct *tsk);
+
extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 10056b8d8f01..7ea9bc9e089f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
+#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>

@@ -43,11 +44,12 @@
*/

/*
- * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
* stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
-#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
+#define LAST_USER_MM_L1D_FLUSH 0x2UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -308,6 +310,35 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);

+static int enable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ int ret = l1d_flush_init_once();
+
+ if (ret < 0)
+ return ret;
+
+ set_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+ return ret;
+}
+
+static int disable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ clear_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+ return 0;
+}
+
+int arch_prctl_l1d_flush_get(struct task_struct *tsk)
+{
+ return test_ti_thread_flag(&tsk->thread_info, TIF_SPEC_FLUSH_L1D);
+}
+
+int arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
+{
+ if (enable)
+ return enable_l1d_flush_for_task(tsk);
+ return disable_l1d_flush_for_task(tsk);
+}
+
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -424,6 +455,9 @@ static void cond_mitigation(struct task_struct *next)
indirect_branch_prediction_barrier();
}

+ if (prev_mm & LAST_USER_MM_L1D_FLUSH)
+ arch_l1d_flush(0); /* Just flush, don't populate the TLB */
+
this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}

@@ -678,6 +712,12 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));

/* Reinitialize tlbstate. */
+
+ /*
+ * Leave last_user_mm_spec at LAST_USER_MM_IBPB, we don't
+ * want to set LAST_USER_MM_L1D_FLUSH and force a flush before
+ * we've allocated the flush pages.
+ */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 07b4f8131e36..42cb3038c81a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -238,4 +238,8 @@ struct prctl_mm_map {
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58

+/* Flush L1D on context switch (mm) */
+#define PR_SET_L1D_FLUSH 59
+#define PR_GET_L1D_FLUSH 60
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..578aa8b6d87e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2262,6 +2262,16 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}

+int __weak arch_prctl_l1d_flush_set(struct task_struct *tsk, unsigned long enable)
+{
+ return -EINVAL;
+}
+
+int __weak arch_prctl_l1d_flush_get(struct task_struct *t)
+{
+ return -EINVAL;
+}
+
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)

SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,

error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_L1D_FLUSH:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_l1d_flush_set(me, arg2);
+ break;
+ case PR_GET_L1D_FLUSH:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_prctl_l1d_flush_get(me);
+ break;
default:
error = -EINVAL;
break;
--
2.17.1

2020-05-10 01:53:55

by Singh, Balbir

[permalink] [raw]
Subject: [PATCH v6 4/6] arch/x86/kvm: Refactor L1D flushing

Move out the initialization function to l1d_flush_init_once()
so that it can be reused for subsequent patches. The side-effect
of this patch is that the memory allocated for l1d flush pages
is no longer freed up and the memory allocated once is shared
amongst callers.

l1d_flush_sw/hw() are now abstracted under arch_l1d_flush().
vmx_l1d_flush_mutex however continues to exist as it is also used
from other code paths.

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/cacheflush.h | 12 +++---
arch/x86/kernel/l1d_flush.c | 64 +++++++++++++++++++++++--------
arch/x86/kvm/vmx/vmx.c | 20 ++--------
3 files changed, 57 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 21cc3b28fa63..851d8f1ab827 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,11 +7,13 @@
#include <asm/special_insns.h>

#define L1D_CACHE_ORDER 4
+
+enum l1d_flush_options {
+ L1D_FLUSH_POPULATE_TLB = 0x1,
+};
+
void clflush_cache_range(void *addr, unsigned int size);
-void l1d_flush_populate_tlb(void *l1d_flush_pages);
-void *l1d_flush_alloc_pages(void);
-void l1d_flush_cleanup_pages(void *l1d_flush_pages);
-void l1d_flush_sw(void *l1d_flush_pages);
-int l1d_flush_hw(void);
+int l1d_flush_init_once(void);
+void arch_l1d_flush(enum l1d_flush_options options);

#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index 5871794f890d..ad66e5fe1565 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -1,10 +1,10 @@
#include <linux/mm.h>
#include <asm/cacheflush.h>

-void *l1d_flush_alloc_pages(void)
+static void *l1d_flush_alloc_pages(void)
{
struct page *page;
- void *l1d_flush_pages = NULL;
+ void *flush_pages = NULL;
int i;

/*
@@ -14,7 +14,7 @@ void *l1d_flush_alloc_pages(void)
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return NULL;
- l1d_flush_pages = page_address(page);
+ flush_pages = page_address(page);

/*
* Initialize each page with a different pattern in
@@ -22,25 +22,19 @@ void *l1d_flush_alloc_pages(void)
* virtualization case.
*/
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
- memset(l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ memset(flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
- return l1d_flush_pages;
+ return flush_pages;
}
-EXPORT_SYMBOL_GPL(l1d_flush_alloc_pages);

-void l1d_flush_cleanup_pages(void *l1d_flush_pages)
-{
- free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
-}
-EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);

/*
* Not all users of l1d flush would want to populate the TLB first
* split out the function so that callers can optionally flush the L1D
* cache via sw without prefetching the TLB.
*/
-void l1d_flush_populate_tlb(void *l1d_flush_pages)
+static void l1d_flush_populate_tlb(void *l1d_flush_pages)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;

@@ -58,9 +52,8 @@ void l1d_flush_populate_tlb(void *l1d_flush_pages)
[size] "r" (size)
: "eax", "ebx", "ecx", "edx");
}
-EXPORT_SYMBOL_GPL(l1d_flush_populate_tlb);

-int l1d_flush_hw(void)
+static int l1d_flush_hw(void)
{
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
@@ -68,9 +61,8 @@ int l1d_flush_hw(void)
}
return -ENOTSUPP;
}
-EXPORT_SYMBOL_GPL(l1d_flush_hw);

-void l1d_flush_sw(void *l1d_flush_pages)
+static void l1d_flush_sw(void *l1d_flush_pages)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;

@@ -87,4 +79,42 @@ void l1d_flush_sw(void *l1d_flush_pages)
[size] "r" (size)
: "eax", "ecx");
}
-EXPORT_SYMBOL_GPL(l1d_flush_sw);
+
+static void *l1d_flush_pages;
+static DEFINE_MUTEX(l1d_flush_mutex);
+
+/*
+ * Initialize and setup L1D flush once, each caller will reuse the
+ * l1d_flush_pages for flushing, no per CPU allocations or NUMA aware
+ * allocations at the moment.
+ */
+int l1d_flush_init_once(void)
+{
+ int ret = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return -ENOTSUPP;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
+ return ret;
+
+ mutex_lock(&l1d_flush_mutex);
+ if (!l1d_flush_pages)
+ l1d_flush_pages = l1d_flush_alloc_pages();
+ ret = l1d_flush_pages ? 0 : -ENOMEM;
+ mutex_unlock(&l1d_flush_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(l1d_flush_init_once);
+
+void arch_l1d_flush(enum l1d_flush_options options)
+{
+ if (!l1d_flush_hw())
+ return;
+
+ if (options & L1D_FLUSH_POPULATE_TLB)
+ l1d_flush_populate_tlb(l1d_flush_pages);
+
+ l1d_flush_sw(l1d_flush_pages);
+}
+EXPORT_SYMBOL_GPL(arch_l1d_flush);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4f95927aad4c..d56702578588 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -203,8 +203,6 @@ static const struct {
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

-static void *vmx_l1d_flush_pages;
-
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
@@ -247,12 +245,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}

- if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
- !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- vmx_l1d_flush_pages = l1d_flush_alloc_pages();
- if (!vmx_l1d_flush_pages)
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ if (l1d_flush_init_once())
return -ENOMEM;
- }

l1tf_vmx_mitigation = l1tf;

@@ -6058,12 +6053,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
}

vcpu->stat.l1d_flush++;
-
- if (!l1d_flush_hw())
- return;
-
- l1d_flush_populate_tlb(vmx_l1d_flush_pages);
- l1d_flush_sw(vmx_l1d_flush_pages);
+ arch_l1d_flush(L1D_FLUSH_POPULATE_TLB);
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -8056,10 +8046,6 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {

static void vmx_cleanup_l1d_flush(void)
{
- if (vmx_l1d_flush_pages) {
- l1d_flush_cleanup_pages(vmx_l1d_flush_pages);
- vmx_l1d_flush_pages = NULL;
- }
/* Restore state so sysfs ignores VMX */
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
--
2.17.1

2020-05-13 13:38:19

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 6/6] Documentation: Add L1D flushing Documentation

Balbir Singh <[email protected]> writes:
> +With an increasing number of vulnerabilities being reported around data
> +leaks from L1D, a new user space mechanism to flush the L1D cache on
> +context switch is added to the kernel. This should help address

is added to the kernel? This is documentation of an existing feature...

> +Mitigation
> +----------
> +When PR_SET_L1D_FLUSH is enabled for a task, on switching tasks (when
> +the address space changes), a flush of the L1D cache is performed for
> +the task when it leaves the CPU. If the underlying CPU supports L1D
> +flushing in hardware, the hardware mechanism is used, otherwise a software
> +fallback, similar to the mechanism used by L1TF is used.

This lacks documentation of the limitations, especially that this does
not help against cross Hyperthread attacks.

I've massaged the whole thing a bit. See below.

Thanks,

tglx
8<-----------------

--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -14,3 +14,4 @@ are configurable at compile, boot or run
mds
tsx_async_abort
multihit.rst
+ l1d_flush
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst
@@ -0,0 +1,53 @@
+L1D Flushing for the paranoid
+=============================
+
+With an increasing number of vulnerabilities being reported around data
+leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in
+mechanism to flush the L1D cache on context switch.
+
+This mechanism can be used to address e.g. CVE-2020-0550. For paranoid
+applications the mechanism keeps them safe from any yet to be discovered
+vulnerabilities, related to leaks from the L1D cache.
+
+
+Related CVEs
+------------
+At the present moment, the following CVEs can be addressed by this
+mechanism
+
+ ============= ======================== ==================
+ CVE-2020-0550 Improper Data Forwarding OS related aspects
+ ============= ======================== ==================
+
+Usage Guidelines
+----------------
+Applications can call ``prctl(2)`` with one of these two arguments
+
+1. PR_SET_L1D_FLUSH - flush the L1D cache on context switch (out)
+2. PR_GET_L1D_FLUSH - get the current state of the L1D cache flush, returns 1
+ if set and 0 if not set.
+
+**NOTE**: The feature is disabled by default, applications need to
+specifically opt into the feature to enable it.
+
+Mitigation
+----------
+
+When PR_SET_L1D_FLUSH is enabled for a task a flush of the L1D cache is
+performed when the task is scheduled out and the incoming task belongs to a
+different process and therefore to a different address space.
+
+If the underlying CPU supports L1D flushing in hardware, the hardware
+mechanism is used, otherwise a software fallback, similar to the L1TF
+mitigation, is invoked.
+
+Limitations
+-----------
+
+The mechanism does not mitigate L1D data leaks between tasks belonging to
+different processes which are concurrently executing on sibling threads of
+a physical CPU core when SMT is enabled on the system.
+
+This can be addressed by controlled placement of processes on physical CPU
+cores or by disabling SMT. See the relevant chapter in the L1TF mitigation
+document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.

2020-05-13 15:07:21

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

Balbir Singh <[email protected]> writes:
>
> + if (prev_mm & LAST_USER_MM_L1D_FLUSH)
> + arch_l1d_flush(0); /* Just flush, don't populate the TLB */

Bah. I fundamentally hate tail comments. They are just disturbing the
reading flow. Aside from that, this states the WHAT but not the WHY. And
if you add that explanation then you need more than 20 characters and
end up with

if (prev_mm & LAST_USER_MM_L1D_FLUSH) {
/*
* Proper comment explaining why this is flushing
* without prepopulating the TLB.
*/
arch_l1d_flush(0);
}

anyway. And even for a short comment which fits after the function call
it's way better to have:

if (prev_mm & LAST_USER_MM_L1D_FLUSH) {
/* Short explanation */
arch_l1d_flush(0);
}

Hmm?

> + /*
> + * Leave last_user_mm_spec at LAST_USER_MM_IBPB, we don't
> + * want to set LAST_USER_MM_L1D_FLUSH and force a flush before
> + * we've allocated the flush pages.

Ah here is the comment. I still like the explicit define for the (re)
init.

2020-05-13 15:32:12

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

Balbir Singh <[email protected]> writes:

> Implement a mechanism to selectively flush the L1D cache. The goal is to
> allow tasks that are paranoid due to the recent snoop assisted data sampling
> vulnerabilities, to flush their L1D on being switched out. This protects
> their data from being snooped or leaked via side channels after the task
> has context switched out.
>
> There are two scenarios we might want to protect against, a task leaving
> the CPU with data still in L1D (which is the main concern of this patch),
> the second scenario is a malicious task coming in (not so well trusted)
> for which we want to clean up the cache before it starts. Only the case
> for the former is addressed.
>
> A new thread_info flag TIF_SPEC_FLUSH_L1D is added to track tasks which
> opt-into L1D flushing. cpu_tlbstate.last_user_mm_spec is used to convert
> the TIF flags into mm state (per cpu via last_user_mm_spec) in
> cond_mitigation(), which is then used to decide when to call flush_l1d().
>
> Add prctl()'s to opt-in to flushing the L1D cache on context switch out. The
> existing mechanism of tracking prev_mm via cpu_tlbstate is
> reused to track state of the tasks and to flush the L1D cache.
> The prctl interface is generic and can be ported over to other
> architectures.
>
> Suggested-by: Thomas Gleixner <[email protected]>
> Signed-off-by: Balbir Singh <[email protected]>
> Reviewed-by: Kees Cook <[email protected]>
> ---
> arch/x86/include/asm/thread_info.h | 7 ++++-
> arch/x86/mm/tlb.c | 44 ++++++++++++++++++++++++++++--
> include/uapi/linux/prctl.h | 4 +++
> kernel/sys.c | 20 ++++++++++++++
> 4 files changed, 72 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index 8de8ceccb8bc..67de693d9ba1 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -84,7 +84,7 @@ struct thread_info {
> #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
> #define TIF_SECCOMP 8 /* secure computing */
> #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
> -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
> +#define TIF_SPEC_FLUSH_L1D 10 /* Flush L1D on mm switches (processes) */
> #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
> #define TIF_UPROBE 12 /* breakpointed or singlestepping */
> #define TIF_PATCH_PENDING 13 /* pending live patching update */
> @@ -96,6 +96,7 @@ struct thread_info {
> #define TIF_MEMDIE 20 /* is terminating due to OOM killer */
> #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
> #define TIF_IO_BITMAP 22 /* uses I/O bitmap */
> +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
> #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
> #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
> #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
> @@ -132,6 +133,7 @@ struct thread_info {
> #define _TIF_ADDR32 (1 << TIF_ADDR32)
> #define _TIF_X32 (1 << TIF_X32)
> #define _TIF_FSCHECK (1 << TIF_FSCHECK)
> +#define _TIF_SPEC_FLUSH_L1D (1 << TIF_SPEC_FLUSH_L1D)

Bah. These defines are ordered in the same way as the TIF defines....

> /*
> - * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
> + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
> * stored in cpu_tlb_state.last_user_mm_spec.
> */
> #define LAST_USER_MM_IBPB 0x1UL
> -#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
> +#define LAST_USER_MM_L1D_FLUSH 0x2UL
> +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

You lost

+ BUILD_BUG_ON(TIF_SPEC_FLUSH_L1D != TIF_SPEC_IB + 1);

from the patch I gave you.

2020-05-13 20:46:47

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 3/6] arch/x86/mm: Refactor cond_ibpb() to support other use cases

Balbir Singh <[email protected]> writes:
> @@ -550,8 +549,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> * Avoid user/user BTB poisoning by flushing the branch
> * predictor when switching between processes. This stops
> * one process from doing Spectre-v2 attacks on another.
> + * The hook can also be used for mitigations that rely
> + * on switch_mm for hooks.

The new function name has absolutely nothing to do with IBPB and is
clearly talking about mitigations. So the IBPB comment wants to move and
that extra sentence you bolted on can go away. It's nonsensical word
salad anyway.
>
> /* Reinitialize tlbstate. */
> - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
> + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_IBPB);

There is still no comment why this only needs MM_IBPB. I'll change this
to LAST_USER_MM_INIT and put that define close to the others.

Thanks,

tglx

2020-05-13 20:51:48

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

Balbir Singh <[email protected]> writes:

This part:

> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -238,4 +238,8 @@ struct prctl_mm_map {
> #define PR_SET_IO_FLUSHER 57
> #define PR_GET_IO_FLUSHER 58
>
> +/* Flush L1D on context switch (mm) */
> +#define PR_SET_L1D_FLUSH 59
> +#define PR_GET_L1D_FLUSH 60

...

> @@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>
> error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
> break;
> + case PR_SET_L1D_FLUSH:
> + if (arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = arch_prctl_l1d_flush_set(me, arg2);
> + break;
> + case PR_GET_L1D_FLUSH:
> + if (arg2 || arg3 || arg4 || arg5)
> + return -EINVAL;
> + error = arch_prctl_l1d_flush_get(me);
> + break;
> default:
> error = -EINVAL;
> break;

wants to be split into a separate patch, really. Then we get proper
subject lines with proper subsystem prefixes. This part also lacks a
description in Documentation/userspace-api/ and function prototypes for
the arch_prctl* functions.

But looking at this deeper (yes I should have noticed earlier):

Why do we need yet another PRCTL?

We already have PR_SET_SPECULATION_CTRL/PR_GET_SPECULATION_CTRL. That
L1D flush thingy fits into this category, right?

This makes even more sense if you think about the second use case for
L1D flush, i.e. the flush when an untrusted task comes in. If we ever
want to support that case then this will be imposed by seccomp and then
we'd need yet another interface there.

And for this reason we should also name that current opt-in thingy:
L1D_FLUSH_OUT in the prctl and also for the TIF bits.

Hmm? Kees?

I've applied the first 4 patches to:

git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git x86/mm

so the polishing I did gets preserved and you don't have to resend the
whole pile.

Thanks,

tglx

2020-05-14 01:15:04

by Singh, Balbir

[permalink] [raw]
Subject: Re: [PATCH v6 6/6] Documentation: Add L1D flushing Documentation

On Wed, 2020-05-13 at 15:33 +0200, Thomas Gleixner wrote:
>
>
> Balbir Singh <[email protected]> writes:
> > +With an increasing number of vulnerabilities being reported around
> > data
> > +leaks from L1D, a new user space mechanism to flush the L1D cache
> > on
> > +context switch is added to the kernel. This should help address
>
> is added to the kernel? This is documentation of an existing
> feature...
>

Good catch! Thanks

> > +Mitigation
> > +----------
> > +When PR_SET_L1D_FLUSH is enabled for a task, on switching tasks
> > (when
> > +the address space changes), a flush of the L1D cache is performed
> > for
> > +the task when it leaves the CPU. If the underlying CPU supports L1D
> > +flushing in hardware, the hardware mechanism is used, otherwise a
> > software
> > +fallback, similar to the mechanism used by L1TF is used.
>
> This lacks documentation of the limitations, especially that this does
> not help against cross Hyperthread attacks.
>

Yes, true

> I've massaged the whole thing a bit. See below.
>
> Thanks,
>
> tglx
> 8<-----------------
>
> --- a/Documentation/admin-guide/hw-vuln/index.rst
> +++ b/Documentation/admin-guide/hw-vuln/index.rst
> @@ -14,3 +14,4 @@ are configurable at compile, boot or run
> mds
> tsx_async_abort
> multihit.rst
> + l1d_flush
> --- /dev/null
> +++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst
> @@ -0,0 +1,53 @@
> +L1D Flushing for the paranoid
> +=============================
> +
> +With an increasing number of vulnerabilities being reported around
> data
> +leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in
> +mechanism to flush the L1D cache on context switch.
> +
> +This mechanism can be used to address e.g. CVE-2020-0550. For
> paranoid
> +applications the mechanism keeps them safe from any yet to be
> discovered
> +vulnerabilities, related to leaks from the L1D cache.
> +
> +
> +Related CVEs
> +------------
> +At the present moment, the following CVEs can be addressed by this
> +mechanism
> +
> + ============= ======================== ==================
> + CVE-2020-0550 Improper Data Forwarding OS related aspects
> + ============= ======================== ==================
> +
> +Usage Guidelines
> +----------------
> +Applications can call ``prctl(2)`` with one of these two arguments
> +
> +1. PR_SET_L1D_FLUSH - flush the L1D cache on context switch (out)
> +2. PR_GET_L1D_FLUSH - get the current state of the L1D cache flush,
> returns 1
> + if set and 0 if not set.
> +
> +**NOTE**: The feature is disabled by default, applications need to
> +specifically opt into the feature to enable it.
> +
> +Mitigation
> +----------
> +
> +When PR_SET_L1D_FLUSH is enabled for a task a flush of the L1D cache
> is
> +performed when the task is scheduled out and the incoming task
> belongs to a
> +different process and therefore to a different address space.
> +
> +If the underlying CPU supports L1D flushing in hardware, the hardware
> +mechanism is used, otherwise a software fallback, similar to the L1TF
> +mitigation, is invoked.
> +
> +Limitations
> +-----------
> +
> +The mechanism does not mitigate L1D data leaks between tasks
> belonging to
> +different processes which are concurrently executing on sibling
> threads of
> +a physical CPU core when SMT is enabled on the system.
> +
> +This can be addressed by controlled placement of processes on
> physical CPU
> +cores or by disabling SMT. See the relevant chapter in the L1TF
> mitigation
> +document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.

I like your addition above

Thanks,
Balbir Singh.

2020-05-14 07:45:57

by Singh, Balbir

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

On Wed, 2020-05-13 at 18:16 +0200, Thomas Gleixner wrote:
> Balbir Singh <[email protected]> writes:
>
> This part:
>
> > --- a/include/uapi/linux/prctl.h
> > +++ b/include/uapi/linux/prctl.h
> > @@ -238,4 +238,8 @@ struct prctl_mm_map {
> > #define PR_SET_IO_FLUSHER 57
> > #define PR_GET_IO_FLUSHER 58
> >
> > +/* Flush L1D on context switch (mm) */
> > +#define PR_SET_L1D_FLUSH 59
> > +#define PR_GET_L1D_FLUSH 60
>
> ...
>
> > @@ -2514,6 +2524,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned
> > long, arg2, unsigned long, arg3,
> >
> > error = (current->flags & PR_IO_FLUSHER) ==
> > PR_IO_FLUSHER;
> > break;
> > + case PR_SET_L1D_FLUSH:
> > + if (arg3 || arg4 || arg5)
> > + return -EINVAL;
> > + error = arch_prctl_l1d_flush_set(me, arg2);
> > + break;
> > + case PR_GET_L1D_FLUSH:
> > + if (arg2 || arg3 || arg4 || arg5)
> > + return -EINVAL;
> > + error = arch_prctl_l1d_flush_get(me);
> > + break;
> > default:
> > error = -EINVAL;
> > break;
>
> wants to be split into a separate patch, really. Then we get proper
> subject lines with proper subsystem prefixes. This part also lacks a
> description in Documentation/userspace-api/ and function prototypes
> for
> the arch_prctl* functions.
>
> But looking at this deeper (yes I should have noticed earlier):
>
> Why do we need yet another PRCTL?
>
> We already have PR_SET_SPECULATION_CTRL/PR_GET_SPECULATION_CTRL. That
> L1D flush thingy fits into this category, right?

It does, I thought about it for a while when I was changing the code and
left it aside because, looking at the definition

1 PR_SPEC_ENABLE The speculation feature is enabled,
mitigation is disabled.
2 PR_SPEC_DISABLE The speculation feature is disabled,
mitigation is enabled.

With L1D flush, there is no overriding of the feature as such (as in
enable when the mitigation is disabled and vice-versa). I am happy to
reconsider my initial thought though.


>
> This makes even more sense if you think about the second use case for
> L1D flush, i.e. the flush when an untrusted task comes in. If we ever
> want to support that case then this will be imposed by seccomp and
> then
> we'd need yet another interface there.
>

Yep, I see what you mean


> And for this reason we should also name that current opt-in thingy:
> L1D_FLUSH_OUT in the prctl and also for the TIF bits.
>
> Hmm? Kees?
>
> I've applied the first 4 patches to:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git x86/mm
>
> so the polishing I did gets preserved and you don't have to resend the
> whole pile.
>

Thanks, I think your change to patch 6 makes sense as well. Let me
respin this based on what you think of the argument above

Balbir

2020-05-14 08:25:29

by Singh, Balbir

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

On Wed, 2020-05-13 at 17:04 +0200, Thomas Gleixner wrote:
>
>
> Balbir Singh <[email protected]> writes:
> >
> > + if (prev_mm & LAST_USER_MM_L1D_FLUSH)
> > + arch_l1d_flush(0); /* Just flush, don't populate the
> > TLB */
>
> Bah. I fundamentally hate tail comments. They are just disturbing the
> reading flow. Aside of that, this states the WHAT but not the WHY. And
> if you add that explanation then you need more than 20 characters and
> end up with
>
> if (prev_mm & LAST_USER_MM_L1D_FLUSH) {
> /*
> * Proper comment explaining why this is flushing
> * without prepopulating the TLB.
> */
> arch_l1d_flush(0);
> }
>

I added the comment because of the bare 0 argument: a 0 is usually read as
true or false, and I wanted to indicate that we don't populate the TLB. The
reason we don't populate it is simply that I don't think we need to. I am
happy to revisit the placement of the comment.

> anyway. And even for a short comment which fits after the function
> call
> it's way better to have:
>
> if (prev_mm & LAST_USER_MM_L1D_FLUSH) {
> /* Short explanation */
> arch_l1d_flush(0);
> }
>
> Hmm?
>
> > + /*
> > + * Leave last_user_mm_spec at LAST_USER_MM_IBPB, we don't
> > + * want to set LAST_USER_MM_L1D_FLUSH and force a flush before
> > + * we've allocated the flush pages.
>
> Ah here is the comment. I still like the explicit define for the (re)
> init.
>

I saw your tree and it sounds like you fixed it up in there in patch 3.

Balbir Singh.

2020-05-14 11:35:07

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

Balbir,

"Singh, Balbir" <[email protected]> writes:
> On Wed, 2020-05-13 at 18:16 +0200, Thomas Gleixner wrote:
>> Balbir Singh <[email protected]> writes:
>> But looking at this deeper (yes I should have noticed earlier):
>>
>> Why do we need yet another PRCTL?
>>
>> We already have PR_SET_SPECULATION_CTRL/PR_GET_SPECULATION_CTRL. That
>> L1D flush thingy fits into this category, right?
>
> It does, I thought about it for a while when I was changing the code and
> left it aside because, looking at the definition
>
> 1 PR_SPEC_ENABLE The speculation feature is enabled,
> mitigation is disabled.
> 2 PR_SPEC_DISABLE The speculation feature is disabled,
> mitigation is enabled.
>
> With L1D flush, there is no overriding of the feature as such (as in
> enable when the mitigation is disabled and vice-versa). I am happy to
> reconsider my initial thought though.

L1D is always enabled as L1D will be a source of trouble forever :)

Thanks,

tglx

2020-05-14 21:31:53

by Singh, Balbir

[permalink] [raw]
Subject: Re: [PATCH v6 5/6] Optionally flush L1D on context switch

On Wed, 2020-05-13 at 17:27 +0200, Thomas Gleixner wrote:
> CAUTION: This email originated from outside of the organization. Do
> not click links or open attachments unless you can confirm the sender
> and know the content is safe.
>
>
>
> Balbir Singh <[email protected]> writes:
>
> > Implement a mechanism to selectively flush the L1D cache. The goal
> > is to
> > allow tasks that are paranoid due to the recent snoop assisted data
> > sampling
> > vulnerabilities, to flush their L1D on being switched out. This
> > protects
> > their data from being snooped or leaked via side channels after the
> > task
> > has context switched out.
> >
> > There are two scenarios we might want to protect against, a task
> > leaving
> > the CPU with data still in L1D (which is the main concern of this
> > patch),
> > the second scenario is a malicious task coming in (not so well
> > trusted)
> > for which we want to clean up the cache before it starts. Only the
> > case
> > for the former is addressed.
> >
> > A new thread_info flag TIF_SPEC_FLUSH_L1D is added to track tasks
> > which
> > opt-into L1D flushing. cpu_tlbstate.last_user_mm_spec is used to
> > convert
> > the TIF flags into mm state (per cpu via last_user_mm_spec) in
> > cond_mitigation(), which is then used to decide when to call
> > flush_l1d().
> >
> > Add prctl()'s to opt-in to the L1D cache on context switch out, the
> > existing mechanisms of tracking prev_mm via cpu_tlbstate is
> > reused to track state of the tasks and to flush the L1D cache.
> > The prctl interface is generic and can be ported over to other
> > architectures.
> >
> > Suggested-by: Thomas Gleixner <[email protected]>
> > Signed-off-by: Balbir Singh <[email protected]>
> > Reviewed-by: Kees Cook <[email protected]>
> > ---
> > arch/x86/include/asm/thread_info.h | 7 ++++-
> > arch/x86/mm/tlb.c | 44
> > ++++++++++++++++++++++++++++--
> > include/uapi/linux/prctl.h | 4 +++
> > kernel/sys.c | 20 ++++++++++++++
> > 4 files changed, 72 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/thread_info.h
> > b/arch/x86/include/asm/thread_info.h
> > index 8de8ceccb8bc..67de693d9ba1 100644
> > --- a/arch/x86/include/asm/thread_info.h
> > +++ b/arch/x86/include/asm/thread_info.h
> > @@ -84,7 +84,7 @@ struct thread_info {
> > #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
> > #define TIF_SECCOMP 8 /* secure computing */
> > #define TIF_SPEC_IB 9 /* Indirect branch speculation
> > mitigation */
> > -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation
> > MSR update in context switch */
> > +#define TIF_SPEC_FLUSH_L1D 10 /* Flush L1D on mm switches
> > (processes) */
> > #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of
> > userspace return */
> > #define TIF_UPROBE 12 /* breakpointed or
> > singlestepping */
> > #define TIF_PATCH_PENDING 13 /* pending live patching
> > update */
> > @@ -96,6 +96,7 @@ struct thread_info {
> > #define TIF_MEMDIE 20 /* is terminating due to OOM
> > killer */
> > #define TIF_POLLING_NRFLAG 21 /* idle is polling for
> > TIF_NEED_RESCHED */
> > #define TIF_IO_BITMAP 22 /* uses I/O bitmap */
> > +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation
> > MSR update in context switch */
> > #define TIF_FORCED_TF 24 /* true if TF in
> > eflags artificially */
> > #define TIF_BLOCKSTEP 25 /* set when we want
> > DEBUGCTLMSR_BTF */
> > #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu
> > lazily */
> > @@ -132,6 +133,7 @@ struct thread_info {
> > #define _TIF_ADDR32 (1 << TIF_ADDR32)
> > #define _TIF_X32 (1 << TIF_X32)
> > #define _TIF_FSCHECK (1 << TIF_FSCHECK)
> > +#define _TIF_SPEC_FLUSH_L1D (1 << TIF_SPEC_FLUSH_L1D)
>
> Bah. These defines are ordered in the same way as the TIF defines....
>
> > /*
> > - * Bits to mangle the TIF_SPEC_IB state into the mm pointer which
> > is
> > + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
> > * stored in cpu_tlb_state.last_user_mm_spec.
> > */
> > #define LAST_USER_MM_IBPB 0x1UL
> > -#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
> > +#define LAST_USER_MM_L1D_FLUSH 0x2UL
> > +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB |
> > LAST_USER_MM_L1D_FLUSH)
>
> You lost
>
> + BUILD_BUG_ON(TIF_SPEC_FLUSH_L1D != TIF_SPEC_IB + 1);
>
> from patch I gave you.


Oops.. I'll fix up both and redo patch 5/6, by splitting it up, into
interface vs flush bits

Thanks,
Balbir Singh.

Subject: [tip: x86/mm] x86/kvm: Refactor L1D flush operations

The following commit has been merged into the x86/mm branch of tip:

Commit-ID: e3efae20ec69e9a8c9db1ad81b37de629219bbc4
Gitweb: https://git.kernel.org/tip/e3efae20ec69e9a8c9db1ad81b37de629219bbc4
Author: Balbir Singh <[email protected]>
AuthorDate: Sun, 10 May 2020 11:47:59 +10:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 13 May 2020 18:12:19 +02:00

x86/kvm: Refactor L1D flush operations

Move the L1D flush functions into builtin code so they can be reused for
L1D flush on context switch.

Split them up into:
- Hardware L1D flush
- TLB pre-populating of L1D pages for software based flushing
- Software based L1D flush

Adjust the KVM code accordingly.

[ tglx: Massaged changelog ]

Signed-off-by: Balbir Singh <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/include/asm/cacheflush.h | 3 ++-
arch/x86/kernel/l1d_flush.c | 49 ++++++++++++++++++++++++++++++-
arch/x86/kvm/vmx/vmx.c | 29 +-----------------
3 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index bac56fc..21cc3b2 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -8,7 +8,10 @@

#define L1D_CACHE_ORDER 4
void clflush_cache_range(void *addr, unsigned int size);
+void l1d_flush_populate_tlb(void *l1d_flush_pages);
void *l1d_flush_alloc_pages(void);
void l1d_flush_cleanup_pages(void *l1d_flush_pages);
+void l1d_flush_sw(void *l1d_flush_pages);
+int l1d_flush_hw(void);

#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index 4f298b7..32119ee 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -37,3 +37,52 @@ void l1d_flush_cleanup_pages(void *l1d_flush_pages)
free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
}
EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
+
+void l1d_flush_populate_tlb(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(l1d_flush_populate_tlb);
+
+int l1d_flush_hw(void)
+{
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return 0;
+ }
+ return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(l1d_flush_hw);
+
+void l1d_flush_sw(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* Fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ecx");
+}
+EXPORT_SYMBOL_GPL(l1d_flush_sw);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 225aa82..786d161 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5983,8 +5983,6 @@ unexpected_vmexit:
*/
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
/*
* This code is only executed when the the flush mode is 'cond' or
* 'always'
@@ -6013,32 +6011,11 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)

vcpu->stat.l1d_flush++;

- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ if (!l1d_flush_hw())
return;
- }

- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
+ l1d_flush_populate_tlb(vmx_l1d_flush_pages);
+ l1d_flush_sw(vmx_l1d_flush_pages);
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)

Subject: [tip: x86/mm] x86/kvm: Refactor L1D flushing

The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 3f768f0032dbc0657ed7e48f4735a3c4e49e25d7
Gitweb: https://git.kernel.org/tip/3f768f0032dbc0657ed7e48f4735a3c4e49e25d7
Author: Balbir Singh <[email protected]>
AuthorDate: Sun, 10 May 2020 11:48:01 +10:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 13 May 2020 18:12:20 +02:00

x86/kvm: Refactor L1D flushing

Move more L1D flush related code out of KVM/VMX into builtin code to allow
reuse for L1D flushing:

- Move the initialization to l1d_flush_init_once() and remove the
deallocation of the L1D flush pages.

This avoids adding complex refcounting of users (VMX or tasks which
opt into a L1D flush on context switch) for the price of a few pages
potentially wasted when no users are left.

- Unify the flush invocations as arch_l1d_flush() which attempts
hardware flushing and falls back to the software implementation
with the option of prepopulating the TLB entries first.

[ tglx: Massage changelog and add a paranoid check of the flush pages
pointer ]

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/include/asm/cacheflush.h | 12 ++---
arch/x86/kernel/l1d_flush.c | 68 ++++++++++++++++++++++--------
arch/x86/kvm/vmx/vmx.c | 20 +--------
3 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 21cc3b2..851d8f1 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,11 +7,13 @@
#include <asm/special_insns.h>

#define L1D_CACHE_ORDER 4
+
+enum l1d_flush_options {
+ L1D_FLUSH_POPULATE_TLB = 0x1,
+};
+
void clflush_cache_range(void *addr, unsigned int size);
-void l1d_flush_populate_tlb(void *l1d_flush_pages);
-void *l1d_flush_alloc_pages(void);
-void l1d_flush_cleanup_pages(void *l1d_flush_pages);
-void l1d_flush_sw(void *l1d_flush_pages);
-int l1d_flush_hw(void);
+int l1d_flush_init_once(void);
+void arch_l1d_flush(enum l1d_flush_options options);

#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index 32119ee..4662f90 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -4,10 +4,10 @@

#include <asm/cacheflush.h>

-void *l1d_flush_alloc_pages(void)
+static void *l1d_flush_alloc_pages(void)
{
struct page *page;
- void *l1d_flush_pages = NULL;
+ void *flush_pages = NULL;
int i;

/*
@@ -17,7 +17,7 @@ void *l1d_flush_alloc_pages(void)
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return NULL;
- l1d_flush_pages = page_address(page);
+ flush_pages = page_address(page);

/*
* Initialize each page with a different pattern in
@@ -25,20 +25,13 @@ void *l1d_flush_alloc_pages(void)
* virtualization case.
*/
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
- memset(l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ memset(flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
- return l1d_flush_pages;
+ return flush_pages;
}
-EXPORT_SYMBOL_GPL(l1d_flush_alloc_pages);

-void l1d_flush_cleanup_pages(void *l1d_flush_pages)
-{
- free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
-}
-EXPORT_SYMBOL_GPL(l1d_flush_cleanup_pages);
-
-void l1d_flush_populate_tlb(void *l1d_flush_pages)
+static void l1d_flush_populate_tlb(void *l1d_flush_pages)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;

@@ -56,9 +49,8 @@ void l1d_flush_populate_tlb(void *l1d_flush_pages)
[size] "r" (size)
: "eax", "ebx", "ecx", "edx");
}
-EXPORT_SYMBOL_GPL(l1d_flush_populate_tlb);

-int l1d_flush_hw(void)
+static int l1d_flush_hw(void)
{
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
@@ -66,9 +58,8 @@ int l1d_flush_hw(void)
}
return -ENOTSUPP;
}
-EXPORT_SYMBOL_GPL(l1d_flush_hw);

-void l1d_flush_sw(void *l1d_flush_pages)
+static void l1d_flush_sw(void *l1d_flush_pages)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;

@@ -85,4 +76,45 @@ void l1d_flush_sw(void *l1d_flush_pages)
[size] "r" (size)
: "eax", "ecx");
}
-EXPORT_SYMBOL_GPL(l1d_flush_sw);
+
+static void *l1d_flush_pages;
+static DEFINE_MUTEX(l1d_flush_mutex);
+
+/*
+ * Initialize and set up L1D flush once. Each caller reuses the shared
+ * l1d_flush_pages for flushing; there are no per-CPU or NUMA-aware
+ * allocations at the moment.
+ */
+int l1d_flush_init_once(void)
+{
+ int ret = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return -ENOTSUPP;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D) || l1d_flush_pages)
+ return ret;
+
+ mutex_lock(&l1d_flush_mutex);
+ if (!l1d_flush_pages)
+ l1d_flush_pages = l1d_flush_alloc_pages();
+ ret = l1d_flush_pages ? 0 : -ENOMEM;
+ mutex_unlock(&l1d_flush_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(l1d_flush_init_once);
+
+void arch_l1d_flush(enum l1d_flush_options options)
+{
+ if (!l1d_flush_hw())
+ return;
+
+ if (WARN_ON_ONCE(!l1d_flush_pages))
+ return;
+
+ if (options & L1D_FLUSH_POPULATE_TLB)
+ l1d_flush_populate_tlb(l1d_flush_pages);
+
+ l1d_flush_sw(l1d_flush_pages);
+}
+EXPORT_SYMBOL_GPL(arch_l1d_flush);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 786d161..d489234 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -203,8 +203,6 @@ static const struct {
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

-static void *vmx_l1d_flush_pages;
-
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
@@ -247,12 +245,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}

- if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
- !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- vmx_l1d_flush_pages = l1d_flush_alloc_pages();
- if (!vmx_l1d_flush_pages)
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ if (l1d_flush_init_once())
return -ENOMEM;
- }

l1tf_vmx_mitigation = l1tf;

@@ -6010,12 +6005,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
}

vcpu->stat.l1d_flush++;
-
- if (!l1d_flush_hw())
- return;
-
- l1d_flush_populate_tlb(vmx_l1d_flush_pages);
- l1d_flush_sw(vmx_l1d_flush_pages);
+ arch_l1d_flush(L1D_FLUSH_POPULATE_TLB);
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -7983,10 +7973,6 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {

static void vmx_cleanup_l1d_flush(void)
{
- if (vmx_l1d_flush_pages) {
- l1d_flush_cleanup_pages(vmx_l1d_flush_pages);
- vmx_l1d_flush_pages = NULL;
- }
/* Restore state so sysfs ignores VMX */
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

Subject: [tip: x86/mm] x86/mm: Refactor cond_ibpb() to support other use cases

The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 83ce56f712af79eac5f761e6b058359336803500
Gitweb: https://git.kernel.org/tip/83ce56f712af79eac5f761e6b058359336803500
Author: Balbir Singh <[email protected]>
AuthorDate: Sun, 10 May 2020 11:48:00 +10:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 13 May 2020 18:12:20 +02:00

x86/mm: Refactor cond_ibpb() to support other use cases

cond_ibpb() has the necessary bits required to track the previous mm in
switch_mm_irqs_off(). This can be reused for other use cases like L1D
flushing on context switch.

[ tglx: Moved comment, added a separate define for state (re)initialization ]

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/mm/tlb.c | 53 +++++++++++++++++---------------
2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e..a927d40 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,7 +83,7 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};

u16 loaded_mm_asid;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index cf81902..35017a0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -43,10 +43,14 @@
*/

/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlbstate.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
+
+/* Bits to set in last_user_mm_spec when tlbstate is (re)initialized */
+#define LAST_USER_MM_INIT LAST_USER_MM_IBPB

/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -345,20 +349,29 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}

-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

- return (unsigned long)next->mm | ibpb;
+ return (unsigned long)next->mm | spec_bits;
}

-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigation(struct task_struct *next)
{
+ unsigned long prev_mm, next_mm;
+
if (!next || !next->mm)
return;

+ next_mm = mm_mangle_tif_spec_bits(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
/*
+ * Avoid user/user BTB poisoning by flushing the branch predictor
+ * when switching between processes. This stops one process from
+ * doing Spectre-v2 attacks on another.
+ *
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
@@ -368,8 +381,6 @@ static void cond_ibpb(struct task_struct *next)
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -399,20 +410,14 @@ static void cond_ibpb(struct task_struct *next)
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}

if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -421,11 +426,12 @@ static void cond_ibpb(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}

#ifdef CONFIG_PERF_EVENTS
@@ -547,11 +553,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
need_flush = true;
} else {
/*
- * Avoid user/user BTB poisoning by flushing the branch
- * predictor when switching between processes. This stops
- * one process from doing Spectre-v2 attacks on another.
+ * Apply process to process speculation vulnerability
+ * mitigations if applicable.
*/
- cond_ibpb(tsk);
+ cond_mitigation(tsk);

if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -677,7 +682,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));

/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);

Subject: [tip: x86/pti] x86/mm: Refactor cond_ibpb() to support other use cases

The following commit has been merged into the x86/pti branch of tip:

Commit-ID: 81f449985c12b83b91849d94724b803ebf856301
Gitweb: https://git.kernel.org/tip/81f449985c12b83b91849d94724b803ebf856301
Author: Balbir Singh <[email protected]>
AuthorDate: Wed, 29 Jul 2020 10:11:00 +10:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 16 Sep 2020 15:08:02 +02:00

x86/mm: Refactor cond_ibpb() to support other use cases

cond_ibpb() has the necessary bits required to track the previous mm in
switch_mm_irqs_off(). This can be reused for other use cases like L1D
flushing on context switch.

[ tglx: Moved comment, added a separate define for state (re)initialization ]

Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lore.kernel.org/r/[email protected]

---
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/mm/tlb.c | 53 +++++++++++++++++---------------
2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e..a927d40 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -83,7 +83,7 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};

u16 loaded_mm_asid;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0951b47..6bbd758 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -43,10 +43,14 @@
*/

/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlbstate.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB)
+
+/* Bits to set in last_user_mm_spec when tlbstate is (re)initialized */
+#define LAST_USER_MM_INIT LAST_USER_MM_IBPB

/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
@@ -317,20 +321,29 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}

-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static inline unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

- return (unsigned long)next->mm | ibpb;
+ return (unsigned long)next->mm | spec_bits;
}

-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigation(struct task_struct *next)
{
+ unsigned long prev_mm, next_mm;
+
if (!next || !next->mm)
return;

+ next_mm = mm_mangle_tif_spec_bits(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
/*
+ * Avoid user/user BTB poisoning by flushing the branch predictor
+ * when switching between processes. This stops one process from
+ * doing Spectre-v2 attacks on another.
+ *
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
@@ -340,8 +353,6 @@ static void cond_ibpb(struct task_struct *next)
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -371,20 +382,14 @@ static void cond_ibpb(struct task_struct *next)
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}

if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -393,11 +398,12 @@ static void cond_ibpb(struct task_struct *next)
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}

#ifdef CONFIG_PERF_EVENTS
@@ -519,11 +525,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
need_flush = true;
} else {
/*
- * Avoid user/user BTB poisoning by flushing the branch
- * predictor when switching between processes. This stops
- * one process from doing Spectre-v2 attacks on another.
+ * Apply process to process speculation vulnerability
+ * mitigations if applicable.
*/
- cond_ibpb(tsk);
+ cond_mitigation(tsk);

/*
* Stop remote flushes for the previous mm.
@@ -631,7 +636,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));

/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);