This patch series is a continuation of the RFC/PoC to start the discussion on
optionally flushing the L1D cache. The goal is to allow tasks that are paranoid
due to the recent snoop-assisted data sampling vulnerabilities to flush their
L1D on being switched out. This protects their data from being snooped or
leaked via side channels after the task has context switched out.
The points of discussion/review are (with updates):
1. Discuss the use case and the right approach to address this
A. Generally there seems to be consensus that we need this
2. Does an arch prctl allowing for opt-in flushing make sense, would other
arches care about something similar?
A. We definitely build this for x86; we have not heard from any other arch
maintainers. There was a suggestion to make this a prctl and let each
arch implement L1D flushing if needed; there is no arch-agnostic
software L1D flush.
3. There is a fallback software L1D load, similar to what L1TF does, but
we don't prefetch the TLB; is that sufficient?
A. There was no conclusion; I suspect we don't need this
4. Should we consider cleaning up the L1D on arrival of tasks?
A. For now, we think this case is not the priority for this patchset.
In summary, this is an early PoC to start the discussion on the need for
conditional L1D flushing based on the security posture of the
application and the sensitivity of the data it has access to or might
have access to.
Changelog v2:
- Reuse existing code for allocation and flush
- Simplify the goto logic in the actual l1d_flush function
- Optimize the code path with jump labels/static functions
Cc: [email protected]
Balbir Singh (4):
arch/x86/kvm: Refactor l1d flush lifecycle management
arch/x86: Refactor tlbflush and l1d flush
arch/x86: Optionally flush L1D on context switch
arch/x86: L1D flush, optimize the context switch
arch/x86/include/asm/cacheflush.h | 6 ++
arch/x86/include/asm/nospec-branch.h | 2 +
arch/x86/include/asm/thread_info.h | 4 ++
arch/x86/include/asm/tlbflush.h | 6 ++
arch/x86/include/uapi/asm/prctl.h | 3 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/l1d_flush.c | 85 ++++++++++++++++++++++++++++
arch/x86/kernel/process_64.c | 10 +++-
arch/x86/kvm/vmx/vmx.c | 56 +++---------------
arch/x86/mm/tlb.c | 85 ++++++++++++++++++++++++++++
10 files changed, 209 insertions(+), 49 deletions(-)
create mode 100644 arch/x86/kernel/l1d_flush.c
--
2.17.1
This patch implements a mechanism to selectively flush the L1D cache.
The goal is to allow tasks that are paranoid due to the recent
snoop-assisted data sampling vulnerabilities to flush their L1D on being
switched out. This protects their data from being snooped or leaked via
side channels after the task has context switched out.
There are two scenarios we might want to protect against: a task leaving
the CPU with data still in the L1D (which is the main concern of this
patch), and a malicious (not so well trusted) task coming in, for which
we want to clean up the cache before it starts.
This patch adds an arch-specific prctl() to flush the L1D cache on context
switch out; the existing mechanism of tracking prev_mm via cpu_tlbstate
is reused (very similar to the cond_ibpb() changes). The patch has been
lightly tested.
The current version benefited from discussions with Kees and Thomas.
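As an illustration (not part of the patch), a task could opt in from
userspace roughly as follows, using the raw syscall(2) wrapper and the
ARCH_*_L1D_FLUSH values added below:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef ARCH_SET_L1D_FLUSH
#define ARCH_SET_L1D_FLUSH  0x3001
#define ARCH_GET_L1D_FLUSH  0x3002
#endif

int main(void)
{
        /* Opt this task in to L1D flushing on context switch out */
        if (syscall(SYS_arch_prctl, ARCH_SET_L1D_FLUSH, 1))
                perror("arch_prctl(ARCH_SET_L1D_FLUSH)");

        /* Read the current setting back: 1 if enabled, 0 otherwise */
        printf("L1D flush on switch out: %ld\n",
               syscall(SYS_arch_prctl, ARCH_GET_L1D_FLUSH, 0));
        return 0;
}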
Cc: [email protected]
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/thread_info.h | 4 ++
arch/x86/include/asm/tlbflush.h | 6 +++
arch/x86/include/uapi/asm/prctl.h | 3 ++
arch/x86/kernel/process_64.c | 10 ++++-
arch/x86/mm/tlb.c | 72 ++++++++++++++++++++++++++++++
5 files changed, 94 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8de8ceccb8bc..2a626a6a9ea6 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_L1D_FLUSH 23 /* Flush L1D on mm switches (processes) */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -132,6 +133,7 @@ struct thread_info {
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
#define _TIF_FSCHECK (1 << TIF_FSCHECK)
+#define _TIF_L1D_FLUSH (1 << TIF_L1D_FLUSH)
/* Work to do before invoking the actual syscall. */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -239,6 +241,8 @@ extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
extern void arch_setup_new_exec(void);
+extern int enable_l1d_flush_for_task(struct task_struct *tsk);
+extern int disable_l1d_flush_for_task(struct task_struct *tsk);
#define arch_setup_new_exec arch_setup_new_exec
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6f66d841262d..1d535059b358 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -219,6 +219,12 @@ struct tlb_state {
*/
unsigned long cr4;
+ /*
+ * Flush the L1D cache on switch_mm_irqs_off() for a
+ * task getting off the CPU, if it opted in to do so
+ */
+ bool last_user_mm_l1d_flush;
+
/*
* This is a list of all contexts that might exist in the TLB.
* There is one per ASID that we use, and the ASID (what the
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..1361e5e25791 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,7 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_SET_L1D_FLUSH 0x3001
+#define ARCH_GET_L1D_FLUSH 0x3002
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ffd497804dbc..555fa3fd102e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -700,7 +700,15 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
case ARCH_MAP_VDSO_64:
return prctl_map_vdso(&vdso_image_64, arg2);
#endif
-
+ case ARCH_GET_L1D_FLUSH:
+ return test_ti_thread_flag(&task->thread_info, TIF_L1D_FLUSH);
+ case ARCH_SET_L1D_FLUSH: {
+ if (arg2 >= 1)
+ return enable_l1d_flush_for_task(task);
+ else
+ return disable_l1d_flush_for_task(task);
+ break;
+ }
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 66f96f21a7b6..22f96c5f74f0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -13,6 +13,7 @@
#include <asm/mmu_context.h>
#include <asm/nospec-branch.h>
#include <asm/cache.h>
+#include <asm/cacheflush.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
@@ -151,6 +152,74 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
+static void *l1d_flush_pages;
+static DEFINE_MUTEX(l1d_flush_mutex);
+
+int enable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ struct page *page;
+ int ret = 0;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
+ goto done;
+
+ page = READ_ONCE(l1d_flush_pages);
+ if (unlikely(!page)) {
+ mutex_lock(&l1d_flush_mutex);
+ if (!l1d_flush_pages) {
+ l1d_flush_pages = alloc_l1d_flush_pages();
+ if (!l1d_flush_pages)
+ return -ENOMEM;
+ }
+ mutex_unlock(&l1d_flush_mutex);
+ }
+ /* I don't think we need to worry about KSM */
+done:
+ set_ti_thread_flag(&tsk->thread_info, TIF_L1D_FLUSH);
+ return ret;
+}
+
+int disable_l1d_flush_for_task(struct task_struct *tsk)
+{
+ clear_ti_thread_flag(&tsk->thread_info, TIF_L1D_FLUSH);
+ return 0;
+}
+
+/*
+ * Flush the L1D cache for this CPU. We want to do this at switch mm time,
+ * this is a pessimistic security measure and an opt-in for those tasks
+ * that host sensitive information and there are concerns about spills
+ * from fill buffers.
+ */
+static void l1d_flush(struct mm_struct *next, struct task_struct *tsk)
+{
+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+ /*
+ * If we are not really switching mm's, we can just return
+ */
+ if (real_prev == next)
+ return;
+
+ /*
+ * Do we need flushing for by the previous task
+ */
+ if (this_cpu_read(cpu_tlbstate.last_user_mm_l1d_flush) != 0) {
+ if (!flush_l1d_cache_hw())
+ flush_l1d_cache_sw(l1d_flush_pages);
+ this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 0);
+ /* Make sure we clear the values before we set it again */
+ barrier();
+ }
+
+ if (tsk == NULL)
+ return;
+
+ /* We don't need stringent checks as we opt-in/opt-out */
+ if (test_ti_thread_flag(&tsk->thread_info, TIF_L1D_FLUSH))
+ this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 1);
+}
+
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -433,6 +502,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
+ l1d_flush(next, tsk);
+
/* Make sure we write CR3 before loaded_mm. */
barrier();
@@ -503,6 +574,7 @@ void initialize_tlbstate_and_flush(void)
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+ this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
--
2.17.1
Refactor the existing assembly bits into smaller helper functions
and also abstract L1D_FLUSH into a helper function. Use these
functions in kvm for L1D flushing.
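The helpers are intended to be used in a try-hardware-first pattern: the
MSR based flush is attempted and, if the CPU lacks it, the caller falls
back to the software fill (the KVM path below additionally pre-populates
the TLB), roughly:

        if (!flush_l1d_cache_hw())
                flush_l1d_cache_sw(l1d_flush_pages);

where l1d_flush_pages is the caller's buffer from alloc_l1d_flush_pages().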
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/cacheflush.h | 3 ++
arch/x86/kernel/l1d_flush.c | 49 +++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmx.c | 31 ++++---------------
3 files changed, 57 insertions(+), 26 deletions(-)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 6419a4cef0e8..66a46db7aadd 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -10,5 +10,8 @@
void clflush_cache_range(void *addr, unsigned int size);
void *alloc_l1d_flush_pages(void);
void cleanup_l1d_flush_pages(void *l1d_flush_pages);
+void populate_tlb_with_flush_pages(void *l1d_flush_pages);
+void flush_l1d_cache_sw(void *l1d_flush_pages);
+int flush_l1d_cache_hw(void);
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/kernel/l1d_flush.c b/arch/x86/kernel/l1d_flush.c
index 05f375c33423..60499f773046 100644
--- a/arch/x86/kernel/l1d_flush.c
+++ b/arch/x86/kernel/l1d_flush.c
@@ -34,3 +34,52 @@ void cleanup_l1d_flush_pages(void *l1d_flush_pages)
free_pages((unsigned long)l1d_flush_pages, L1D_CACHE_ORDER);
}
EXPORT_SYMBOL_GPL(cleanup_l1d_flush_pages);
+
+void populate_tlb_with_flush_pages(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+EXPORT_SYMBOL_GPL(populate_tlb_with_flush_pages);
+
+int flush_l1d_cache_hw(void)
+{
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(flush_l1d_cache_hw);
+
+void flush_l1d_cache_sw(void *l1d_flush_pages)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ asm volatile(
+ /* Fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ecx");
+}
+EXPORT_SYMBOL_GPL(flush_l1d_cache_sw);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 66d798e1a9d8..9605589bb294 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5963,8 +5963,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu,
*/
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
/*
* This code is only executed when the the flush mode is 'cond' or
* 'always'
@@ -5993,32 +5991,13 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
vcpu->stat.l1d_flush++;
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ if (flush_l1d_cache_hw())
return;
- }
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
+ preempt_disable();
+ populate_tlb_with_flush_pages(vmx_l1d_flush_pages);
+ flush_l1d_cache_sw(vmx_l1d_flush_pages);
+ preempt_enable();
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
--
2.17.1
Use a static branch/jump label to optimize the code. Right now
we don't ref count the users, but that could be done if needed
in the future.
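If ref counting is needed later, the jump label API already supports it; a
possible shape (sketch only, not part of this patch) would be to bump the
key when a task opts in and drop it when the task opts out or exits:

        /* task opts in and was not opted in before */
        static_branch_inc(&switch_mm_l1d_flush);

        /* task opts out or exits and was opted in */
        static_branch_dec(&switch_mm_l1d_flush);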
Signed-off-by: Balbir Singh <[email protected]>
---
arch/x86/include/asm/nospec-branch.h | 2 ++
arch/x86/mm/tlb.c | 13 +++++++++++++
2 files changed, 15 insertions(+)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 07e95dcb40ad..371e28cea1f4 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -310,6 +310,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DECLARE_STATIC_KEY_FALSE(mds_user_clear);
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+DECLARE_STATIC_KEY_FALSE(switch_mm_l1d_flush);
+
#include <asm/segment.h>
/**
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 22f96c5f74f0..bed2b6a5490d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -155,6 +155,11 @@ EXPORT_SYMBOL_GPL(leave_mm);
static void *l1d_flush_pages;
static DEFINE_MUTEX(l1d_flush_mutex);
+/* Flush L1D on switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_l1d_flush);
+EXPORT_SYMBOL_GPL(switch_mm_l1d_flush);
+
+
int enable_l1d_flush_for_task(struct task_struct *tsk)
{
struct page *page;
@@ -170,6 +175,11 @@ int enable_l1d_flush_for_task(struct task_struct *tsk)
l1d_flush_pages = alloc_l1d_flush_pages();
if (!l1d_flush_pages)
return -ENOMEM;
+ /*
+ * We could do more accurate ref counting
+ * if needed
+ */
+ static_branch_enable(&switch_mm_l1d_flush);
}
mutex_unlock(&l1d_flush_mutex);
}
@@ -195,6 +205,9 @@ static void l1d_flush(struct mm_struct *next, struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ if (static_branch_unlikely(&switch_mm_l1d_flush))
+ return;
+
/*
* If we are not really switching mm's, we can just return
*/
--
2.17.1
On Wed, 2020-03-25 at 18:11 +1100, Balbir Singh wrote:
> Use a static branch/jump label to optimize the code. Right now
> we don't ref count the users, but that could be done if needed
> in the future.
>
> Signed-off-by: Balbir Singh <[email protected]>
>
I sent an older version of the patch; here is the updated version.
Balbir Singh
On Wed, 2020-03-25 at 18:10 +1100, Balbir Singh wrote:
> This patch is a continuation of RFC/PoC to start the discussion on
> optionally
> flushing L1D cache. The goal is to allow tasks that are paranoid due to the
> recent snoop assisted data sampling vulnerabilities, to flush their L1D on
> being
> switched out. This protects their data from being snooped or leaked via
> side channels after the task has context switched out.
>
> The points of discussion/review are (with updates):
>
> 1. Discuss the use case and the right approach to address this
> A. Generally there seems to be consensus that we need this
>
> 2. Does an arch prctl allowing for opt-in flushing make sense, would other
> arches care about something similar?
> A. We definitely build this for x86, have not heard from any other arch
> maintainers. There was suggestion to make this a prctl and let each
> arch implement L1D flushing if needed, there is no arch agnostic
> software L1D flush.
>
> 3. There is a fallback software L1D load, similar to what L1TF does, but
> we don't prefetch the TLB, is that sufficient?
> A. There was no conclusion, I suspect we don't need this
>
> 4. Should we consider cleaning up the L1D on arrival of tasks?
> A. For now, we think this case is not the priority for this patchset.
>
> In summary, this is an early PoC to start the discussion on the need for
> conditional L1D flushing based on the security posture of the
> application and the sensitivity of the data it has access to or might
> have access to.
>
> Changelog v2:
> - Reuse existing code for allocation and flush
> - Simplify the goto logic in the actual l1d_flush function
> - Optimize the code path with jump labels/static functions
>
> Cc: [email protected]
>
> Balbir Singh (4):
> arch/x86/kvm: Refactor l1d flush lifecycle management
> arch/x86: Refactor tlbflush and l1d flush
> arch/x86: Optionally flush L1D on context switch
> arch/x86: L1D flush, optimize the context switch
>
Ping, looking for comments and criticism of the approach. I understand that with
the merge window around the corner everyone is busy. There is a bug in the v2
RFC series; I am happy to post a version without the RFC tag for broader testing
and feedback.
I am quite keen to hear feedback on the interface and any concerns with the
arch_prctl() approach.
Balbir Singh.
Balbir Singh <[email protected]> writes:
> This patch implements a mechanism to selectively flush the L1D cache.
git grep 'This patch' Documentation/process/
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 6f66d841262d..1d535059b358 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -219,6 +219,12 @@ struct tlb_state {
> */
> unsigned long cr4;
>
> + /*
> + * Flush the L1D cache on switch_mm_irqs_off() for a
> + * task getting off the CPU, if it opted in to do so
> + */
> + bool last_user_mm_l1d_flush;
...
> +/*
> + * Flush the L1D cache for this CPU. We want to do this at switch mm time,
> + * this is a pessimistic security measure and an opt-in for those tasks
> + * that host sensitive information and there are concerns about spills
> + * from fill buffers.
Color me confused, but how is L1D flush mitigating fill buffer spills
(MFBDS)? The MDS family is mitigated by MD_CLEAR, i.e VERW.
> + */
> +static void l1d_flush(struct mm_struct *next, struct task_struct *tsk)
> +{
> + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> +
> + /*
> + * If we are not really switching mm's, we can just return
> + */
> + if (real_prev == next)
> + return;
Instead of having the same check here, please stick the call into the
corresponding path in switch_mm_irqs_off(), i.e. where we already have
the cond_ibpb() invocation.
> + /*
> + * Do we need flushing for by the previous task
for by? Perhaps:
Did the previous task request L1D flush when it scheduled in?
> + */
> + if (this_cpu_read(cpu_tlbstate.last_user_mm_l1d_flush) != 0) {
This is a bool, so != 0 is pointless.
> + if (!flush_l1d_cache_hw())
> + flush_l1d_cache_sw(l1d_flush_pages);
> + this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 0);
s/0/false/
> + /* Make sure we clear the values before we set it again */
> + barrier();
> + }
> +
> + if (tsk == NULL)
> + return;
> +
> + /* We don't need stringent checks as we opt-in/opt-out */
> + if (test_ti_thread_flag(&tsk->thread_info, TIF_L1D_FLUSH))
> + this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 1);
s/1/true/
That aside looking at the gazillion of conditionals here. That's 4 in
the worst case. So how about extending cond_ibpb() like the below?
Thanks,
tglx
8<---------------------
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -84,7 +84,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
-#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
+#define TIF_SPEC_FLUSH_L1D 10 /* L1D Flush in switch_mm() */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -96,6 +96,7 @@ struct thread_info {
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,7 +172,7 @@ struct tlb_state {
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
- unsigned long last_user_mm_ibpb;
+ unsigned long last_user_mm_spec;
};
u16 loaded_mm_asid;
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -33,10 +33,13 @@
*/
/*
- * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
- * stored in cpu_tlb_state.last_user_mm_ibpb.
+ * Bits mangle the TIF_SPEC_* state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_spec.
*/
#define LAST_USER_MM_IBPB 0x1UL
+#define LAST_USER_MM_FLUSH_L1D 0x2UL
+
+#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_FLUSH_L1D)
/*
* We get here when we do something requiring a TLB invalidation
@@ -189,18 +192,22 @@ static void sync_current_stack_to_mm(str
}
}
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static inline unsigned long mm_mangle_tif_spec(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+ unsigned long bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
+
+ BUILD_BUG_ON(TIF_SPEC_FLUSH_L1D != TIF_SPEC_IB + 1);
- return (unsigned long)next->mm | ibpb;
+ return (unsigned long)next->mm | bits;
}
-static void cond_ibpb(struct task_struct *next)
+static void cond_mitigations(struct task_struct *next)
{
- if (!next || !next->mm)
- return;
+ unsigned long prev_mm, next_mm;
+
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+ next_mm = mm_mangle_tif_spec(next);
/*
* Both, the conditional and the always IBPB mode use the mm
@@ -212,8 +219,6 @@ static void cond_ibpb(struct task_struct
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
- unsigned long prev_mm, next_mm;
-
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
@@ -243,20 +248,14 @@ static void cond_ibpb(struct task_struct
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
- * cpu_tlbstate.last_user_mm_ibpb for comparison.
- */
- next_mm = mm_mangle_tif_spec_ib(next);
- prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
-
- /*
+ * cpu_tlbstate.last_user_mm_spec for comparison.
+ *
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
-
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
@@ -265,11 +264,15 @@ static void cond_ibpb(struct task_struct
* different context than the user space task which ran
* last on this CPU.
*/
- if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+ (unsigned long)next->mm)
indirect_branch_prediction_barrier();
- this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
- }
}
+
+ if (prev_mm & LAST_USER_MM_FLUSH_L1D)
+ flush_l1d();
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -371,11 +374,10 @@ void switch_mm_irqs_off(struct mm_struct
need_flush = true;
} else {
/*
- * Avoid user/user BTB poisoning by flushing the branch
- * predictor when switching between processes. This stops
- * one process from doing Spectre-v2 attacks on another.
+ * Speculation vulnerability mitigations when switching
+ * to a different user space process.
*/
- cond_ibpb(tsk);
+ cond_mitigations(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -501,7 +503,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_SPEC_MASK);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
On Tue, 2020-03-31 at 20:34 +0200, Thomas Gleixner wrote:
>
> Balbir Singh <[email protected]> writes:
>
> > This patch implements a mechanism to selectively flush the L1D cache.
>
> git grep 'This patch' Documentation/process/
>
I'll get more imperative, thanks!
> > diff --git a/arch/x86/include/asm/tlbflush.h
> > b/arch/x86/include/asm/tlbflush.h
> > index 6f66d841262d..1d535059b358 100644
> > --- a/arch/x86/include/asm/tlbflush.h
> > +++ b/arch/x86/include/asm/tlbflush.h
> > @@ -219,6 +219,12 @@ struct tlb_state {
> > */
> > unsigned long cr4;
> >
> > + /*
> > + * Flush the L1D cache on switch_mm_irqs_off() for a
> > + * task getting off the CPU, if it opted in to do so
> > + */
> > + bool last_user_mm_l1d_flush;
>
> ...
>
> > +/*
> > + * Flush the L1D cache for this CPU. We want to do this at switch mm time,
> > + * this is a pessimistic security measure and an opt-in for those tasks
> > + * that host sensitive information and there are concerns about spills
> > + * from fill buffers.
>
> Color me confused, but how is L1D flush mitigating fill buffer spills
> (MFBDS)? The MDS family is mitigated by MD_CLEAR, i.e VERW.
I should reword that sentence to say snoop from L1D
>
> > + */
> > +static void l1d_flush(struct mm_struct *next, struct task_struct *tsk)
> > +{
> > + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
> > +
> > + /*
> > + * If we are not really switching mm's, we can just return
> > + */
> > + if (real_prev == next)
> > + return;
>
> Instead of having the same check here, please stick the call into the
> corresponding path in switch_mm_irqs_off(), i.e. where we already have
> the cond_ibpb() invocation.
>
Sure, will do
> > + /*
> > + * Do we need flushing for by the previous task
>
> for by? Perhaps:
I'll fix that comment up
>
> Did the previous task request L1D flush when it scheduled in?
>
> > + */
> > + if (this_cpu_read(cpu_tlbstate.last_user_mm_l1d_flush) != 0) {
>
> This is a bool, so != 0 is pointless.
>
> > + if (!flush_l1d_cache_hw())
> > + flush_l1d_cache_sw(l1d_flush_pages);
> > + this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 0);
>
> s/0/false/
>
Will do
> > + /* Make sure we clear the values before we set it again */
> > + barrier();
> > + }
> > +
> > + if (tsk == NULL)
> > + return;
> > +
> > + /* We don't need stringent checks as we opt-in/opt-out */
> > + if (test_ti_thread_flag(&tsk->thread_info, TIF_L1D_FLUSH))
> > + this_cpu_write(cpu_tlbstate.last_user_mm_l1d_flush, 1);
>
> s/1/true/
>
> That aside looking at the gazillion of conditionals here. That's 4 in
> the worst case. So how about extending cond_ibpb() like the below?
>
> Thanks,
>
> tglx
>
Makes sense, it mostly looks good! Let me refactor the comments and code.
Balbir