2020-10-20 09:00:53

by Joel Fernandes

Subject: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

Core-scheduling prevents hyperthreads in usermode from attacking each
other, but it does not do anything about one of the hyperthreads
entering the kernel for any reason. This leaves the door open for MDS
and L1TF attacks with concurrent execution sequences between
hyperthreads.

This patch therefore adds support for protecting all syscall and IRQ
kernel mode entries. Care is taken to track the outermost usermode exit
and entry using per-cpu counters. In cases where one of the hyperthreads
enters the kernel, no additional IPIs are sent. Further, IPIs are avoided
when not needed; for example, idle and non-cookie HTs do not need to be
forced into kernel mode.

More information about attacks:
For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
data to either host or guest attackers. For L1TF, it is possible to leak
to guest attackers. There is no possible mitigation involving flushing
of buffers to avoid this, since the execution of attacker and victim
happens concurrently on 2 or more HTs.
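
In outline, the flow this patch adds is (simplified sketch; the
implementation below also handles the nesting counters, locking and IPI
avoidance):

    /* kernel entry from user mode (syscall/IRQ): */
    sched_core_unsafe_enter();   /* mark core unsafe, IPI untrusted siblings */

    /* return to user mode: */
    sched_core_unsafe_exit();    /* drop this CPU's contribution */
    sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK); /* spin till core safe */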

Cc: Julien Desfossez <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Aaron Lu <[email protected]>
Cc: Aubrey Li <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Co-developed-by: Vineeth Pillai <[email protected]>
Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Vineeth Pillai <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 7 +
include/linux/entry-common.h | 2 +-
include/linux/sched.h | 12 +
kernel/entry/common.c | 25 +-
kernel/sched/core.c | 229 ++++++++++++++++++
kernel/sched/sched.h | 3 +
6 files changed, 275 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3236427e2215..48567110f709 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,13 @@

sbni= [NET] Granch SBNI12 leased line adapter

+ sched_core_protect_kernel=
+ [SCHED_CORE] Pause SMT siblings of a core running in
+ user mode, if at least one of the siblings of the core
+ is running in kernel mode. This is to guarantee that
+ kernel data is not leaked to tasks which are not trusted
+ by the kernel.
+
sched_debug [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..260216de357b 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -69,7 +69,7 @@

#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
+ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
ARCH_EXIT_TO_USER_MODE_WORK)

/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d38e904dd603..fe6f225bfbf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

+#ifdef CONFIG_SCHED_CORE
+void sched_core_unsafe_enter(void);
+void sched_core_unsafe_exit(void);
+bool sched_core_wait_till_safe(unsigned long ti_check);
+bool sched_core_kernel_protected(void);
+#else
+#define sched_core_unsafe_enter(ignore) do { } while (0)
+#define sched_core_unsafe_exit(ignore) do { } while (0)
+#define sched_core_wait_till_safe(ignore) do { } while (0)
+#define sched_core_kernel_protected(ignore) do { } while (0)
+#endif
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a1e20f8d4e8..c8dc6b1b1f40 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

+unsigned long exit_to_user_get_work(void)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+ return ti_work;
+
+#ifdef CONFIG_SCHED_CORE
+ ti_work &= EXIT_TO_USER_MODE_WORK;
+ if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
+ sched_core_unsafe_exit();
+ if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
+ sched_core_unsafe_enter(); /* not exiting to user yet. */
+ }
+ }
+
+ return READ_ONCE(current_thread_info()->flags);
+#endif
+}
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -175,7 +195,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = exit_to_user_get_work();
}

/* Return the latest work state for arch_exit_to_user_mode() */
@@ -184,9 +204,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work;

lockdep_assert_irqs_disabled();
+ ti_work = exit_to_user_get_work();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02db5b024768..5a7aeaa914e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -76,6 +76,27 @@ __read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

+DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
+static int __init set_sched_core_protect_kernel(char *str)
+{
+ unsigned long val = 0;
+
+ if (!str)
+ return 0;
+
+ if (!kstrtoul(str, 0, &val) && !val)
+ static_branch_disable(&sched_core_protect_kernel);
+
+ return 1;
+}
+__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
+
+/* Is the kernel protected by core scheduling? */
+bool sched_core_kernel_protected(void)
+{
+ return static_branch_likely(&sched_core_protect_kernel);
+}
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}

+/*
+ * Handler to attempt to enter kernel. It does nothing because the exit to
+ * usermode or guest mode will do the actual work (of waiting if needed).
+ */
+static void sched_core_irq_work(struct irq_work *work)
+{
+ return;
+}
+
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+ init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
+/*
+ * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
+ * exits the core-wide unsafe state. Obviously the CPU calling this function
+ * should not be responsible for the core being in the core-wide unsafe state
+ * otherwise it will deadlock.
+ *
+ * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
+ * the loop if TIF flags are set and notify caller about it.
+ *
+ * IRQs should be disabled.
+ */
+bool sched_core_wait_till_safe(unsigned long ti_check)
+{
+ bool restart = false;
+ struct rq *rq;
+ int cpu;
+
+ /* We clear the thread flag only at the end, so need to check for it. */
+ ti_check &= ~_TIF_UNSAFE_RET;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Downgrade to allow interrupts to prevent stop_machine lockups. */
+ preempt_disable();
+ local_irq_enable();
+
+ /*
+ * Wait till the core of this HT is not in an unsafe state.
+ *
+ * Pair with smp_store_release() in sched_core_unsafe_exit().
+ */
+ while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
+ cpu_relax();
+ if (READ_ONCE(current_thread_info()->flags) & ti_check) {
+ restart = true;
+ break;
+ }
+ }
+
+ /* Upgrade it back to the expectations of entry code. */
+ local_irq_disable();
+ preempt_enable();
+
+ret:
+ if (!restart)
+ clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ return restart;
+}
+
+/*
+ * Enter the core-wide IRQ state. Sibling will be paused if it is running
+ * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
+ * avoid sending useless IPIs is made. Must be called only from hard IRQ
+ * context.
+ */
+void sched_core_unsafe_enter(void)
+{
+ const struct cpumask *smt_mask;
+ unsigned long flags;
+ struct rq *rq;
+ int i, cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ /* Ensure that on return to user/guest, we check whether to wait. */
+ if (current->core_cookie)
+ set_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
+ rq->core_this_unsafe_nest++;
+
+ /* Should not nest: enter() should only pair with exit(). */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ smt_mask = cpu_smt_mask(cpu);
+
+ /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
+ WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
+
+ if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
+ goto unlock;
+
+ if (irq_work_is_busy(&rq->core_irq_work)) {
+ /*
+ * Do nothing more since we are in an IPI sent from another
+ * sibling to enforce safety. That sibling would have sent IPIs
+ * to all of the HTs.
+ */
+ goto unlock;
+ }
+
+ /*
+ * If we are not the first ones on the core to enter core-wide unsafe
+ * state, do nothing.
+ */
+ if (rq->core->core_unsafe_nest > 1)
+ goto unlock;
+
+ /* Do nothing more if the core is not tagged. */
+ if (!rq->core->core_cookie)
+ goto unlock;
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *srq = cpu_rq(i);
+
+ if (i == cpu || cpu_is_offline(i))
+ continue;
+
+ if (!srq->curr->mm || is_task_rq_idle(srq->curr))
+ continue;
+
+ /* Skip if HT is not running a tagged task. */
+ if (!srq->curr->core_cookie && !srq->core_pick)
+ continue;
+
+ /*
+ * Force sibling into the kernel by IPI. If work was already
+ * pending, no new IPIs are sent. This is Ok since the receiver
+ * would already be in the kernel, or on its way to it.
+ */
+ irq_work_queue_on(&srq->core_irq_work, i);
+ }
+unlock:
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
+/*
+ * Process any work needed for either exiting the core-wide unsafe state, or for
+ * waiting on this hyperthread if the core is still in this state.
+ *
+ * @idle: Are we called from the idle loop?
+ */
+void sched_core_unsafe_exit(void)
+{
+ unsigned long flags;
+ unsigned int nest;
+ struct rq *rq;
+ int cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ /* Do nothing if core-sched disabled. */
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /*
+ * Can happen when a process is forked and the first return to user
+ * mode is a syscall exit. Either way, there's nothing to do.
+ */
+ if (rq->core_this_unsafe_nest == 0)
+ goto ret;
+
+ rq->core_this_unsafe_nest--;
+
+ /* enter() should be paired with exit() only. */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ /*
+ * Core-wide nesting counter can never be 0 because we are
+ * still in it on this CPU.
+ */
+ nest = rq->core->core_unsafe_nest;
+ WARN_ON_ONCE(!nest);
+
+ /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
+ smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
// XXX fairness/fwd progress conditions
/*
* Returns
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f7e2d8a3be8e..4bcf3b1ddfb3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1059,12 +1059,15 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
+ struct irq_work core_irq_work; /* To force HT into kernel */
+ unsigned int core_this_unsafe_nest;

/* shared state */
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
unsigned char core_forceidle;
+ unsigned int core_unsafe_nest;
#endif
};

--
2.29.0.rc1.297.gfa9743e501-goog


2020-10-20 17:26:26

by Randy Dunlap

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On 10/19/20 6:43 PM, Joel Fernandes (Google) wrote:
>
> ---
> .../admin-guide/kernel-parameters.txt | 7 +
> include/linux/entry-common.h | 2 +-
> include/linux/sched.h | 12 +
> kernel/entry/common.c | 25 +-
> kernel/sched/core.c | 229 ++++++++++++++++++
> kernel/sched/sched.h | 3 +
> 6 files changed, 275 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 3236427e2215..48567110f709 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,13 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=

Needs a list of possible values after '=', along with telling us
what the default value/setting is.


> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel.
> +


thanks.

2020-10-22 08:31:35

by Li, Aubrey

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On 2020/10/20 9:43, Joel Fernandes (Google) wrote:
> Core-scheduling prevents hyperthreads in usermode from attacking each
> other, but it does not do anything about one of the hyperthreads
> entering the kernel for any reason. This leaves the door open for MDS
> and L1TF attacks with concurrent execution sequences between
> hyperthreads.
>
> This patch therefore adds support for protecting all syscall and IRQ
> kernel mode entries. Care is taken to track the outermost usermode exit
> and entry using per-cpu counters. In cases where one of the hyperthreads
> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> when not needed - example: idle and non-cookie HTs do not need to be
> forced into kernel mode.
>
> More information about attacks:
> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> data to either host or guest attackers. For L1TF, it is possible to leak
> to guest attackers. There is no possible mitigation involving flushing
> of buffers to avoid this since the execution of attacker and victims
> happen concurrently on 2 or more HTs.
>
> Cc: Julien Desfossez <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Aaron Lu <[email protected]>
> Cc: Aubrey Li <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Co-developed-by: Vineeth Pillai <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Pillai <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> .../admin-guide/kernel-parameters.txt | 7 +
> include/linux/entry-common.h | 2 +-
> include/linux/sched.h | 12 +
> kernel/entry/common.c | 25 +-
> kernel/sched/core.c | 229 ++++++++++++++++++
> kernel/sched/sched.h | 3 +
> 6 files changed, 275 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 3236427e2215..48567110f709 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,13 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=
> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel.
> +
> sched_debug [KNL] Enables verbose scheduler debug messages.
>
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 474f29638d2c..260216de357b 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -69,7 +69,7 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d38e904dd603..fe6f225bfbf9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +#ifdef CONFIG_SCHED_CORE
> +void sched_core_unsafe_enter(void);
> +void sched_core_unsafe_exit(void);
> +bool sched_core_wait_till_safe(unsigned long ti_check);
> +bool sched_core_kernel_protected(void);
> +#else
> +#define sched_core_unsafe_enter(ignore) do { } while (0)
> +#define sched_core_unsafe_exit(ignore) do { } while (0)
> +#define sched_core_wait_till_safe(ignore) do { } while (0)
> +#define sched_core_kernel_protected(ignore) do { } while (0)
> +#endif
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 0a1e20f8d4e8..c8dc6b1b1f40 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal(struct pt_regs *regs) { }
>
> +unsigned long exit_to_user_get_work(void)
> +{
> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +
> + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> + return ti_work;
> +
> +#ifdef CONFIG_SCHED_CORE
> + ti_work &= EXIT_TO_USER_MODE_WORK;
> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {

Though _TIF_UNSAFE_RET is not x86-specific, I only saw the definition in x86.
I'm not sure if other SMT-capable architectures are happy with this?


> + sched_core_unsafe_exit();
> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> + sched_core_unsafe_enter(); /* not exiting to user yet. */
> + }
> + }
> +
> + return READ_ONCE(current_thread_info()->flags);
> +#endif
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -175,7 +195,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * enabled above.
> */
> local_irq_disable_exit_to_user();
> - ti_work = READ_ONCE(current_thread_info()->flags);
> + ti_work = exit_to_user_get_work();
> }
>
> /* Return the latest work state for arch_exit_to_user_mode() */
> @@ -184,9 +204,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> static void exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> + unsigned long ti_work;
>
> lockdep_assert_irqs_disabled();
> + ti_work = exit_to_user_get_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02db5b024768..5a7aeaa914e3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>
> #ifdef CONFIG_SCHED_CORE
>
> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> +static int __init set_sched_core_protect_kernel(char *str)
> +{
> + unsigned long val = 0;
> +
> + if (!str)
> + return 0;
> +
> + if (!kstrtoul(str, 0, &val) && !val)
> + static_branch_disable(&sched_core_protect_kernel);
> +
> + return 1;
> +}
> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> +
> +/* Is the kernel protected by core scheduling? */
> +bool sched_core_kernel_protected(void)
> +{
> + return static_branch_likely(&sched_core_protect_kernel);
> +}
> +
> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>
> /* kernel prio, less is more */
> @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> return a->core_cookie == b->core_cookie;
> }
>
> +/*
> + * Handler to attempt to enter kernel. It does nothing because the exit to
> + * usermode or guest mode will do the actual work (of waiting if needed).
> + */
> +static void sched_core_irq_work(struct irq_work *work)
> +{
> + return;
> +}
> +
> +static inline void init_sched_core_irq_work(struct rq *rq)
> +{
> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> +}
> +
> +/*
> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> + * exits the core-wide unsafe state. Obviously the CPU calling this function
> + * should not be responsible for the core being in the core-wide unsafe state
> + * otherwise it will deadlock.
> + *
> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> + * the loop if TIF flags are set and notify caller about it.
> + *
> + * IRQs should be disabled.
> + */
> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> + bool restart = false;
> + struct rq *rq;
> + int cpu;
> +
> + /* We clear the thread flag only at the end, so need to check for it. */
> + ti_check &= ~_TIF_UNSAFE_RET;
> +
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> + preempt_disable();
> + local_irq_enable();
> +
> + /*
> + * Wait till the core of this HT is not in an unsafe state.
> + *
> + * Pair with smp_store_release() in sched_core_unsafe_exit().
> + */
> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> + cpu_relax();
> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> + restart = true;
> + break;
> + }
> + }
> +
> + /* Upgrade it back to the expectations of entry code. */
> + local_irq_disable();
> + preempt_enable();
> +
> +ret:
> + if (!restart)
> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + return restart;
> +}
> +
> +/*
> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> + * context.
> + */
> +void sched_core_unsafe_enter(void)
> +{
> + const struct cpumask *smt_mask;
> + unsigned long flags;
> + struct rq *rq;
> + int i, cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + /* Ensure that on return to user/guest, we check whether to wait. */
> + if (current->core_cookie)
> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> + rq->core_this_unsafe_nest++;
> +
> + /* Should not nest: enter() should only pair with exit(). */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + smt_mask = cpu_smt_mask(cpu);
> +
> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> +
> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> + goto unlock;
> +
> + if (irq_work_is_busy(&rq->core_irq_work)) {
> + /*
> + * Do nothing more since we are in an IPI sent from another
> + * sibling to enforce safety. That sibling would have sent IPIs
> + * to all of the HTs.
> + */
> + goto unlock;
> + }
> +
> + /*
> + * If we are not the first ones on the core to enter core-wide unsafe
> + * state, do nothing.
> + */
> + if (rq->core->core_unsafe_nest > 1)
> + goto unlock;
> +
> + /* Do nothing more if the core is not tagged. */
> + if (!rq->core->core_cookie)
> + goto unlock;
> +
> + for_each_cpu(i, smt_mask) {
> + struct rq *srq = cpu_rq(i);
> +
> + if (i == cpu || cpu_is_offline(i))
> + continue;
> +
> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> + continue;
> +
> + /* Skip if HT is not running a tagged task. */
> + if (!srq->curr->core_cookie && !srq->core_pick)
> + continue;
> +
> + /*
> + * Force sibling into the kernel by IPI. If work was already
> + * pending, no new IPIs are sent. This is Ok since the receiver
> + * would already be in the kernel, or on its way to it.
> + */
> + irq_work_queue_on(&srq->core_irq_work, i);
> + }
> +unlock:
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> +/*
> + * Process any work need for either exiting the core-wide unsafe state, or for
> + * waiting on this hyperthread if the core is still in this state.
> + *
> + * @idle: Are we called from the idle loop?
> + */
> +void sched_core_unsafe_exit(void)
> +{
> + unsigned long flags;
> + unsigned int nest;
> + struct rq *rq;
> + int cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + /* Do nothing if core-sched disabled. */
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /*
> + * Can happen when a process is forked and the first return to user
> + * mode is a syscall exit. Either way, there's nothing to do.
> + */
> + if (rq->core_this_unsafe_nest == 0)
> + goto ret;
> +
> + rq->core_this_unsafe_nest--;
> +
> + /* enter() should be paired with exit() only. */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + /*
> + * Core-wide nesting counter can never be 0 because we are
> + * still in it on this CPU.
> + */
> + nest = rq->core->core_unsafe_nest;
> + WARN_ON_ONCE(!nest);
> +
> + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
> + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> // XXX fairness/fwd progress conditions
> /*
> * Returns
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index f7e2d8a3be8e..4bcf3b1ddfb3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1059,12 +1059,15 @@ struct rq {
> unsigned int core_enabled;
> unsigned int core_sched_seq;
> struct rb_root core_tree;
> + struct irq_work core_irq_work; /* To force HT into kernel */
> + unsigned int core_this_unsafe_nest;
>
> /* shared state */
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> unsigned char core_forceidle;
> + unsigned int core_unsafe_nest;
> #endif
> };
>
>

2020-10-30 10:35:21

by Alexandre Chartre

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode


On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
> Core-scheduling prevents hyperthreads in usermode from attacking each
> other, but it does not do anything about one of the hyperthreads
> entering the kernel for any reason. This leaves the door open for MDS
> and L1TF attacks with concurrent execution sequences between
> hyperthreads.
>
> This patch therefore adds support for protecting all syscall and IRQ
> kernel mode entries. Care is taken to track the outermost usermode exit
> and entry using per-cpu counters. In cases where one of the hyperthreads
> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> when not needed - example: idle and non-cookie HTs do not need to be
> forced into kernel mode.

Hi Joel,

In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
see such a call. Am I missing something?


> More information about attacks:
> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> data to either host or guest attackers. For L1TF, it is possible to leak
> to guest attackers. There is no possible mitigation involving flushing
> of buffers to avoid this since the execution of attacker and victims
> happen concurrently on 2 or more HTs.
>
> Cc: Julien Desfossez <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Aaron Lu <[email protected]>
> Cc: Aubrey Li <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Co-developed-by: Vineeth Pillai <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Pillai <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> .../admin-guide/kernel-parameters.txt | 7 +
> include/linux/entry-common.h | 2 +-
> include/linux/sched.h | 12 +
> kernel/entry/common.c | 25 +-
> kernel/sched/core.c | 229 ++++++++++++++++++
> kernel/sched/sched.h | 3 +
> 6 files changed, 275 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 3236427e2215..48567110f709 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,13 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=
> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel.
> +
> sched_debug [KNL] Enables verbose scheduler debug messages.
>
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 474f29638d2c..260216de357b 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -69,7 +69,7 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d38e904dd603..fe6f225bfbf9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +#ifdef CONFIG_SCHED_CORE
> +void sched_core_unsafe_enter(void);
> +void sched_core_unsafe_exit(void);
> +bool sched_core_wait_till_safe(unsigned long ti_check);
> +bool sched_core_kernel_protected(void);
> +#else
> +#define sched_core_unsafe_enter(ignore) do { } while (0)
> +#define sched_core_unsafe_exit(ignore) do { } while (0)
> +#define sched_core_wait_till_safe(ignore) do { } while (0)
> +#define sched_core_kernel_protected(ignore) do { } while (0)
> +#endif
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 0a1e20f8d4e8..c8dc6b1b1f40 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal(struct pt_regs *regs) { }
>
> +unsigned long exit_to_user_get_work(void)
> +{
> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +
> + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> + return ti_work;
> +
> +#ifdef CONFIG_SCHED_CORE
> + ti_work &= EXIT_TO_USER_MODE_WORK;
> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> + sched_core_unsafe_exit();
> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {

If we call sched_core_unsafe_exit() before sched_core_wait_till_safe(), then we
expose ourselves during the entire wait period in sched_core_wait_till_safe().
It would be better to call sched_core_unsafe_exit() once we know for sure we
are going to exit.

alex.


> + sched_core_unsafe_enter(); /* not exiting to user yet. */
> + }
> + }
> +
> + return READ_ONCE(current_thread_info()->flags);
> +#endif
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -175,7 +195,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * enabled above.
> */
> local_irq_disable_exit_to_user();
> - ti_work = READ_ONCE(current_thread_info()->flags);
> + ti_work = exit_to_user_get_work();
> }
>
> /* Return the latest work state for arch_exit_to_user_mode() */
> @@ -184,9 +204,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> static void exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> + unsigned long ti_work;
>
> lockdep_assert_irqs_disabled();
> + ti_work = exit_to_user_get_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02db5b024768..5a7aeaa914e3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>
> #ifdef CONFIG_SCHED_CORE
>
> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> +static int __init set_sched_core_protect_kernel(char *str)
> +{
> + unsigned long val = 0;
> +
> + if (!str)
> + return 0;
> +
> + if (!kstrtoul(str, 0, &val) && !val)
> + static_branch_disable(&sched_core_protect_kernel);
> +
> + return 1;
> +}
> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> +
> +/* Is the kernel protected by core scheduling? */
> +bool sched_core_kernel_protected(void)
> +{
> + return static_branch_likely(&sched_core_protect_kernel);
> +}
> +
> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>
> /* kernel prio, less is more */
> @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> return a->core_cookie == b->core_cookie;
> }
>
> +/*
> + * Handler to attempt to enter kernel. It does nothing because the exit to
> + * usermode or guest mode will do the actual work (of waiting if needed).
> + */
> +static void sched_core_irq_work(struct irq_work *work)
> +{
> + return;
> +}
> +
> +static inline void init_sched_core_irq_work(struct rq *rq)
> +{
> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> +}
> +
> +/*
> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> + * exits the core-wide unsafe state. Obviously the CPU calling this function
> + * should not be responsible for the core being in the core-wide unsafe state
> + * otherwise it will deadlock.
> + *
> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> + * the loop if TIF flags are set and notify caller about it.
> + *
> + * IRQs should be disabled.
> + */
> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> + bool restart = false;
> + struct rq *rq;
> + int cpu;
> +
> + /* We clear the thread flag only at the end, so need to check for it. */
> + ti_check &= ~_TIF_UNSAFE_RET;
> +
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> + preempt_disable();
> + local_irq_enable();
> +
> + /*
> + * Wait till the core of this HT is not in an unsafe state.
> + *
> + * Pair with smp_store_release() in sched_core_unsafe_exit().
> + */
> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> + cpu_relax();
> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> + restart = true;
> + break;
> + }
> + }
> +
> + /* Upgrade it back to the expectations of entry code. */
> + local_irq_disable();
> + preempt_enable();
> +
> +ret:
> + if (!restart)
> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + return restart;
> +}
> +
> +/*
> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> + * context.
> + */
> +void sched_core_unsafe_enter(void)
> +{
> + const struct cpumask *smt_mask;
> + unsigned long flags;
> + struct rq *rq;
> + int i, cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + /* Ensure that on return to user/guest, we check whether to wait. */
> + if (current->core_cookie)
> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> + rq->core_this_unsafe_nest++;
> +
> + /* Should not nest: enter() should only pair with exit(). */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + smt_mask = cpu_smt_mask(cpu);
> +
> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> +
> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> + goto unlock;
> +
> + if (irq_work_is_busy(&rq->core_irq_work)) {
> + /*
> + * Do nothing more since we are in an IPI sent from another
> + * sibling to enforce safety. That sibling would have sent IPIs
> + * to all of the HTs.
> + */
> + goto unlock;
> + }
> +
> + /*
> + * If we are not the first ones on the core to enter core-wide unsafe
> + * state, do nothing.
> + */
> + if (rq->core->core_unsafe_nest > 1)
> + goto unlock;
> +
> + /* Do nothing more if the core is not tagged. */
> + if (!rq->core->core_cookie)
> + goto unlock;
> +
> + for_each_cpu(i, smt_mask) {
> + struct rq *srq = cpu_rq(i);
> +
> + if (i == cpu || cpu_is_offline(i))
> + continue;
> +
> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> + continue;
> +
> + /* Skip if HT is not running a tagged task. */
> + if (!srq->curr->core_cookie && !srq->core_pick)
> + continue;
> +
> + /*
> + * Force sibling into the kernel by IPI. If work was already
> + * pending, no new IPIs are sent. This is Ok since the receiver
> + * would already be in the kernel, or on its way to it.
> + */
> + irq_work_queue_on(&srq->core_irq_work, i);
> + }
> +unlock:
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> +/*
> + * Process any work need for either exiting the core-wide unsafe state, or for
> + * waiting on this hyperthread if the core is still in this state.
> + *
> + * @idle: Are we called from the idle loop?
> + */
> +void sched_core_unsafe_exit(void)
> +{
> + unsigned long flags;
> + unsigned int nest;
> + struct rq *rq;
> + int cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + /* Do nothing if core-sched disabled. */
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /*
> + * Can happen when a process is forked and the first return to user
> + * mode is a syscall exit. Either way, there's nothing to do.
> + */
> + if (rq->core_this_unsafe_nest == 0)
> + goto ret;
> +
> + rq->core_this_unsafe_nest--;
> +
> + /* enter() should be paired with exit() only. */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + /*
> + * Core-wide nesting counter can never be 0 because we are
> + * still in it on this CPU.
> + */
> + nest = rq->core->core_unsafe_nest;
> + WARN_ON_ONCE(!nest);
> +
> + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
> + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> // XXX fairness/fwd progress conditions
> /*
> * Returns
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index f7e2d8a3be8e..4bcf3b1ddfb3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1059,12 +1059,15 @@ struct rq {
> unsigned int core_enabled;
> unsigned int core_sched_seq;
> struct rb_root core_tree;
> + struct irq_work core_irq_work; /* To force HT into kernel */
> + unsigned int core_this_unsafe_nest;
>
> /* shared state */
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> unsigned char core_forceidle;
> + unsigned int core_unsafe_nest;
> #endif
> };
>
>

2020-11-03 00:22:26

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Mon, Oct 19, 2020 at 08:41:04PM -0700, Randy Dunlap wrote:
> On 10/19/20 6:43 PM, Joel Fernandes (Google) wrote:
> >
> > ---
> > .../admin-guide/kernel-parameters.txt | 7 +
> > include/linux/entry-common.h | 2 +-
> > include/linux/sched.h | 12 +
> > kernel/entry/common.c | 25 +-
> > kernel/sched/core.c | 229 ++++++++++++++++++
> > kernel/sched/sched.h | 3 +
> > 6 files changed, 275 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 3236427e2215..48567110f709 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -4678,6 +4678,13 @@
> > sbni= [NET] Granch SBNI12 leased line adapter
> > + sched_core_protect_kernel=
>
> Needs a list of possible values after '=', along with telling us
> what the default value/setting is.

Ok, I changed it to the following:

sched_core_protect_kernel=
[SCHED_CORE] Pause SMT siblings of a core running in
user mode, if at least one of the siblings of the core
is running in kernel mode. This is to guarantee that
kernel data is not leaked to tasks which are not trusted
by the kernel. A value of 0 disables protection, 1
enables protection. The default is 1.
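
For example, booting with the following command-line option turns the
protection off:

    sched_core_protect_kernel=0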

thanks,

- Joel


> > + [SCHED_CORE] Pause SMT siblings of a core running in
> > + user mode, if at least one of the siblings of the core
> > + is running in kernel mode. This is to guarantee that
> > + kernel data is not leaked to tasks which are not trusted
> > + by the kernel.
> > +
>
>
> thanks.

2020-11-03 00:52:42

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Thu, Oct 22, 2020 at 01:48:12PM +0800, Li, Aubrey wrote:
> On 2020/10/20 9:43, Joel Fernandes (Google) wrote:
> > Core-scheduling prevents hyperthreads in usermode from attacking each
> > other, but it does not do anything about one of the hyperthreads
> > entering the kernel for any reason. This leaves the door open for MDS
> > and L1TF attacks with concurrent execution sequences between
> > hyperthreads.
> >
> > This patch therefore adds support for protecting all syscall and IRQ
> > kernel mode entries. Care is taken to track the outermost usermode exit
> > and entry using per-cpu counters. In cases where one of the hyperthreads
> > enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> > when not needed - example: idle and non-cookie HTs do not need to be
> > forced into kernel mode.
> >
> > More information about attacks:
> > For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> > data to either host or guest attackers. For L1TF, it is possible to leak
> > to guest attackers. There is no possible mitigation involving flushing
> > of buffers to avoid this since the execution of attacker and victims
> > happen concurrently on 2 or more HTs.
> >
> > Cc: Julien Desfossez <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Aaron Lu <[email protected]>
> > Cc: Aubrey Li <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Paul E. McKenney <[email protected]>
> > Co-developed-by: Vineeth Pillai <[email protected]>
> > Tested-by: Julien Desfossez <[email protected]>
> > Signed-off-by: Vineeth Pillai <[email protected]>
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > ---
> > .../admin-guide/kernel-parameters.txt | 7 +
> > include/linux/entry-common.h | 2 +-
> > include/linux/sched.h | 12 +
> > kernel/entry/common.c | 25 +-
> > kernel/sched/core.c | 229 ++++++++++++++++++
> > kernel/sched/sched.h | 3 +
> > 6 files changed, 275 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 3236427e2215..48567110f709 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -4678,6 +4678,13 @@
> >
> > sbni= [NET] Granch SBNI12 leased line adapter
> >
> > + sched_core_protect_kernel=
> > + [SCHED_CORE] Pause SMT siblings of a core running in
> > + user mode, if at least one of the siblings of the core
> > + is running in kernel mode. This is to guarantee that
> > + kernel data is not leaked to tasks which are not trusted
> > + by the kernel.
> > +
> > sched_debug [KNL] Enables verbose scheduler debug messages.
> >
> > schedstats= [KNL,X86] Enable or disable scheduled statistics.
> > diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> > index 474f29638d2c..260216de357b 100644
> > --- a/include/linux/entry-common.h
> > +++ b/include/linux/entry-common.h
> > @@ -69,7 +69,7 @@
> >
> > #define EXIT_TO_USER_MODE_WORK \
> > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> > - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> > + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> > ARCH_EXIT_TO_USER_MODE_WORK)
> >
> > /**
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d38e904dd603..fe6f225bfbf9 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
> >
> > const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
> >
> > +#ifdef CONFIG_SCHED_CORE
> > +void sched_core_unsafe_enter(void);
> > +void sched_core_unsafe_exit(void);
> > +bool sched_core_wait_till_safe(unsigned long ti_check);
> > +bool sched_core_kernel_protected(void);
> > +#else
> > +#define sched_core_unsafe_enter(ignore) do { } while (0)
> > +#define sched_core_unsafe_exit(ignore) do { } while (0)
> > +#define sched_core_wait_till_safe(ignore) do { } while (0)
> > +#define sched_core_kernel_protected(ignore) do { } while (0)
> > +#endif
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 0a1e20f8d4e8..c8dc6b1b1f40 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
> > /* Workaround to allow gradual conversion of architecture code */
> > void __weak arch_do_signal(struct pt_regs *regs) { }
> >
> > +unsigned long exit_to_user_get_work(void)
> > +{
> > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +
> > + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > + return ti_work;
> > +
> > +#ifdef CONFIG_SCHED_CORE
> > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
>
> Though _TIF_UNSAFE_RET is not x86 specific, but I only saw the definition in x86.
> I'm not sure if other SMT capable architectures are happy with this?

Sorry for the late reply.

Yes, the arch has to define it if it wants kernel protection. There's no way
around it.
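
For example, an SMT-capable arch would add something along these lines to its
asm/thread_info.h (the bit number here is only illustrative):

    #define TIF_UNSAFE_RET          26      /* whichever TIF bit is free on the arch */
    #define _TIF_UNSAFE_RET         (1 << TIF_UNSAFE_RET)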

But I need to stub _TIF_UNSAFE_RET to 0 if it is not defined, as is done for
the other TIF flags. I'll make the changes as below.

thanks!

- Joel

---8<-----------------------

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 34dbbb764d6c..a338d5d64c3d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4684,7 +4684,8 @@
is running in kernel mode. This is to guarantee that
kernel data is not leaked to tasks which are not trusted
by the kernel. A value of 0 disables protection, 1
- enables protection. The default is 1.
+ enables protection. The default is 1. Note that protection
+ depends on the arch defining the _TIF_UNSAFE_RET flag.

sched_debug [KNL] Enables verbose scheduler debug messages.

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 879562d920f2..4475d545f9a0 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -33,6 +33,10 @@
# define _TIF_PATCH_PENDING (0)
#endif

+#ifndef _TIF_UNSAFE_RET
+# define _TIF_UNSAFE_RET (0)
+#endif
+
#ifndef _TIF_UPROBE
# define _TIF_UPROBE (0)
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8e1974e5008..a18ed60cedea 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -28,7 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)

instrumentation_begin();
trace_hardirqs_off_finish();
- sched_core_unsafe_enter();
+ if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
+ sched_core_unsafe_enter();
instrumentation_end();
}

@@ -142,7 +143,8 @@ unsigned long exit_to_user_get_work(void)
{
unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

- if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+ if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+ || !_TIF_UNSAFE_RET)
return ti_work;

#ifdef CONFIG_SCHED_CORE
--
2.29.1.341.ge80a0c044ae-goog


2020-11-03 01:27:07

by Joel Fernandes

Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

Hi Alexandre,

Sorry for the late reply, as I was working on the snapshotting patch...

On Fri, Oct 30, 2020 at 11:29:26AM +0100, Alexandre Chartre wrote:
>
> On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
> > Core-scheduling prevents hyperthreads in usermode from attacking each
> > other, but it does not do anything about one of the hyperthreads
> > entering the kernel for any reason. This leaves the door open for MDS
> > and L1TF attacks with concurrent execution sequences between
> > hyperthreads.
> >
> > This patch therefore adds support for protecting all syscall and IRQ
> > kernel mode entries. Care is taken to track the outermost usermode exit
> > and entry using per-cpu counters. In cases where one of the hyperthreads
> > enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> > when not needed - example: idle and non-cookie HTs do not need to be
> > forced into kernel mode.
>
> Hi Joel,
>
> In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
> call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
> see such a call. Am I missing something?

Yes, this is a known bug and it is fixed in v9, which I'll post soon. Meanwhile,
the updated patch is appended below:

> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 3236427e2215..48567110f709 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -4678,6 +4678,13 @@
> > sbni= [NET] Granch SBNI12 leased line adapter
> > + sched_core_protect_kernel=
> > + [SCHED_CORE] Pause SMT siblings of a core running in
> > + user mode, if at least one of the siblings of the core
> > + is running in kernel mode. This is to guarantee that
> > + kernel data is not leaked to tasks which are not trusted
> > + by the kernel.
> > +
> > sched_debug [KNL] Enables verbose scheduler debug messages.
> > schedstats= [KNL,X86] Enable or disable scheduled statistics.
> > diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> > index 474f29638d2c..260216de357b 100644
> > --- a/include/linux/entry-common.h
> > +++ b/include/linux/entry-common.h
> > @@ -69,7 +69,7 @@
> > #define EXIT_TO_USER_MODE_WORK \
> > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> > - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> > + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> > ARCH_EXIT_TO_USER_MODE_WORK)
> > /**
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d38e904dd603..fe6f225bfbf9 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
> > const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
> > +#ifdef CONFIG_SCHED_CORE
> > +void sched_core_unsafe_enter(void);
> > +void sched_core_unsafe_exit(void);
> > +bool sched_core_wait_till_safe(unsigned long ti_check);
> > +bool sched_core_kernel_protected(void);
> > +#else
> > +#define sched_core_unsafe_enter(ignore) do { } while (0)
> > +#define sched_core_unsafe_exit(ignore) do { } while (0)
> > +#define sched_core_wait_till_safe(ignore) do { } while (0)
> > +#define sched_core_kernel_protected(ignore) do { } while (0)
> > +#endif
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 0a1e20f8d4e8..c8dc6b1b1f40 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
> > /* Workaround to allow gradual conversion of architecture code */
> > void __weak arch_do_signal(struct pt_regs *regs) { }
> > +unsigned long exit_to_user_get_work(void)
> > +{
> > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +
> > + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > + return ti_work;
> > +
> > +#ifdef CONFIG_SCHED_CORE
> > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> > + sched_core_unsafe_exit();
> > + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
>
> If we call sched_core_unsafe_exit() before sched_core_wait_till_safe() then we
> expose ourself during the entire wait period in sched_core_wait_till_safe(). It
> would be better to call sched_core_unsafe_exit() once we know for sure we are
> going to exit.

The way the algorithm works right now, it requires the current task to get
out of the unsafe state while waiting, otherwise it will lock up. Note that
we wait with interrupts enabled, so new interrupts can arrive while waiting.
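
To make that concrete, the ordering in exit_to_user_get_work() is (annotated
excerpt from the patch below):

    sched_core_unsafe_exit();       /* drop this CPU's contribution first */
    if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
            /*
             * The wait loop spins on rq->core->core_unsafe_nest > 0, so if
             * we had not dropped our own contribution above, our own count
             * would keep the loop spinning forever.
             */
            sched_core_unsafe_enter(); /* not exiting to user yet */
    }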

TBH this code is very tricky to get right and it took a long time to get it
working properly. For now I am content with the way it works; we can improve
it further incrementally in the future.

Let me know if I may add your Reviewed-by tag to this patch if there are no
other comments; I would appreciate it. The updated patch is appended below.

thanks,

- Joel

---8<-----------------------

From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <[email protected]>
Date: Mon, 27 Jul 2020 17:56:14 -0400
Subject: [PATCH] kernel/entry: Add support for core-wide protection of
kernel-mode

Core-scheduling prevents hyperthreads in usermode from attacking each
other, but it does not do anything about one of the hyperthreads
entering the kernel for any reason. This leaves the door open for MDS
and L1TF attacks with concurrent execution sequences between
hyperthreads.

This patch therefore adds support for protecting all syscall and IRQ
kernel mode entries. Care is taken to track the outermost usermode exit
and entry using per-cpu counters. In cases where one of the hyperthreads
enters the kernel, no additional IPIs are sent. Further, IPIs are avoided
when not needed; for example, idle and non-cookie HTs do not need to be
forced into kernel mode.

More information about attacks:
For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
data to either host or guest attackers. For L1TF, it is possible to leak
to guest attackers. There is no possible mitigation involving flushing
of buffers to avoid this, since the execution of attacker and victim
happens concurrently on 2 or more HTs.

Cc: Julien Desfossez <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Aaron Lu <[email protected]>
Cc: Aubrey Li <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Co-developed-by: Vineeth Pillai <[email protected]>
Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Vineeth Pillai <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 9 +
include/linux/entry-common.h | 6 +-
include/linux/sched.h | 12 +
kernel/entry/common.c | 28 ++-
kernel/sched/core.c | 230 ++++++++++++++++++
kernel/sched/sched.h | 3 +
6 files changed, 285 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3236427e2215..a338d5d64c3d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,15 @@

sbni= [NET] Granch SBNI12 leased line adapter

+ sched_core_protect_kernel=
+ [SCHED_CORE] Pause SMT siblings of a core running in
+ user mode, if at least one of the siblings of the core
+ is running in kernel mode. This is to guarantee that
+ kernel data is not leaked to tasks which are not trusted
+ by the kernel. A value of 0 disables protection, 1
+ enables protection. The default is 1. Note that protection
+ depends on the arch defining the _TIF_UNSAFE_RET flag.
+
sched_debug [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..62278c5b3b5f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -33,6 +33,10 @@
# define _TIF_PATCH_PENDING (0)
#endif

+#ifndef _TIF_UNSAFE_RET
+# define _TIF_UNSAFE_RET (0)
+#endif
+
#ifndef _TIF_UPROBE
# define _TIF_UPROBE (0)
#endif
@@ -69,7 +73,7 @@

#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
+ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
ARCH_EXIT_TO_USER_MODE_WORK)

/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d38e904dd603..fe6f225bfbf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

+#ifdef CONFIG_SCHED_CORE
+void sched_core_unsafe_enter(void);
+void sched_core_unsafe_exit(void);
+bool sched_core_wait_till_safe(unsigned long ti_check);
+bool sched_core_kernel_protected(void);
+#else
+#define sched_core_unsafe_enter(ignore) do { } while (0)
+#define sched_core_unsafe_exit(ignore) do { } while (0)
+#define sched_core_wait_till_safe(ignore) do { } while (0)
+#define sched_core_kernel_protected(ignore) do { } while (0)
+#endif
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a1e20f8d4e8..a18ed60cedea 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)

instrumentation_begin();
trace_hardirqs_off_finish();
+ if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
+ sched_core_unsafe_enter();
instrumentation_end();
}

@@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

+unsigned long exit_to_user_get_work(void)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+ || !_TIF_UNSAFE_RET)
+ return ti_work;
+
+#ifdef CONFIG_SCHED_CORE
+ ti_work &= EXIT_TO_USER_MODE_WORK;
+ if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
+ sched_core_unsafe_exit();
+ if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
+ sched_core_unsafe_enter(); /* not exiting to user yet. */
+ }
+ }
+
+ return READ_ONCE(current_thread_info()->flags);
+#endif
+}
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = exit_to_user_get_work();
}

/* Return the latest work state for arch_exit_to_user_mode() */
@@ -184,9 +207,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work;

lockdep_assert_irqs_disabled();
+ ti_work = exit_to_user_get_work();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e05728bdb18c..bd206708fac2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -76,6 +76,27 @@ __read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

+DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
+static int __init set_sched_core_protect_kernel(char *str)
+{
+ unsigned long val = 0;
+
+ if (!str)
+ return 0;
+
+ if (!kstrtoul(str, 0, &val) && !val)
+ static_branch_disable(&sched_core_protect_kernel);
+
+ return 1;
+}
+__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
+
+/* Is the kernel protected by core scheduling? */
+bool sched_core_kernel_protected(void)
+{
+ return static_branch_likely(&sched_core_protect_kernel);
+}
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}

+/*
+ * Handler to attempt to enter kernel. It does nothing because the exit to
+ * usermode or guest mode will do the actual work (of waiting if needed).
+ */
+static void sched_core_irq_work(struct irq_work *work)
+{
+ return;
+}
+
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+ init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
+/*
+ * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
+ * exits the core-wide unsafe state. Obviously the CPU calling this function
+ * should not be responsible for the core being in the core-wide unsafe state
+ * otherwise it will deadlock.
+ *
+ * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
+ * the loop if TIF flags are set and notify caller about it.
+ *
+ * IRQs should be disabled.
+ */
+bool sched_core_wait_till_safe(unsigned long ti_check)
+{
+ bool restart = false;
+ struct rq *rq;
+ int cpu;
+
+ /* We clear the thread flag only at the end, so need to check for it. */
+ ti_check &= ~_TIF_UNSAFE_RET;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Down grade to allow interrupts to prevent stop_machine lockups.. */
+ preempt_disable();
+ local_irq_enable();
+
+ /*
+ * Wait till the core of this HT is not in an unsafe state.
+ *
+ * Pair with smp_store_release() in sched_core_unsafe_exit().
+ */
+ while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
+ cpu_relax();
+ if (READ_ONCE(current_thread_info()->flags) & ti_check) {
+ restart = true;
+ break;
+ }
+ }
+
+ /* Upgrade it back to the expectations of entry code. */
+ local_irq_disable();
+ preempt_enable();
+
+ret:
+ if (!restart)
+ clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ return restart;
+}
+
+/*
+ * Enter the core-wide IRQ state. Sibling will be paused if it is running
+ * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
+ * avoid sending useless IPIs is made. Must be called only from hard IRQ
+ * context.
+ */
+void sched_core_unsafe_enter(void)
+{
+ const struct cpumask *smt_mask;
+ unsigned long flags;
+ struct rq *rq;
+ int i, cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ /* Ensure that on return to user/guest, we check whether to wait. */
+ if (current->core_cookie)
+ set_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
+ rq->core_this_unsafe_nest++;
+
+ /* Should not nest: enter() should only pair with exit(). */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ smt_mask = cpu_smt_mask(cpu);
+
+ /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
+ WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
+
+ if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
+ goto unlock;
+
+ if (irq_work_is_busy(&rq->core_irq_work)) {
+ /*
+ * Do nothing more since we are in an IPI sent from another
+ * sibling to enforce safety. That sibling would have sent IPIs
+ * to all of the HTs.
+ */
+ goto unlock;
+ }
+
+ /*
+ * If we are not the first ones on the core to enter core-wide unsafe
+ * state, do nothing.
+ */
+ if (rq->core->core_unsafe_nest > 1)
+ goto unlock;
+
+ /* Do nothing more if the core is not tagged. */
+ if (!rq->core->core_cookie)
+ goto unlock;
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *srq = cpu_rq(i);
+
+ if (i == cpu || cpu_is_offline(i))
+ continue;
+
+ if (!srq->curr->mm || is_task_rq_idle(srq->curr))
+ continue;
+
+ /* Skip if HT is not running a tagged task. */
+ if (!srq->curr->core_cookie && !srq->core_pick)
+ continue;
+
+ /*
+ * Force sibling into the kernel by IPI. If work was already
+ * pending, no new IPIs are sent. This is Ok since the receiver
+ * would already be in the kernel, or on its way to it.
+ */
+ irq_work_queue_on(&srq->core_irq_work, i);
+ }
+unlock:
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
+/*
+ * Process any work need for either exiting the core-wide unsafe state, or for
+ * waiting on this hyperthread if the core is still in this state.
+ *
+ * @idle: Are we called from the idle loop?
+ */
+void sched_core_unsafe_exit(void)
+{
+ unsigned long flags;
+ unsigned int nest;
+ struct rq *rq;
+ int cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ /* Do nothing if core-sched disabled. */
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /*
+ * Can happen when a process is forked and the first return to user
+ * mode is a syscall exit. Either way, there's nothing to do.
+ */
+ if (rq->core_this_unsafe_nest == 0)
+ goto ret;
+
+ rq->core_this_unsafe_nest--;
+
+ /* enter() should be paired with exit() only. */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ /*
+ * Core-wide nesting counter can never be 0 because we are
+ * still in it on this CPU.
+ */
+ nest = rq->core->core_unsafe_nest;
+ WARN_ON_ONCE(!nest);
+
+ /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
+ smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
// XXX fairness/fwd progress conditions
/*
* Returns
@@ -5019,6 +5248,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
rq = cpu_rq(i);
if (rq->core && rq->core == rq)
core_rq = rq;
+ init_sched_core_irq_work(rq);
}

if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f7e2d8a3be8e..4bcf3b1ddfb3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1059,12 +1059,15 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
+ struct irq_work core_irq_work; /* To force HT into kernel */
+ unsigned int core_this_unsafe_nest;

/* shared state */
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
unsigned char core_forceidle;
+ unsigned int core_unsafe_nest;
#endif
};

--
2.29.1.341.ge80a0c044ae-goog

2020-11-06 16:59:36

by Alexandre Chartre

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode



On 11/3/20 2:20 AM, Joel Fernandes wrote:
> Hi Alexandre,
>
> Sorry for late reply as I was working on the snapshotting patch...
>
> On Fri, Oct 30, 2020 at 11:29:26AM +0100, Alexandre Chartre wrote:
>>
>> On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
>>> Core-scheduling prevents hyperthreads in usermode from attacking each
>>> other, but it does not do anything about one of the hyperthreads
>>> entering the kernel for any reason. This leaves the door open for MDS
>>> and L1TF attacks with concurrent execution sequences between
>>> hyperthreads.
>>>
>>> This patch therefore adds support for protecting all syscall and IRQ
>>> kernel mode entries. Care is taken to track the outermost usermode exit
>>> and entry using per-cpu counters. In cases where one of the hyperthreads
>>> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
>>> when not needed - example: idle and non-cookie HTs do not need to be
>>> forced into kernel mode.
>>
>> Hi Joel,
>>
>> In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
>> call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
>> see such a call. Am I missing something?
>
> Yes, this is known bug and fixed in v9 which I'll post soon. Meanwhile
> updated patch is appended below:
>

[..]

>>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>>> index 0a1e20f8d4e8..c8dc6b1b1f40 100644
>>> --- a/kernel/entry/common.c
>>> +++ b/kernel/entry/common.c
>>> @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
>>> /* Workaround to allow gradual conversion of architecture code */
>>> void __weak arch_do_signal(struct pt_regs *regs) { }
>>> +unsigned long exit_to_user_get_work(void)
>>> +{
>>> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
>>> +
>>> + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
>>> + return ti_work;
>>> +
>>> +#ifdef CONFIG_SCHED_CORE
>>> + ti_work &= EXIT_TO_USER_MODE_WORK;
>>> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
>>> + sched_core_unsafe_exit();
>>> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
>>
>> If we call sched_core_unsafe_exit() before sched_core_wait_till_safe() then we
>> expose ourself during the entire wait period in sched_core_wait_till_safe(). It
>> would be better to call sched_core_unsafe_exit() once we know for sure we are
>> going to exit.
>
> The way the algorithm works right now, it requires the current task to get
> out of the unsafe state while waiting otherwise it will lockup. Note that we
> wait with interrupts enabled so new interrupts could come while waiting.
>
> TBH this code is very tricky to get right and it took long time to get it
> working properly. For now I am content with the way it works. We can improve
> further incrementally on it in the future.
>

I am concerned this leaves a lot of windows open (even with the updated patch)
where the system remains exposed. There are 3 obvious windows (sketched roughly
below):

- after switching to the kernel page-table and until enter_from_user_mode() is called
- while waiting for other cpus
- after leaving exit_to_user_mode_loop() and until switching back to the user page-table
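
A rough sketch of where these windows fall in the generic entry path
(simplified; the exact CR3 switch points are an assumption based on the x86
PTI flow, and everything other than the entry/common.c function names is
illustrative):

    /* syscall/IRQ entry asm: switch to the kernel page-table (CR3)       */
    /*   window 1: until enter_from_user_mode() -> unsafe_enter() runs    */
    enter_from_user_mode(regs);
    /* ... kernel work ... */
    exit_to_user_mode_prepare(regs);
    /*   window 2: spinning in sched_core_wait_till_safe() with this      */
    /*   CPU's unsafe count already dropped                               */
    /*   window 3: after exit_to_user_mode_loop() until the exit asm      */
    /*   switches back to the user page-table                             */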

Also on syscall/interrupt entry, sched_core_unsafe_enter() is called (in the
updated patch) and this sends an IPI to other CPUs, but it doesn't wait for the
other CPUs to effectively switch to the kernel page-table. It also seems like
the case where the CPU is interrupted by an NMI is not handled.

I know the code is tricky, and I am working on something similar for ASI (ASI
lockdown) where I am addressing all these cases (this seems to work).

> Let me know if I may add your Reviewed-by tag for this patch, if there are no
> other comments, and I appreciate it. Appended the updated patch below.
>

I haven't effectively reviewed the code yet. Also I wonder if the work I am doing
with ASI for synchronizing sibling cpus (i.e. the ASI lockdown) and the integration
with PTI could provide what you need. Basically each process has an ASI, and the
ASI lockdown ensures that sibling cpus are also running with a trusted ASI. If
the process/ASI is interrupted (e.g. on interrupt/exception/NMI) then it forces
sibling cpus to also interrupt ASI. The sibling cpus' synchronization occurs when
switching the page-tables (between user and kernel), so there's no exposure window.

Let me have a closer look.

alex.


> ---8<-----------------------
>
> From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
> From: "Joel Fernandes (Google)" <[email protected]>
> Date: Mon, 27 Jul 2020 17:56:14 -0400
> Subject: [PATCH] kernel/entry: Add support for core-wide protection of
> kernel-mode
>
> Core-scheduling prevents hyperthreads in usermode from attacking each
> other, but it does not do anything about one of the hyperthreads
> entering the kernel for any reason. This leaves the door open for MDS
> and L1TF attacks with concurrent execution sequences between
> hyperthreads.
>
> This patch therefore adds support for protecting all syscall and IRQ
> kernel mode entries. Care is taken to track the outermost usermode exit
> and entry using per-cpu counters. In cases where one of the hyperthreads
> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> when not needed - example: idle and non-cookie HTs do not need to be
> forced into kernel mode.
>
> More information about attacks:
> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> data to either host or guest attackers. For L1TF, it is possible to leak
> to guest attackers. There is no possible mitigation involving flushing
> of buffers to avoid this since the execution of attacker and victims
> happen concurrently on 2 or more HTs.
>
> Cc: Julien Desfossez <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Aaron Lu <[email protected]>
> Cc: Aubrey Li <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Co-developed-by: Vineeth Pillai <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Pillai <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> .../admin-guide/kernel-parameters.txt | 9 +
> include/linux/entry-common.h | 6 +-
> include/linux/sched.h | 12 +
> kernel/entry/common.c | 28 ++-
> kernel/sched/core.c | 230 ++++++++++++++++++
> kernel/sched/sched.h | 3 +
> 6 files changed, 285 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 3236427e2215..a338d5d64c3d 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,15 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=
> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel. A value of 0 disables protection, 1
> + enables protection. The default is 1. Note that protection
> + depends on the arch defining the _TIF_UNSAFE_RET flag.
> +
> sched_debug [KNL] Enables verbose scheduler debug messages.
>
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 474f29638d2c..62278c5b3b5f 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -33,6 +33,10 @@
> # define _TIF_PATCH_PENDING (0)
> #endif
>
> +#ifndef _TIF_UNSAFE_RET
> +# define _TIF_UNSAFE_RET (0)
> +#endif
> +
> #ifndef _TIF_UPROBE
> # define _TIF_UPROBE (0)
> #endif
> @@ -69,7 +73,7 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d38e904dd603..fe6f225bfbf9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +#ifdef CONFIG_SCHED_CORE
> +void sched_core_unsafe_enter(void);
> +void sched_core_unsafe_exit(void);
> +bool sched_core_wait_till_safe(unsigned long ti_check);
> +bool sched_core_kernel_protected(void);
> +#else
> +#define sched_core_unsafe_enter(ignore) do { } while (0)
> +#define sched_core_unsafe_exit(ignore) do { } while (0)
> +#define sched_core_wait_till_safe(ignore) do { } while (0)
> +#define sched_core_kernel_protected(ignore) do { } while (0)
> +#endif
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 0a1e20f8d4e8..a18ed60cedea 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>
> instrumentation_begin();
> trace_hardirqs_off_finish();
> + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> + sched_core_unsafe_enter();
> instrumentation_end();
> }
>
> @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal(struct pt_regs *regs) { }
>
> +unsigned long exit_to_user_get_work(void)
> +{
> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +
> + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> + || !_TIF_UNSAFE_RET)
> + return ti_work;
> +
> +#ifdef CONFIG_SCHED_CORE
> + ti_work &= EXIT_TO_USER_MODE_WORK;
> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> + sched_core_unsafe_exit();
> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> + sched_core_unsafe_enter(); /* not exiting to user yet. */
> + }
> + }
> +
> + return READ_ONCE(current_thread_info()->flags);
> +#endif
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * enabled above.
> */
> local_irq_disable_exit_to_user();
> - ti_work = READ_ONCE(current_thread_info()->flags);
> + ti_work = exit_to_user_get_work();
> }
>
> /* Return the latest work state for arch_exit_to_user_mode() */
> @@ -184,9 +207,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> static void exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> + unsigned long ti_work;
>
> lockdep_assert_irqs_disabled();
> + ti_work = exit_to_user_get_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e05728bdb18c..bd206708fac2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>
> #ifdef CONFIG_SCHED_CORE
>
> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> +static int __init set_sched_core_protect_kernel(char *str)
> +{
> + unsigned long val = 0;
> +
> + if (!str)
> + return 0;
> +
> + if (!kstrtoul(str, 0, &val) && !val)
> + static_branch_disable(&sched_core_protect_kernel);
> +
> + return 1;
> +}
> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> +
> +/* Is the kernel protected by core scheduling? */
> +bool sched_core_kernel_protected(void)
> +{
> + return static_branch_likely(&sched_core_protect_kernel);
> +}
> +
> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>
> /* kernel prio, less is more */
> @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> return a->core_cookie == b->core_cookie;
> }
>
> +/*
> + * Handler to attempt to enter kernel. It does nothing because the exit to
> + * usermode or guest mode will do the actual work (of waiting if needed).
> + */
> +static void sched_core_irq_work(struct irq_work *work)
> +{
> + return;
> +}
> +
> +static inline void init_sched_core_irq_work(struct rq *rq)
> +{
> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> +}
> +
> +/*
> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> + * exits the core-wide unsafe state. Obviously the CPU calling this function
> + * should not be responsible for the core being in the core-wide unsafe state
> + * otherwise it will deadlock.
> + *
> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> + * the loop if TIF flags are set and notify caller about it.
> + *
> + * IRQs should be disabled.
> + */
> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> + bool restart = false;
> + struct rq *rq;
> + int cpu;
> +
> + /* We clear the thread flag only at the end, so need to check for it. */
> + ti_check &= ~_TIF_UNSAFE_RET;
> +
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> + preempt_disable();
> + local_irq_enable();
> +
> + /*
> + * Wait till the core of this HT is not in an unsafe state.
> + *
> + * Pair with smp_store_release() in sched_core_unsafe_exit().
> + */
> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> + cpu_relax();
> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> + restart = true;
> + break;
> + }
> + }
> +
> + /* Upgrade it back to the expectations of entry code. */
> + local_irq_disable();
> + preempt_enable();
> +
> +ret:
> + if (!restart)
> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + return restart;
> +}
> +
> +/*
> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> + * context.
> + */
> +void sched_core_unsafe_enter(void)
> +{
> + const struct cpumask *smt_mask;
> + unsigned long flags;
> + struct rq *rq;
> + int i, cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + /* Ensure that on return to user/guest, we check whether to wait. */
> + if (current->core_cookie)
> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> + rq->core_this_unsafe_nest++;
> +
> + /* Should not nest: enter() should only pair with exit(). */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + smt_mask = cpu_smt_mask(cpu);
> +
> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> +
> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> + goto unlock;
> +
> + if (irq_work_is_busy(&rq->core_irq_work)) {
> + /*
> + * Do nothing more since we are in an IPI sent from another
> + * sibling to enforce safety. That sibling would have sent IPIs
> + * to all of the HTs.
> + */
> + goto unlock;
> + }
> +
> + /*
> + * If we are not the first ones on the core to enter core-wide unsafe
> + * state, do nothing.
> + */
> + if (rq->core->core_unsafe_nest > 1)
> + goto unlock;
> +
> + /* Do nothing more if the core is not tagged. */
> + if (!rq->core->core_cookie)
> + goto unlock;
> +
> + for_each_cpu(i, smt_mask) {
> + struct rq *srq = cpu_rq(i);
> +
> + if (i == cpu || cpu_is_offline(i))
> + continue;
> +
> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> + continue;
> +
> + /* Skip if HT is not running a tagged task. */
> + if (!srq->curr->core_cookie && !srq->core_pick)
> + continue;
> +
> + /*
> + * Force sibling into the kernel by IPI. If work was already
> + * pending, no new IPIs are sent. This is Ok since the receiver
> + * would already be in the kernel, or on its way to it.
> + */
> + irq_work_queue_on(&srq->core_irq_work, i);
> + }
> +unlock:
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> +/*
> + * Process any work need for either exiting the core-wide unsafe state, or for
> + * waiting on this hyperthread if the core is still in this state.
> + *
> + * @idle: Are we called from the idle loop?
> + */
> +void sched_core_unsafe_exit(void)
> +{
> + unsigned long flags;
> + unsigned int nest;
> + struct rq *rq;
> + int cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + /* Do nothing if core-sched disabled. */
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /*
> + * Can happen when a process is forked and the first return to user
> + * mode is a syscall exit. Either way, there's nothing to do.
> + */
> + if (rq->core_this_unsafe_nest == 0)
> + goto ret;
> +
> + rq->core_this_unsafe_nest--;
> +
> + /* enter() should be paired with exit() only. */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + /*
> + * Core-wide nesting counter can never be 0 because we are
> + * still in it on this CPU.
> + */
> + nest = rq->core->core_unsafe_nest;
> + WARN_ON_ONCE(!nest);
> +
> + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
> + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> // XXX fairness/fwd progress conditions
> /*
> * Returns
> @@ -5019,6 +5248,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
> rq = cpu_rq(i);
> if (rq->core && rq->core == rq)
> core_rq = rq;
> + init_sched_core_irq_work(rq);
> }
>
> if (!core_rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index f7e2d8a3be8e..4bcf3b1ddfb3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1059,12 +1059,15 @@ struct rq {
> unsigned int core_enabled;
> unsigned int core_sched_seq;
> struct rb_root core_tree;
> + struct irq_work core_irq_work; /* To force HT into kernel */
> + unsigned int core_this_unsafe_nest;
>
> /* shared state */
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> unsigned char core_forceidle;
> + unsigned int core_unsafe_nest;
> #endif
> };
>
>

2020-11-06 17:47:57

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Fri, Nov 06, 2020 at 05:57:21PM +0100, Alexandre Chartre wrote:
>
>
> On 11/3/20 2:20 AM, Joel Fernandes wrote:
> > Hi Alexandre,
> >
> > Sorry for late reply as I was working on the snapshotting patch...
> >
> > On Fri, Oct 30, 2020 at 11:29:26AM +0100, Alexandre Chartre wrote:
> > >
> > > On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
> > > > Core-scheduling prevents hyperthreads in usermode from attacking each
> > > > other, but it does not do anything about one of the hyperthreads
> > > > entering the kernel for any reason. This leaves the door open for MDS
> > > > and L1TF attacks with concurrent execution sequences between
> > > > hyperthreads.
> > > >
> > > > This patch therefore adds support for protecting all syscall and IRQ
> > > > kernel mode entries. Care is taken to track the outermost usermode exit
> > > > and entry using per-cpu counters. In cases where one of the hyperthreads
> > > > enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> > > > when not needed - example: idle and non-cookie HTs do not need to be
> > > > forced into kernel mode.
> > >
> > > Hi Joel,
> > >
> > > In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
> > > call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
> > > see such a call. Am I missing something?
> >
> > Yes, this is known bug and fixed in v9 which I'll post soon. Meanwhile
> > updated patch is appended below:
>
> [..]
>
> > > > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > > > index 0a1e20f8d4e8..c8dc6b1b1f40 100644
> > > > --- a/kernel/entry/common.c
> > > > +++ b/kernel/entry/common.c
> > > > @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
> > > > /* Workaround to allow gradual conversion of architecture code */
> > > > void __weak arch_do_signal(struct pt_regs *regs) { }
> > > > +unsigned long exit_to_user_get_work(void)
> > > > +{
> > > > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > > > +
> > > > + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > > > + return ti_work;
> > > > +
> > > > +#ifdef CONFIG_SCHED_CORE
> > > > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > > > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> > > > + sched_core_unsafe_exit();
> > > > + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> > >
> > > If we call sched_core_unsafe_exit() before sched_core_wait_till_safe() then we
> > > expose ourself during the entire wait period in sched_core_wait_till_safe(). It
> > > would be better to call sched_core_unsafe_exit() once we know for sure we are
> > > going to exit.
> >
> > The way the algorithm works right now, it requires the current task to get
> > out of the unsafe state while waiting otherwise it will lockup. Note that we
> > wait with interrupts enabled so new interrupts could come while waiting.
> >
> > TBH this code is very tricky to get right and it took long time to get it
> > working properly. For now I am content with the way it works. We can improve
> > further incrementally on it in the future.
> >
>
> I am concerned this leaves a lot of windows opened (even with the updated patch)
> where the system remains exposed. There are 3 obvious windows:
>
> - after switching to the kernel page-table and until enter_from_user_mode() is called
> - while waiting for other cpus
> - after leaving exit_to_user_mode_loop() and until switching back to the user page-table
>
> Also on syscall/interrupt entry, sched_core_unsafe_enter() is called (in the
> updated patch) and this sends an IPI to other CPUs but it doesn't wait for
> other CPUs to effectively switch to the kernel page-table. It also seems like
> the case where the CPU is interrupted by a NMI is not handled.

TBH, we discussed on the list before that there may not be much value in closing
the above-mentioned windows. We already knew there are a few windows open
like that. Thomas Gleixner told us that the solution does not need to be 100%
as long as it closes most windows and is performant -- the important thing
being to disrupt the attacker rather than make it 100% attack-proof, while
keeping the code simple.

> I know the code is tricky, and I am working on something similar for ASI (ASI
> lockdown) where I am addressing all these cases (this seems to work).

Ok.

> > Let me know if I may add your Reviewed-by tag for this patch, if there are no
> > other comments, and I appreciate it. Appended the updated patch below.
> >
>
> I haven't effectively reviewed the code yet. Also I wonder if the work I am doing
> with ASI for synchronizing sibling cpus (i.e. the ASI lockdown) and the integration
> with PTI could provide what you need. Basically each process has an ASI and the
> ASI lockdown ensure that sibling cpus are also running with a trusted ASI. If
> the process/ASI is interrupted (e.g. on interrupt/exception/NMI) then it forces
> sibling cpus to also interrupt ASI. The sibling cpus synchronization occurs when
> switching the page-tables (between user and kernel) so there's no exposure window.

Maybe. But are you doing everything this patch does when you enter ASI lockdown?
Basically, we want to only send IPIs if needed, and we track the "core wide"
entry into the kernel to make sure we do the slightly higher overhead things
once per "core wide" entry and exit. For some pictures, check these slides
from slide 15:
https://docs.google.com/presentation/d/1VzeQo3AyGTN35DJ3LKoPWBfiZHZJiF8q0NrX9eVYG70/edit#slide=id.g91cff3980b_0_7
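
A minimal sketch of that two-level tracking, condensed from the appended patch
(the IPI helper name here is illustrative, not a real function; the real code
also skips idle and untagged siblings):

    rq->core_this_unsafe_nest++;                  /* outermost entry on this CPU   */
    if (rq->core_this_unsafe_nest == 1) {         /* first (outermost) entry here  */
            rq->core->core_unsafe_nest++;         /* contribute to core-wide count */
            if (rq->core->core_unsafe_nest == 1)  /* first HT of the core in kernel */
                    ipi_untrusted_siblings();     /* IPIs only on this transition  */
    }

The exit path mirrors this: the per-CPU count drops first, and the core-wide
count is released last so waiting siblings can stop spinning.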

As for switching page tables, I am not sure that is all that's needed to
close the windows you mentioned, because MDS attacks are independent of page-table
entries; they leak the uarch buffers. So you really have to isolate
things in the time domain as this patch does.

We discussed with Thomas and Dario in previous list emails that maybe this
patch can be used as a subset of the ASI work, as a "utility" to implement the
stunning.

> Let me have a closer look.

Sure, thanks.

- Joel

> alex.
>
>
> > ---8<-----------------------
> >
> > From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
> > From: "Joel Fernandes (Google)" <[email protected]>
> > Date: Mon, 27 Jul 2020 17:56:14 -0400
> > Subject: [PATCH] kernel/entry: Add support for core-wide protection of
> > kernel-mode
> >
> > Core-scheduling prevents hyperthreads in usermode from attacking each
> > other, but it does not do anything about one of the hyperthreads
> > entering the kernel for any reason. This leaves the door open for MDS
> > and L1TF attacks with concurrent execution sequences between
> > hyperthreads.
> >
> > This patch therefore adds support for protecting all syscall and IRQ
> > kernel mode entries. Care is taken to track the outermost usermode exit
> > and entry using per-cpu counters. In cases where one of the hyperthreads
> > enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> > when not needed - example: idle and non-cookie HTs do not need to be
> > forced into kernel mode.
> >
> > More information about attacks:
> > For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> > data to either host or guest attackers. For L1TF, it is possible to leak
> > to guest attackers. There is no possible mitigation involving flushing
> > of buffers to avoid this since the execution of attacker and victims
> > happen concurrently on 2 or more HTs.
> >
> > Cc: Julien Desfossez <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Aaron Lu <[email protected]>
> > Cc: Aubrey Li <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Paul E. McKenney <[email protected]>
> > Co-developed-by: Vineeth Pillai <[email protected]>
> > Tested-by: Julien Desfossez <[email protected]>
> > Signed-off-by: Vineeth Pillai <[email protected]>
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > ---
> > .../admin-guide/kernel-parameters.txt | 9 +
> > include/linux/entry-common.h | 6 +-
> > include/linux/sched.h | 12 +
> > kernel/entry/common.c | 28 ++-
> > kernel/sched/core.c | 230 ++++++++++++++++++
> > kernel/sched/sched.h | 3 +
> > 6 files changed, 285 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 3236427e2215..a338d5d64c3d 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -4678,6 +4678,15 @@
> > sbni= [NET] Granch SBNI12 leased line adapter
> > + sched_core_protect_kernel=
> > + [SCHED_CORE] Pause SMT siblings of a core running in
> > + user mode, if at least one of the siblings of the core
> > + is running in kernel mode. This is to guarantee that
> > + kernel data is not leaked to tasks which are not trusted
> > + by the kernel. A value of 0 disables protection, 1
> > + enables protection. The default is 1. Note that protection
> > + depends on the arch defining the _TIF_UNSAFE_RET flag.
> > +
> > sched_debug [KNL] Enables verbose scheduler debug messages.
> > schedstats= [KNL,X86] Enable or disable scheduled statistics.
> > diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> > index 474f29638d2c..62278c5b3b5f 100644
> > --- a/include/linux/entry-common.h
> > +++ b/include/linux/entry-common.h
> > @@ -33,6 +33,10 @@
> > # define _TIF_PATCH_PENDING (0)
> > #endif
> > +#ifndef _TIF_UNSAFE_RET
> > +# define _TIF_UNSAFE_RET (0)
> > +#endif
> > +
> > #ifndef _TIF_UPROBE
> > # define _TIF_UPROBE (0)
> > #endif
> > @@ -69,7 +73,7 @@
> > #define EXIT_TO_USER_MODE_WORK \
> > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> > - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> > + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> > ARCH_EXIT_TO_USER_MODE_WORK)
> > /**
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d38e904dd603..fe6f225bfbf9 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
> > const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
> > +#ifdef CONFIG_SCHED_CORE
> > +void sched_core_unsafe_enter(void);
> > +void sched_core_unsafe_exit(void);
> > +bool sched_core_wait_till_safe(unsigned long ti_check);
> > +bool sched_core_kernel_protected(void);
> > +#else
> > +#define sched_core_unsafe_enter(ignore) do { } while (0)
> > +#define sched_core_unsafe_exit(ignore) do { } while (0)
> > +#define sched_core_wait_till_safe(ignore) do { } while (0)
> > +#define sched_core_kernel_protected(ignore) do { } while (0)
> > +#endif
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 0a1e20f8d4e8..a18ed60cedea 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
> > instrumentation_begin();
> > trace_hardirqs_off_finish();
> > + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> > + sched_core_unsafe_enter();
> > instrumentation_end();
> > }
> > @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> > /* Workaround to allow gradual conversion of architecture code */
> > void __weak arch_do_signal(struct pt_regs *regs) { }
> > +unsigned long exit_to_user_get_work(void)
> > +{
> > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +
> > + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > + || !_TIF_UNSAFE_RET)
> > + return ti_work;
> > +
> > +#ifdef CONFIG_SCHED_CORE
> > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> > + sched_core_unsafe_exit();
> > + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> > + sched_core_unsafe_enter(); /* not exiting to user yet. */
> > + }
> > + }
> > +
> > + return READ_ONCE(current_thread_info()->flags);
> > +#endif
> > +}
> > +
> > static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > unsigned long ti_work)
> > {
> > @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > * enabled above.
> > */
> > local_irq_disable_exit_to_user();
> > - ti_work = READ_ONCE(current_thread_info()->flags);
> > + ti_work = exit_to_user_get_work();
> > }
> > /* Return the latest work state for arch_exit_to_user_mode() */
> > @@ -184,9 +207,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > static void exit_to_user_mode_prepare(struct pt_regs *regs)
> > {
> > - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > + unsigned long ti_work;
> > lockdep_assert_irqs_disabled();
> > + ti_work = exit_to_user_get_work();
> > if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> > ti_work = exit_to_user_mode_loop(regs, ti_work);
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index e05728bdb18c..bd206708fac2 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
> > #ifdef CONFIG_SCHED_CORE
> > +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> > +static int __init set_sched_core_protect_kernel(char *str)
> > +{
> > + unsigned long val = 0;
> > +
> > + if (!str)
> > + return 0;
> > +
> > + if (!kstrtoul(str, 0, &val) && !val)
> > + static_branch_disable(&sched_core_protect_kernel);
> > +
> > + return 1;
> > +}
> > +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> > +
> > +/* Is the kernel protected by core scheduling? */
> > +bool sched_core_kernel_protected(void)
> > +{
> > + return static_branch_likely(&sched_core_protect_kernel);
> > +}
> > +
> > DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
> > /* kernel prio, less is more */
> > @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> > return a->core_cookie == b->core_cookie;
> > }
> > +/*
> > + * Handler to attempt to enter kernel. It does nothing because the exit to
> > + * usermode or guest mode will do the actual work (of waiting if needed).
> > + */
> > +static void sched_core_irq_work(struct irq_work *work)
> > +{
> > + return;
> > +}
> > +
> > +static inline void init_sched_core_irq_work(struct rq *rq)
> > +{
> > + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> > +}
> > +
> > +/*
> > + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> > + * exits the core-wide unsafe state. Obviously the CPU calling this function
> > + * should not be responsible for the core being in the core-wide unsafe state
> > + * otherwise it will deadlock.
> > + *
> > + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> > + * the loop if TIF flags are set and notify caller about it.
> > + *
> > + * IRQs should be disabled.
> > + */
> > +bool sched_core_wait_till_safe(unsigned long ti_check)
> > +{
> > + bool restart = false;
> > + struct rq *rq;
> > + int cpu;
> > +
> > + /* We clear the thread flag only at the end, so need to check for it. */
> > + ti_check &= ~_TIF_UNSAFE_RET;
> > +
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > +
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> > + preempt_disable();
> > + local_irq_enable();
> > +
> > + /*
> > + * Wait till the core of this HT is not in an unsafe state.
> > + *
> > + * Pair with smp_store_release() in sched_core_unsafe_exit().
> > + */
> > + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> > + cpu_relax();
> > + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> > + restart = true;
> > + break;
> > + }
> > + }
> > +
> > + /* Upgrade it back to the expectations of entry code. */
> > + local_irq_disable();
> > + preempt_enable();
> > +
> > +ret:
> > + if (!restart)
> > + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> > +
> > + return restart;
> > +}
> > +
> > +/*
> > + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> > + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> > + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> > + * context.
> > + */
> > +void sched_core_unsafe_enter(void)
> > +{
> > + const struct cpumask *smt_mask;
> > + unsigned long flags;
> > + struct rq *rq;
> > + int i, cpu;
> > +
> > + if (!static_branch_likely(&sched_core_protect_kernel))
> > + return;
> > +
> > + /* Ensure that on return to user/guest, we check whether to wait. */
> > + if (current->core_cookie)
> > + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> > +
> > + local_irq_save(flags);
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> > + rq->core_this_unsafe_nest++;
> > +
> > + /* Should not nest: enter() should only pair with exit(). */
> > + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> > + goto ret;
> > +
> > + raw_spin_lock(rq_lockp(rq));
> > + smt_mask = cpu_smt_mask(cpu);
> > +
> > + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> > + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> > +
> > + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> > + goto unlock;
> > +
> > + if (irq_work_is_busy(&rq->core_irq_work)) {
> > + /*
> > + * Do nothing more since we are in an IPI sent from another
> > + * sibling to enforce safety. That sibling would have sent IPIs
> > + * to all of the HTs.
> > + */
> > + goto unlock;
> > + }
> > +
> > + /*
> > + * If we are not the first ones on the core to enter core-wide unsafe
> > + * state, do nothing.
> > + */
> > + if (rq->core->core_unsafe_nest > 1)
> > + goto unlock;
> > +
> > + /* Do nothing more if the core is not tagged. */
> > + if (!rq->core->core_cookie)
> > + goto unlock;
> > +
> > + for_each_cpu(i, smt_mask) {
> > + struct rq *srq = cpu_rq(i);
> > +
> > + if (i == cpu || cpu_is_offline(i))
> > + continue;
> > +
> > + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> > + continue;
> > +
> > + /* Skip if HT is not running a tagged task. */
> > + if (!srq->curr->core_cookie && !srq->core_pick)
> > + continue;
> > +
> > + /*
> > + * Force sibling into the kernel by IPI. If work was already
> > + * pending, no new IPIs are sent. This is Ok since the receiver
> > + * would already be in the kernel, or on its way to it.
> > + */
> > + irq_work_queue_on(&srq->core_irq_work, i);
> > + }
> > +unlock:
> > + raw_spin_unlock(rq_lockp(rq));
> > +ret:
> > + local_irq_restore(flags);
> > +}
> > +
> > +/*
> > + * Process any work need for either exiting the core-wide unsafe state, or for
> > + * waiting on this hyperthread if the core is still in this state.
> > + *
> > + * @idle: Are we called from the idle loop?
> > + */
> > +void sched_core_unsafe_exit(void)
> > +{
> > + unsigned long flags;
> > + unsigned int nest;
> > + struct rq *rq;
> > + int cpu;
> > +
> > + if (!static_branch_likely(&sched_core_protect_kernel))
> > + return;
> > +
> > + local_irq_save(flags);
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > +
> > + /* Do nothing if core-sched disabled. */
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /*
> > + * Can happen when a process is forked and the first return to user
> > + * mode is a syscall exit. Either way, there's nothing to do.
> > + */
> > + if (rq->core_this_unsafe_nest == 0)
> > + goto ret;
> > +
> > + rq->core_this_unsafe_nest--;
> > +
> > + /* enter() should be paired with exit() only. */
> > + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> > + goto ret;
> > +
> > + raw_spin_lock(rq_lockp(rq));
> > + /*
> > + * Core-wide nesting counter can never be 0 because we are
> > + * still in it on this CPU.
> > + */
> > + nest = rq->core->core_unsafe_nest;
> > + WARN_ON_ONCE(!nest);
> > +
> > + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
> > + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
> > + raw_spin_unlock(rq_lockp(rq));
> > +ret:
> > + local_irq_restore(flags);
> > +}
> > +
> > // XXX fairness/fwd progress conditions
> > /*
> > * Returns
> > @@ -5019,6 +5248,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
> > rq = cpu_rq(i);
> > if (rq->core && rq->core == rq)
> > core_rq = rq;
> > + init_sched_core_irq_work(rq);
> > }
> > if (!core_rq)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index f7e2d8a3be8e..4bcf3b1ddfb3 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -1059,12 +1059,15 @@ struct rq {
> > unsigned int core_enabled;
> > unsigned int core_sched_seq;
> > struct rb_root core_tree;
> > + struct irq_work core_irq_work; /* To force HT into kernel */
> > + unsigned int core_this_unsafe_nest;
> > /* shared state */
> > unsigned int core_task_seq;
> > unsigned int core_pick_seq;
> > unsigned long core_cookie;
> > unsigned char core_forceidle;
> > + unsigned int core_unsafe_nest;
> > #endif
> > };
> >

2020-11-06 18:12:58

by Alexandre Chartre

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode


On 11/6/20 6:43 PM, Joel Fernandes wrote:
> On Fri, Nov 06, 2020 at 05:57:21PM +0100, Alexandre Chartre wrote:
>>
>>
>> On 11/3/20 2:20 AM, Joel Fernandes wrote:
>>> Hi Alexandre,
>>>
>>> Sorry for late reply as I was working on the snapshotting patch...
>>>
>>> On Fri, Oct 30, 2020 at 11:29:26AM +0100, Alexandre Chartre wrote:
>>>>
>>>> On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
>>>>> Core-scheduling prevents hyperthreads in usermode from attacking each
>>>>> other, but it does not do anything about one of the hyperthreads
>>>>> entering the kernel for any reason. This leaves the door open for MDS
>>>>> and L1TF attacks with concurrent execution sequences between
>>>>> hyperthreads.
>>>>>
>>>>> This patch therefore adds support for protecting all syscall and IRQ
>>>>> kernel mode entries. Care is taken to track the outermost usermode exit
>>>>> and entry using per-cpu counters. In cases where one of the hyperthreads
>>>>> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
>>>>> when not needed - example: idle and non-cookie HTs do not need to be
>>>>> forced into kernel mode.
>>>>
>>>> Hi Joel,
>>>>
>>>> In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
>>>> call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
>>>> see such a call. Am I missing something?
>>>
>>> Yes, this is known bug and fixed in v9 which I'll post soon. Meanwhile
>>> updated patch is appended below:
>>
>> [..]
>>
>>>>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>>>>> index 0a1e20f8d4e8..c8dc6b1b1f40 100644
>>>>> --- a/kernel/entry/common.c
>>>>> +++ b/kernel/entry/common.c
>>>>> @@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
>>>>> /* Workaround to allow gradual conversion of architecture code */
>>>>> void __weak arch_do_signal(struct pt_regs *regs) { }
>>>>> +unsigned long exit_to_user_get_work(void)
>>>>> +{
>>>>> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
>>>>> +
>>>>> + if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
>>>>> + return ti_work;
>>>>> +
>>>>> +#ifdef CONFIG_SCHED_CORE
>>>>> + ti_work &= EXIT_TO_USER_MODE_WORK;
>>>>> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
>>>>> + sched_core_unsafe_exit();
>>>>> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
>>>>
>>>> If we call sched_core_unsafe_exit() before sched_core_wait_till_safe() then we
>>>> expose ourself during the entire wait period in sched_core_wait_till_safe(). It
>>>> would be better to call sched_core_unsafe_exit() once we know for sure we are
>>>> going to exit.
>>>
>>> The way the algorithm works right now, it requires the current task to get
>>> out of the unsafe state while waiting otherwise it will lockup. Note that we
>>> wait with interrupts enabled so new interrupts could come while waiting.
>>>
>>> TBH this code is very tricky to get right and it took long time to get it
>>> working properly. For now I am content with the way it works. We can improve
>>> further incrementally on it in the future.
>>>
>>
>> I am concerned this leaves a lot of windows opened (even with the updated patch)
>> where the system remains exposed. There are 3 obvious windows:
>>
>> - after switching to the kernel page-table and until enter_from_user_mode() is called
>> - while waiting for other cpus
>> - after leaving exit_to_user_mode_loop() and until switching back to the user page-table
>>
>> Also on syscall/interrupt entry, sched_core_unsafe_enter() is called (in the
>> updated patch) and this sends an IPI to other CPUs but it doesn't wait for
>> other CPUs to effectively switch to the kernel page-table. It also seems like
>> the case where the CPU is interrupted by a NMI is not handled.
>
> TBH, we discussed on list before that there may not be much value in closing
> the above mentioned windows. We already knew there are a few windows open
> like that. Thomas Glexiner told us that the solution does not need to be 100%
> as long as it closes most windows and is performant -- the important thing
> being to disrupt the attacker than making it 100% attack proof. And keeping
> the code simple.

Ok. I will need to check if the additional complexity I have is effectively
worth it, and that's certainly something we can improve over time if needed.


>> I know the code is tricky, and I am working on something similar for ASI (ASI
>> lockdown) where I am addressing all these cases (this seems to work).
>
> Ok.
>
>>> Let me know if I may add your Reviewed-by tag for this patch, if there are no
>>> other comments, and I appreciate it. Appended the updated patch below.
>>>
>>
>> I haven't effectively reviewed the code yet. Also I wonder if the work I am doing
>> with ASI for synchronizing sibling cpus (i.e. the ASI lockdown) and the integration
>> with PTI could provide what you need. Basically each process has an ASI and the
>> ASI lockdown ensure that sibling cpus are also running with a trusted ASI. If
>> the process/ASI is interrupted (e.g. on interrupt/exception/NMI) then it forces
>> sibling cpus to also interrupt ASI. The sibling cpus synchronization occurs when
>> switching the page-tables (between user and kernel) so there's no exposure window.
>
> Maybe. But are you doing everything this patch does, when you enter ASI lockdown?
> Basically, we want to only send IPIs if needed and we track the "core wide"
> entry into the kernel to make sure we do the slightly higher overhead things
> once per "core wide" entry and exit. For some pictures, check these slides
> from slide 15:
> https://docs.google.com/presentation/d/1VzeQo3AyGTN35DJ3LKoPWBfiZHZJiF8q0NrX9eVYG70/edit#slide=id.g91cff3980b_0_7

Yes, I think this looks similar to what ASI lockdown is doing.


> As for switching page tables, I am not sure if that is all that's needed to
> close the windows you mentioned, because MDS attacks are independent of page
> table entries, they leak the uarch buffers. So you really have to isolate
> things in the time domain as this patch does.
>
> We discussed with Thomas and Dario in previous list emails that maybe this
> patch can be used as a subset of the ASI work, as a "utility" to implement
> the stunning.
>

Ok, I will look at how this can fit together.

alex.

>> Let me have a closer look.
>
> Sure, thanks.
>
> - Joel
>
>> alex.
>>
>>
>>> ---8<-----------------------
>>>
>>> From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
>>> From: "Joel Fernandes (Google)" <[email protected]>
>>> Date: Mon, 27 Jul 2020 17:56:14 -0400
>>> Subject: [PATCH] kernel/entry: Add support for core-wide protection of
>>> kernel-mode
>>>
>>> Core-scheduling prevents hyperthreads in usermode from attacking each
>>> other, but it does not do anything about one of the hyperthreads
>>> entering the kernel for any reason. This leaves the door open for MDS
>>> and L1TF attacks with concurrent execution sequences between
>>> hyperthreads.
>>>
>>> This patch therefore adds support for protecting all syscall and IRQ
>>> kernel mode entries. Care is taken to track the outermost usermode exit
>>> and entry using per-cpu counters. In cases where one of the hyperthreads
>>> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
>>> when not needed - example: idle and non-cookie HTs do not need to be
>>> forced into kernel mode.
>>>
>>> More information about attacks:
>>> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
>>> data to either host or guest attackers. For L1TF, it is possible to leak
>>> to guest attackers. There is no possible mitigation involving flushing
>>> of buffers to avoid this since the execution of attacker and victims
>>> happen concurrently on 2 or more HTs.
>>>
>>> Cc: Julien Desfossez <[email protected]>
>>> Cc: Tim Chen <[email protected]>
>>> Cc: Aaron Lu <[email protected]>
>>> Cc: Aubrey Li <[email protected]>
>>> Cc: Tim Chen <[email protected]>
>>> Cc: Paul E. McKenney <[email protected]>
>>> Co-developed-by: Vineeth Pillai <[email protected]>
>>> Tested-by: Julien Desfossez <[email protected]>
>>> Signed-off-by: Vineeth Pillai <[email protected]>
>>> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>>> ---
>>> .../admin-guide/kernel-parameters.txt | 9 +
>>> include/linux/entry-common.h | 6 +-
>>> include/linux/sched.h | 12 +
>>> kernel/entry/common.c | 28 ++-
>>> kernel/sched/core.c | 230 ++++++++++++++++++
>>> kernel/sched/sched.h | 3 +
>>> 6 files changed, 285 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>>> index 3236427e2215..a338d5d64c3d 100644
>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>> @@ -4678,6 +4678,15 @@
>>> sbni= [NET] Granch SBNI12 leased line adapter
>>> + sched_core_protect_kernel=
>>> + [SCHED_CORE] Pause SMT siblings of a core running in
>>> + user mode, if at least one of the siblings of the core
>>> + is running in kernel mode. This is to guarantee that
>>> + kernel data is not leaked to tasks which are not trusted
>>> + by the kernel. A value of 0 disables protection, 1
>>> + enables protection. The default is 1. Note that protection
>>> + depends on the arch defining the _TIF_UNSAFE_RET flag.
>>> +
>>> sched_debug [KNL] Enables verbose scheduler debug messages.
>>> schedstats= [KNL,X86] Enable or disable scheduled statistics.
>>> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
>>> index 474f29638d2c..62278c5b3b5f 100644
>>> --- a/include/linux/entry-common.h
>>> +++ b/include/linux/entry-common.h
>>> @@ -33,6 +33,10 @@
>>> # define _TIF_PATCH_PENDING (0)
>>> #endif
>>> +#ifndef _TIF_UNSAFE_RET
>>> +# define _TIF_UNSAFE_RET (0)
>>> +#endif
>>> +
>>> #ifndef _TIF_UPROBE
>>> # define _TIF_UPROBE (0)
>>> #endif
>>> @@ -69,7 +73,7 @@
>>> #define EXIT_TO_USER_MODE_WORK \
>>> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
>>> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
>>> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
>>> ARCH_EXIT_TO_USER_MODE_WORK)
>>> /**
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d38e904dd603..fe6f225bfbf9 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>>> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>>> +#ifdef CONFIG_SCHED_CORE
>>> +void sched_core_unsafe_enter(void);
>>> +void sched_core_unsafe_exit(void);
>>> +bool sched_core_wait_till_safe(unsigned long ti_check);
>>> +bool sched_core_kernel_protected(void);
>>> +#else
>>> +#define sched_core_unsafe_enter(ignore) do { } while (0)
>>> +#define sched_core_unsafe_exit(ignore) do { } while (0)
>>> +#define sched_core_wait_till_safe(ignore) do { } while (0)
>>> +#define sched_core_kernel_protected(ignore) do { } while (0)
>>> +#endif
>>> +
>>> #endif
>>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>>> index 0a1e20f8d4e8..a18ed60cedea 100644
>>> --- a/kernel/entry/common.c
>>> +++ b/kernel/entry/common.c
>>> @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>>> instrumentation_begin();
>>> trace_hardirqs_off_finish();
>>> + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
>>> + sched_core_unsafe_enter();
>>> instrumentation_end();
>>> }
>>> @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
>>> /* Workaround to allow gradual conversion of architecture code */
>>> void __weak arch_do_signal(struct pt_regs *regs) { }
>>> +unsigned long exit_to_user_get_work(void)
>>> +{
>>> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
>>> +
>>> + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
>>> + || !_TIF_UNSAFE_RET)
>>> + return ti_work;
>>> +
>>> +#ifdef CONFIG_SCHED_CORE
>>> + ti_work &= EXIT_TO_USER_MODE_WORK;
>>> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
>>> + sched_core_unsafe_exit();
>>> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
>>> + sched_core_unsafe_enter(); /* not exiting to user yet. */
>>> + }
>>> + }
>>> +
>>> + return READ_ONCE(current_thread_info()->flags);
>>> +#endif
>>> +}
>>> +
>>> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>> unsigned long ti_work)
>>> {
>>> @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>> * enabled above.
>>> */
>>> local_irq_disable_exit_to_user();
>>> - ti_work = READ_ONCE(current_thread_info()->flags);
>>> + ti_work = exit_to_user_get_work();
>>> }
>>> /* Return the latest work state for arch_exit_to_user_mode() */
>>> @@ -184,9 +207,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>> static void exit_to_user_mode_prepare(struct pt_regs *regs)
>>> {
>>> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
>>> + unsigned long ti_work;
>>> lockdep_assert_irqs_disabled();
>>> + ti_work = exit_to_user_get_work();
>>> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
>>> ti_work = exit_to_user_mode_loop(regs, ti_work);
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index e05728bdb18c..bd206708fac2 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>>> #ifdef CONFIG_SCHED_CORE
>>> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
>>> +static int __init set_sched_core_protect_kernel(char *str)
>>> +{
>>> + unsigned long val = 0;
>>> +
>>> + if (!str)
>>> + return 0;
>>> +
>>> + if (!kstrtoul(str, 0, &val) && !val)
>>> + static_branch_disable(&sched_core_protect_kernel);
>>> +
>>> + return 1;
>>> +}
>>> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
>>> +
>>> +/* Is the kernel protected by core scheduling? */
>>> +bool sched_core_kernel_protected(void)
>>> +{
>>> + return static_branch_likely(&sched_core_protect_kernel);
>>> +}
>>> +
>>> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>>> /* kernel prio, less is more */
>>> @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
>>> return a->core_cookie == b->core_cookie;
>>> }
>>> +/*
>>> + * Handler to attempt to enter kernel. It does nothing because the exit to
>>> + * usermode or guest mode will do the actual work (of waiting if needed).
>>> + */
>>> +static void sched_core_irq_work(struct irq_work *work)
>>> +{
>>> + return;
>>> +}
>>> +
>>> +static inline void init_sched_core_irq_work(struct rq *rq)
>>> +{
>>> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
>>> +}
>>> +
>>> +/*
>>> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
>>> + * exits the core-wide unsafe state. Obviously the CPU calling this function
>>> + * should not be responsible for the core being in the core-wide unsafe state
>>> + * otherwise it will deadlock.
>>> + *
>>> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
>>> + * the loop if TIF flags are set and notify caller about it.
>>> + *
>>> + * IRQs should be disabled.
>>> + */
>>> +bool sched_core_wait_till_safe(unsigned long ti_check)
>>> +{
>>> + bool restart = false;
>>> + struct rq *rq;
>>> + int cpu;
>>> +
>>> + /* We clear the thread flag only at the end, so need to check for it. */
>>> + ti_check &= ~_TIF_UNSAFE_RET;
>>> +
>>> + cpu = smp_processor_id();
>>> + rq = cpu_rq(cpu);
>>> +
>>> + if (!sched_core_enabled(rq))
>>> + goto ret;
>>> +
>>> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
>>> + preempt_disable();
>>> + local_irq_enable();
>>> +
>>> + /*
>>> + * Wait till the core of this HT is not in an unsafe state.
>>> + *
>>> + * Pair with smp_store_release() in sched_core_unsafe_exit().
>>> + */
>>> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
>>> + cpu_relax();
>>> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
>>> + restart = true;
>>> + break;
>>> + }
>>> + }
>>> +
>>> + /* Upgrade it back to the expectations of entry code. */
>>> + local_irq_disable();
>>> + preempt_enable();
>>> +
>>> +ret:
>>> + if (!restart)
>>> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
>>> +
>>> + return restart;
>>> +}
>>> +
>>> +/*
>>> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
>>> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
>>> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
>>> + * context.
>>> + */
>>> +void sched_core_unsafe_enter(void)
>>> +{
>>> + const struct cpumask *smt_mask;
>>> + unsigned long flags;
>>> + struct rq *rq;
>>> + int i, cpu;
>>> +
>>> + if (!static_branch_likely(&sched_core_protect_kernel))
>>> + return;
>>> +
>>> + /* Ensure that on return to user/guest, we check whether to wait. */
>>> + if (current->core_cookie)
>>> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
>>> +
>>> + local_irq_save(flags);
>>> + cpu = smp_processor_id();
>>> + rq = cpu_rq(cpu);
>>> + if (!sched_core_enabled(rq))
>>> + goto ret;
>>> +
>>> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
>>> + rq->core_this_unsafe_nest++;
>>> +
>>> + /* Should not nest: enter() should only pair with exit(). */
>>> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
>>> + goto ret;
>>> +
>>> + raw_spin_lock(rq_lockp(rq));
>>> + smt_mask = cpu_smt_mask(cpu);
>>> +
>>> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
>>> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
>>> +
>>> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
>>> + goto unlock;
>>> +
>>> + if (irq_work_is_busy(&rq->core_irq_work)) {
>>> + /*
>>> + * Do nothing more since we are in an IPI sent from another
>>> + * sibling to enforce safety. That sibling would have sent IPIs
>>> + * to all of the HTs.
>>> + */
>>> + goto unlock;
>>> + }
>>> +
>>> + /*
>>> + * If we are not the first ones on the core to enter core-wide unsafe
>>> + * state, do nothing.
>>> + */
>>> + if (rq->core->core_unsafe_nest > 1)
>>> + goto unlock;
>>> +
>>> + /* Do nothing more if the core is not tagged. */
>>> + if (!rq->core->core_cookie)
>>> + goto unlock;
>>> +
>>> + for_each_cpu(i, smt_mask) {
>>> + struct rq *srq = cpu_rq(i);
>>> +
>>> + if (i == cpu || cpu_is_offline(i))
>>> + continue;
>>> +
>>> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
>>> + continue;
>>> +
>>> + /* Skip if HT is not running a tagged task. */
>>> + if (!srq->curr->core_cookie && !srq->core_pick)
>>> + continue;
>>> +
>>> + /*
>>> + * Force sibling into the kernel by IPI. If work was already
>>> + * pending, no new IPIs are sent. This is Ok since the receiver
>>> + * would already be in the kernel, or on its way to it.
>>> + */
>>> + irq_work_queue_on(&srq->core_irq_work, i);
>>> + }
>>> +unlock:
>>> + raw_spin_unlock(rq_lockp(rq));
>>> +ret:
>>> + local_irq_restore(flags);
>>> +}
>>> +
>>> +/*
>>> + * Process any work need for either exiting the core-wide unsafe state, or for
>>> + * waiting on this hyperthread if the core is still in this state.
>>> + *
>>> + * @idle: Are we called from the idle loop?
>>> + */
>>> +void sched_core_unsafe_exit(void)
>>> +{
>>> + unsigned long flags;
>>> + unsigned int nest;
>>> + struct rq *rq;
>>> + int cpu;
>>> +
>>> + if (!static_branch_likely(&sched_core_protect_kernel))
>>> + return;
>>> +
>>> + local_irq_save(flags);
>>> + cpu = smp_processor_id();
>>> + rq = cpu_rq(cpu);
>>> +
>>> + /* Do nothing if core-sched disabled. */
>>> + if (!sched_core_enabled(rq))
>>> + goto ret;
>>> +
>>> + /*
>>> + * Can happen when a process is forked and the first return to user
>>> + * mode is a syscall exit. Either way, there's nothing to do.
>>> + */
>>> + if (rq->core_this_unsafe_nest == 0)
>>> + goto ret;
>>> +
>>> + rq->core_this_unsafe_nest--;
>>> +
>>> + /* enter() should be paired with exit() only. */
>>> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
>>> + goto ret;
>>> +
>>> + raw_spin_lock(rq_lockp(rq));
>>> + /*
>>> + * Core-wide nesting counter can never be 0 because we are
>>> + * still in it on this CPU.
>>> + */
>>> + nest = rq->core->core_unsafe_nest;
>>> + WARN_ON_ONCE(!nest);
>>> +
>>> + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
>>> + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
>>> + raw_spin_unlock(rq_lockp(rq));
>>> +ret:
>>> + local_irq_restore(flags);
>>> +}
>>> +
>>> // XXX fairness/fwd progress conditions
>>> /*
>>> * Returns
>>> @@ -5019,6 +5248,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
>>> rq = cpu_rq(i);
>>> if (rq->core && rq->core == rq)
>>> core_rq = rq;
>>> + init_sched_core_irq_work(rq);
>>> }
>>> if (!core_rq)
>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index f7e2d8a3be8e..4bcf3b1ddfb3 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -1059,12 +1059,15 @@ struct rq {
>>> unsigned int core_enabled;
>>> unsigned int core_sched_seq;
>>> struct rb_root core_tree;
>>> + struct irq_work core_irq_work; /* To force HT into kernel */
>>> + unsigned int core_this_unsafe_nest;
>>> /* shared state */
>>> unsigned int core_task_seq;
>>> unsigned int core_pick_seq;
>>> unsigned long core_cookie;
>>> unsigned char core_forceidle;
>>> + unsigned int core_unsafe_nest;
>>> #endif
>>> };
>>>

2020-11-10 09:38:16

by Alexandre Chartre

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode


On 11/3/20 2:20 AM, Joel Fernandes wrote:
> Hi Alexandre,
>
> Sorry for the late reply, I was working on the snapshotting patch...
>
> On Fri, Oct 30, 2020 at 11:29:26AM +0100, Alexandre Chartre wrote:
>>
>> On 10/20/20 3:43 AM, Joel Fernandes (Google) wrote:
>>> Core-scheduling prevents hyperthreads in usermode from attacking each
>>> other, but it does not do anything about one of the hyperthreads
>>> entering the kernel for any reason. This leaves the door open for MDS
>>> and L1TF attacks with concurrent execution sequences between
>>> hyperthreads.
>>>
>>> This patch therefore adds support for protecting all syscall and IRQ
>>> kernel mode entries. Care is taken to track the outermost usermode exit
>>> and entry using per-cpu counters. In cases where one of the hyperthreads
>>> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
>>> when not needed - example: idle and non-cookie HTs do not need to be
>>> forced into kernel mode.
>>
>> Hi Joel,
>>
>> In order to protect syscall/IRQ kernel mode entries, shouldn't we have a
>> call to sched_core_unsafe_enter() in the syscall/IRQ entry code? I don't
>> see such a call. Am I missing something?
>
> Yes, this is a known bug and is fixed in v9, which I'll post soon. Meanwhile,
> the updated patch is appended below:
>

See comments below about the updated patch.

> ---8<-----------------------
>
> From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
> From: "Joel Fernandes (Google)" <[email protected]>
> Date: Mon, 27 Jul 2020 17:56:14 -0400
> Subject: [PATCH] kernel/entry: Add support for core-wide protection of
> kernel-mode
>
> Core-scheduling prevents hyperthreads in usermode from attacking each
> other, but it does not do anything about one of the hyperthreads
> entering the kernel for any reason. This leaves the door open for MDS
> and L1TF attacks with concurrent execution sequences between
> hyperthreads.
>
> This patch therefore adds support for protecting all syscall and IRQ
> kernel mode entries. Care is taken to track the outermost usermode exit
> and entry using per-cpu counters. In cases where one of the hyperthreads
> enter the kernel, no additional IPIs are sent. Further, IPIs are avoided
> when not needed - example: idle and non-cookie HTs do not need to be
> forced into kernel mode.
>
> More information about attacks:
> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> data to either host or guest attackers. For L1TF, it is possible to leak
> to guest attackers. There is no possible mitigation involving flushing
> of buffers to avoid this since the execution of attacker and victims
> happen concurrently on 2 or more HTs.
>
> Cc: Julien Desfossez <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Aaron Lu <[email protected]>
> Cc: Aubrey Li <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Co-developed-by: Vineeth Pillai <[email protected]>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Vineeth Pillai <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> .../admin-guide/kernel-parameters.txt | 9 +
> include/linux/entry-common.h | 6 +-
> include/linux/sched.h | 12 +
> kernel/entry/common.c | 28 ++-
> kernel/sched/core.c | 230 ++++++++++++++++++
> kernel/sched/sched.h | 3 +
> 6 files changed, 285 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 3236427e2215..a338d5d64c3d 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,15 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=
> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel. A value of 0 disables protection, 1
> + enables protection. The default is 1. Note that protection
> + depends on the arch defining the _TIF_UNSAFE_RET flag.
> +
> sched_debug [KNL] Enables verbose scheduler debug messages.
>
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 474f29638d2c..62278c5b3b5f 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -33,6 +33,10 @@
> # define _TIF_PATCH_PENDING (0)
> #endif
>
> +#ifndef _TIF_UNSAFE_RET
> +# define _TIF_UNSAFE_RET (0)
> +#endif
> +
> #ifndef _TIF_UPROBE
> # define _TIF_UPROBE (0)
> #endif
> @@ -69,7 +73,7 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d38e904dd603..fe6f225bfbf9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +#ifdef CONFIG_SCHED_CORE
> +void sched_core_unsafe_enter(void);
> +void sched_core_unsafe_exit(void);
> +bool sched_core_wait_till_safe(unsigned long ti_check);
> +bool sched_core_kernel_protected(void);
> +#else
> +#define sched_core_unsafe_enter(ignore) do { } while (0)
> +#define sched_core_unsafe_exit(ignore) do { } while (0)
> +#define sched_core_wait_till_safe(ignore) do { } while (0)
> +#define sched_core_kernel_protected(ignore) do { } while (0)
> +#endif
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 0a1e20f8d4e8..a18ed60cedea 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>
> instrumentation_begin();
> trace_hardirqs_off_finish();
> + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> + sched_core_unsafe_enter();
> instrumentation_end();
> }
>
> @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal(struct pt_regs *regs) { }
>
> +unsigned long exit_to_user_get_work(void)

Function should be static.


> +{
> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +
> + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> + || !_TIF_UNSAFE_RET)
> + return ti_work;
> +
> +#ifdef CONFIG_SCHED_CORE
> + ti_work &= EXIT_TO_USER_MODE_WORK;
> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> + sched_core_unsafe_exit();
> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> + sched_core_unsafe_enter(); /* not exiting to user yet. */
> + }
> + }
> +
> + return READ_ONCE(current_thread_info()->flags);
> +#endif
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * enabled above.
> */
> local_irq_disable_exit_to_user();
> - ti_work = READ_ONCE(current_thread_info()->flags);
> + ti_work = exit_to_user_get_work();
> }

What happens if the task is scheduled out in exit_to_user_mode_loop()? (e.g. if it has
_TIF_NEED_RESCHED set.) It will have called sched_core_unsafe_enter() and forced siblings
to wait for it. So shouldn't sched_core_unsafe_exit() be called when the task is
scheduled out (because it won't run anymore), and sched_core_unsafe_enter() when
the task is scheduled back in?


> /* Return the latest work state for arch_exit_to_user_mode() */
> @@ -184,9 +207,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> static void exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> + unsigned long ti_work;
>
> lockdep_assert_irqs_disabled();
> + ti_work = exit_to_user_get_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e05728bdb18c..bd206708fac2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>
> #ifdef CONFIG_SCHED_CORE
>
> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> +static int __init set_sched_core_protect_kernel(char *str)
> +{
> + unsigned long val = 0;
> +
> + if (!str)
> + return 0;
> +
> + if (!kstrtoul(str, 0, &val) && !val)
> + static_branch_disable(&sched_core_protect_kernel);
> +
> + return 1;
> +}
> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> +
> +/* Is the kernel protected by core scheduling? */
> +bool sched_core_kernel_protected(void)
> +{
> + return static_branch_likely(&sched_core_protect_kernel);
> +}
> +
> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>
> /* kernel prio, less is more */
> @@ -4596,6 +4617,214 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> return a->core_cookie == b->core_cookie;
> }
>
> +/*
> + * Handler to attempt to enter kernel. It does nothing because the exit to
> + * usermode or guest mode will do the actual work (of waiting if needed).
> + */
> +static void sched_core_irq_work(struct irq_work *work)
> +{
> + return;
> +}
> +
> +static inline void init_sched_core_irq_work(struct rq *rq)
> +{
> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> +}
> +
> +/*
> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> + * exits the core-wide unsafe state. Obviously the CPU calling this function
> + * should not be responsible for the core being in the core-wide unsafe state
> + * otherwise it will deadlock.
> + *
> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> + * the loop if TIF flags are set and notify caller about it.
> + *
> + * IRQs should be disabled.
> + */
> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> + bool restart = false;
> + struct rq *rq;
> + int cpu;
> +
> + /* We clear the thread flag only at the end, so need to check for it. */

Do you mean "no need to check for it" ?


> + ti_check &= ~_TIF_UNSAFE_RET;
> +
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> + preempt_disable();
> + local_irq_enable();
> +
> + /*
> + * Wait till the core of this HT is not in an unsafe state.
> + *
> + * Pair with smp_store_release() in sched_core_unsafe_exit().
> + */
> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> + cpu_relax();
> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> + restart = true;
> + break;
> + }
> + }
> +
> + /* Upgrade it back to the expectations of entry code. */
> + local_irq_disable();
> + preempt_enable();
> +
> +ret:
> + if (!restart)
> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + return restart;
> +}
> +
> +/*
> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> + * context.
> + */
> +void sched_core_unsafe_enter(void)
> +{
> + const struct cpumask *smt_mask;
> + unsigned long flags;
> + struct rq *rq;
> + int i, cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + /* Ensure that on return to user/guest, we check whether to wait. */
> + if (current->core_cookie)
> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> + if (!sched_core_enabled(rq))
> + goto ret;

Should we clear TIF_UNSAFE_RET if (!sched_core_enabled(rq))? This would avoid calling
sched_core_wait_till_safe().


> +
> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> + rq->core_this_unsafe_nest++;
> +
> + /* Should not nest: enter() should only pair with exit(). */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> + goto ret;

I would be more precise about the nesting comment: we don't nest, not simply because
each enter() is paired with an exit(), but because enter()/exit() are only done for a
user context. We can have nested interrupts, but they will be for a kernel context so
they won't enter()/exit().

So I would say something like:

/*
* Should not nest: each enter() is paired with an exit(), and enter()/exit()
* are done when coming from userspace. We can have nested interrupts between
* enter()/exit() but they will originate from the kernel so they won't enter()
* nor exit().
*/


> +
> + raw_spin_lock(rq_lockp(rq));
> + smt_mask = cpu_smt_mask(cpu);
> +
> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);

We are protected by the rq_lockp(rq) spinlock, but we still need to use WRITE_ONCE()
because sched_core_wait_till_safe() checks core_unsafe_nest without taking rq_lockp(rq),
right? Shouldn't we be using smp_store_release() like sched_core_unsafe_exit() does?

In any case, it is worth having a comment why WRITE_ONCE() or smp_store_release() is
used.


> +
> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> + goto unlock;

It might be better to check if (rq->core->core_unsafe_nest >= cpumask_weight(smt_mask))
because we shouldn't exceed the number of siblings.

alex.


> +
> + if (irq_work_is_busy(&rq->core_irq_work)) {
> + /*
> + * Do nothing more since we are in an IPI sent from another
> + * sibling to enforce safety. That sibling would have sent IPIs
> + * to all of the HTs.
> + */
> + goto unlock;
> + }
> +
> + /*
> + * If we are not the first ones on the core to enter core-wide unsafe
> + * state, do nothing.
> + */
> + if (rq->core->core_unsafe_nest > 1)
> + goto unlock;
> +
> + /* Do nothing more if the core is not tagged. */
> + if (!rq->core->core_cookie)
> + goto unlock;
> +
> + for_each_cpu(i, smt_mask) {
> + struct rq *srq = cpu_rq(i);
> +
> + if (i == cpu || cpu_is_offline(i))
> + continue;
> +
> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> + continue;
> +
> + /* Skip if HT is not running a tagged task. */
> + if (!srq->curr->core_cookie && !srq->core_pick)
> + continue;
> +
> + /*
> + * Force sibling into the kernel by IPI. If work was already
> + * pending, no new IPIs are sent. This is Ok since the receiver
> + * would already be in the kernel, or on its way to it.
> + */
> + irq_work_queue_on(&srq->core_irq_work, i);
> + }
> +unlock:
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> +/*
> + * Process any work need for either exiting the core-wide unsafe state, or for
> + * waiting on this hyperthread if the core is still in this state.
> + *
> + * @idle: Are we called from the idle loop?
> + */
> +void sched_core_unsafe_exit(void)
> +{
> + unsigned long flags;
> + unsigned int nest;
> + struct rq *rq;
> + int cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + /* Do nothing if core-sched disabled. */
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /*
> + * Can happen when a process is forked and the first return to user
> + * mode is a syscall exit. Either way, there's nothing to do.
> + */
> + if (rq->core_this_unsafe_nest == 0)
> + goto ret;
> +
> + rq->core_this_unsafe_nest--;
> +
> + /* enter() should be paired with exit() only. */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + /*
> + * Core-wide nesting counter can never be 0 because we are
> + * still in it on this CPU.
> + */
> + nest = rq->core->core_unsafe_nest;
> + WARN_ON_ONCE(!nest);
> +
> + /* Pair with smp_load_acquire() in sched_core_wait_till_safe(). */
> + smp_store_release(&rq->core->core_unsafe_nest, nest - 1);
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> // XXX fairness/fwd progress conditions
> /*
> * Returns
> @@ -5019,6 +5248,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
> rq = cpu_rq(i);
> if (rq->core && rq->core == rq)
> core_rq = rq;
> + init_sched_core_irq_work(rq);
> }
>
> if (!core_rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index f7e2d8a3be8e..4bcf3b1ddfb3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1059,12 +1059,15 @@ struct rq {
> unsigned int core_enabled;
> unsigned int core_sched_seq;
> struct rb_root core_tree;
> + struct irq_work core_irq_work; /* To force HT into kernel */
> + unsigned int core_this_unsafe_nest;
>
> /* shared state */
> unsigned int core_task_seq;
> unsigned int core_pick_seq;
> unsigned long core_cookie;
> unsigned char core_forceidle;
> + unsigned int core_unsafe_nest;
> #endif
> };
>
>

2020-11-10 22:45:42

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Tue, Nov 10, 2020 at 10:35:17AM +0100, Alexandre Chartre wrote:
[..]
> > ---8<-----------------------
> >
> > From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
> > From: "Joel Fernandes (Google)" <[email protected]>
> > Date: Mon, 27 Jul 2020 17:56:14 -0400
> > Subject: [PATCH] kernel/entry: Add support for core-wide protection of
> > kernel-mode
[..]
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d38e904dd603..fe6f225bfbf9 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
> > const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
> > +#ifdef CONFIG_SCHED_CORE
> > +void sched_core_unsafe_enter(void);
> > +void sched_core_unsafe_exit(void);
> > +bool sched_core_wait_till_safe(unsigned long ti_check);
> > +bool sched_core_kernel_protected(void);
> > +#else
> > +#define sched_core_unsafe_enter(ignore) do { } while (0)
> > +#define sched_core_unsafe_exit(ignore) do { } while (0)
> > +#define sched_core_wait_till_safe(ignore) do { } while (0)
> > +#define sched_core_kernel_protected(ignore) do { } while (0)
> > +#endif
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 0a1e20f8d4e8..a18ed60cedea 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
> > instrumentation_begin();
> > trace_hardirqs_off_finish();
> > + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> > + sched_core_unsafe_enter();
> > instrumentation_end();
> > }
> > @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> > /* Workaround to allow gradual conversion of architecture code */
> > void __weak arch_do_signal(struct pt_regs *regs) { }
> > +unsigned long exit_to_user_get_work(void)
>
> Function should be static.

Fixed.

> > +{
> > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +
> > + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > + || !_TIF_UNSAFE_RET)
> > + return ti_work;
> > +
> > +#ifdef CONFIG_SCHED_CORE
> > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> > + sched_core_unsafe_exit();
> > + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> > + sched_core_unsafe_enter(); /* not exiting to user yet. */
> > + }
> > + }
> > +
> > + return READ_ONCE(current_thread_info()->flags);
> > +#endif
> > +}
> > +
> > static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > unsigned long ti_work)
> > {
> > @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > * enabled above.
> > */
> > local_irq_disable_exit_to_user();
> > - ti_work = READ_ONCE(current_thread_info()->flags);
> > + ti_work = exit_to_user_get_work();
> > }
>
> What happens if the task is scheduled out in exit_to_user_mode_loop()? (e.g. if it has
> _TIF_NEED_RESCHED set.) It will have called sched_core_unsafe_enter() and forced siblings
> to wait for it. So shouldn't sched_core_unsafe_exit() be called when the task is
> scheduled out (because it won't run anymore), and sched_core_unsafe_enter() when
> the task is scheduled back in?

No, when the task is scheduled out, the CPU stays in kernel mode, now running the
task being scheduled in. That task (the one being scheduled in) would have already
done a sched_core_unsafe_enter(). When that task returns to user mode, it will do a
sched_core_unsafe_exit(). When all tasks go to sleep, the last task that
enters the idle loop will do a sched_core_unsafe_exit(). Just to note: the
"unsafe kernel context" is per-CPU and not per-task. Does that answer your
question?
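
In case a toy model helps, here is a tiny userspace sketch (invented names; it
only models the per-CPU counter, nothing else) showing why the accounting
balances across a context switch without any exit()/enter() at the switch:

#include <assert.h>

static unsigned int core_this_unsafe_nest;	/* models one CPU's rq->core_this_unsafe_nest */

static void unsafe_enter(void) { core_this_unsafe_nest++; }	/* outermost kernel entry   */
static void unsafe_exit(void)  { core_this_unsafe_nest--; }	/* outermost return to user */

int main(void)
{
	unsafe_enter();		/* task A: user -> kernel on this CPU */
	/* __schedule(): A sleeps, B is picked; we stay in the kernel, so no exit() here */
	unsafe_exit();		/* task B: kernel -> user on the same CPU */
	assert(core_this_unsafe_nest == 0);	/* balances per CPU, whichever tasks ran */
	return 0;
}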

> > +static inline void init_sched_core_irq_work(struct rq *rq)
> > +{
> > + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> > +}
> > +
> > +/*
> > + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> > + * exits the core-wide unsafe state. Obviously the CPU calling this function
> > + * should not be responsible for the core being in the core-wide unsafe state
> > + * otherwise it will deadlock.
> > + *
> > + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> > + * the loop if TIF flags are set and notify caller about it.
> > + *
> > + * IRQs should be disabled.
> > + */
> > +bool sched_core_wait_till_safe(unsigned long ti_check)
> > +{
> > + bool restart = false;
> > + struct rq *rq;
> > + int cpu;
> > +
> > + /* We clear the thread flag only at the end, so need to check for it. */
>
> Do you mean "no need to check for it" ?

Fixed.

> > +/*
> > + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> > + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> > + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> > + * context.
> > + */
> > +void sched_core_unsafe_enter(void)
> > +{
> > + const struct cpumask *smt_mask;
> > + unsigned long flags;
> > + struct rq *rq;
> > + int i, cpu;
> > +
> > + if (!static_branch_likely(&sched_core_protect_kernel))
> > + return;
> > +
> > + /* Ensure that on return to user/guest, we check whether to wait. */
> > + if (current->core_cookie)
> > + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> > +
> > + local_irq_save(flags);
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > + if (!sched_core_enabled(rq))
> > + goto ret;
>
> Should we clear TIF_UNSAFE_RET if (!sched_core_enabled(rq))? This would avoid calling
> sched_core_wait_till_safe().

Ok, or what I'll do is move the set_tsk_thread_flag to after the check for
sched_core_enabled().

> > +
> > + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> > + rq->core_this_unsafe_nest++;
> > +
> > + /* Should not nest: enter() should only pair with exit(). */
> > + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> > + goto ret;
>
> I would be more precise about the nesting comment: we don't nest, not simply because
> each enter() is paired with an exit(), but because enter()/exit() are only done for a
> user context. We can have nested interrupts, but they will be for a kernel context so
> they won't enter()/exit().
>
> So I would say something like:
>
> /*
> * Should not nest: each enter() is paired with an exit(), and enter()/exit()
> * are done when coming from userspace. We can have nested interrupts between
> * enter()/exit() but they will originate from the kernel so they won't enter()
> * nor exit().
> */

Changed it to the following, hope it's ok with you:
/*
* Should not nest: enter() should only pair with exit(). Both are done
* during the first entry into kernel and the last exit from kernel.
* Nested kernel entries (such as nested interrupts) will only trigger
* enter() and exit() on the outermost kernel entry and exit.
*/

> > +
> > + raw_spin_lock(rq_lockp(rq));
> > + smt_mask = cpu_smt_mask(cpu);
> > +
> > + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
> > + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
>
> We are protected by the rq_lockp(rq) spinlock, but we still need to use WRITE_ONCE()
> because sched_core_wait_till_safe() checks core_unsafe_nest without taking rq_lockp(rq),
> right?

Yes.

> Shouldn't we be using smp_store_release() like sched_core_unsafe_exit() does?
>
> In any case, it is worth having a comment why WRITE_ONCE() or smp_store_release() is
> used.

The smp_store_release() in exit() ensures that all reads and writes done by
this CPU before the write to the nesting counter are visible to the spinning
CPU doing the smp_load_acquire(), before that spinning CPU returns. I did put
a comment there.

But I think I don't need smp_store_release() at all here. The raw_spin_unlock()
that follows already has the required release semantics. So I will demote it to
a WRITE_ONCE() (matching enter()), and add appropriate comments in both places.
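
In case it helps to see the property being relied on in isolation, here is a
tiny userspace model (C11 atomics plus pthreads, names invented for
illustration): once the waiter's acquire load observes the counter drop, the
accesses the exiting side did before its release are guaranteed to be visible.
In the patch, the acquire side is the smp_load_acquire() in
sched_core_wait_till_safe(), and the release side is what smp_store_release()
(or, per the reasoning above, the unlock following the plain write) provides.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_uint nest = 1;	/* stands in for rq->core->core_unsafe_nest */
static int prior_write;		/* some access done while still "unsafe"    */

static void *exit_side(void *arg)
{
	(void)arg;
	prior_write = 42;			/* access done before going "safe" */
	/* Release: orders the store above before the counter update. */
	atomic_store_explicit(&nest, 0, memory_order_release);
	return NULL;
}

static void *wait_side(void *arg)
{
	(void)arg;
	/* Acquire: pairs with the release above, like smp_load_acquire(). */
	while (atomic_load_explicit(&nest, memory_order_acquire) > 0)
		;				/* cpu_relax() equivalent */
	assert(prior_write == 42);		/* guaranteed visible here */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, wait_side, NULL);
	pthread_create(&b, NULL, exit_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}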

> > +
> > + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> > + goto unlock;
>
> It might be better to check if (rq->core->core_unsafe_nest >= cpumask_weight(smt_mask))
> because we shouldn't exceed the number of siblings.

I am a bit concerned about the time complexity of cpumask_weight(); it may be
better not to add that overhead. I am not 100% sure of the details, but
bitmap_weight() appears to loop over the words of the bitmap doing a population
count per word (rough sketch below). What is your opinion on that?
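
Simplified, untested userspace sketch of that loop (the real bitmap_weight()
also masks off the partial last word):

unsigned int bitmap_weight_sketch(const unsigned long *bits, unsigned int nbits)
{
	unsigned int nlongs = (nbits + 8 * sizeof(long) - 1) / (8 * sizeof(long));
	unsigned int i, w = 0;

	for (i = 0; i < nlongs; i++)
		w += __builtin_popcountl(bits[i]);	/* hweight_long() */
	return w;
}

So for a typical smt_mask it is only a popcount or two, but it is still extra
work in this path.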

Can I add your Reviewed-by tag to the updated patch below? Thanks for the review!

- Joel

---8<---

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bd1a5b87a5e2..a36f08d74e09 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,15 @@

sbni= [NET] Granch SBNI12 leased line adapter

+ sched_core_protect_kernel=
+ [SCHED_CORE] Pause SMT siblings of a core running in
+ user mode, if at least one of the siblings of the core
+ is running in kernel mode. This is to guarantee that
+ kernel data is not leaked to tasks which are not trusted
+ by the kernel. A value of 0 disables protection, 1
+ enables protection. The default is 1. Note that protection
+ depends on the arch defining the _TIF_UNSAFE_RET flag.
+
sched_debug [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..62278c5b3b5f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -33,6 +33,10 @@
# define _TIF_PATCH_PENDING (0)
#endif

+#ifndef _TIF_UNSAFE_RET
+# define _TIF_UNSAFE_RET (0)
+#endif
+
#ifndef _TIF_UPROBE
# define _TIF_UPROBE (0)
#endif
@@ -69,7 +73,7 @@

#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
+ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
ARCH_EXIT_TO_USER_MODE_WORK)

/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d38e904dd603..fe6f225bfbf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

+#ifdef CONFIG_SCHED_CORE
+void sched_core_unsafe_enter(void);
+void sched_core_unsafe_exit(void);
+bool sched_core_wait_till_safe(unsigned long ti_check);
+bool sched_core_kernel_protected(void);
+#else
+#define sched_core_unsafe_enter(ignore) do { } while (0)
+#define sched_core_unsafe_exit(ignore) do { } while (0)
+#define sched_core_wait_till_safe(ignore) do { } while (0)
+#define sched_core_kernel_protected(ignore) do { } while (0)
+#endif
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 2b8366693d5c..d5d88e735d55 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)

instrumentation_begin();
trace_hardirqs_off_finish();
+ if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
+ sched_core_unsafe_enter();
instrumentation_end();
}

@@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

+static unsigned long exit_to_user_get_work(void)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+ || !_TIF_UNSAFE_RET)
+ return ti_work;
+
+#ifdef CONFIG_SCHED_CORE
+ ti_work &= EXIT_TO_USER_MODE_WORK;
+ if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
+ sched_core_unsafe_exit();
+ if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
+ sched_core_unsafe_enter(); /* not exiting to user yet. */
+ }
+ }
+
+ return READ_ONCE(current_thread_info()->flags);
+#endif
+}
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -174,7 +197,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = exit_to_user_get_work();
}

/* Return the latest work state for arch_exit_to_user_mode() */
@@ -183,9 +206,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work;

lockdep_assert_irqs_disabled();
+ ti_work = exit_to_user_get_work();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa68941998e3..429f9b8ca38e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -76,6 +76,27 @@ __read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

+DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
+static int __init set_sched_core_protect_kernel(char *str)
+{
+ unsigned long val = 0;
+
+ if (!str)
+ return 0;
+
+ if (!kstrtoul(str, 0, &val) && !val)
+ static_branch_disable(&sched_core_protect_kernel);
+
+ return 1;
+}
+__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
+
+/* Is the kernel protected by core scheduling? */
+bool sched_core_kernel_protected(void)
+{
+ return static_branch_likely(&sched_core_protect_kernel);
+}
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -4596,6 +4617,226 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}

+/*
+ * Handler to attempt to enter kernel. It does nothing because the exit to
+ * usermode or guest mode will do the actual work (of waiting if needed).
+ */
+static void sched_core_irq_work(struct irq_work *work)
+{
+ return;
+}
+
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+ init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
+/*
+ * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
+ * exits the core-wide unsafe state. Obviously the CPU calling this function
+ * should not be responsible for the core being in the core-wide unsafe state
+ * otherwise it will deadlock.
+ *
+ * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
+ * the loop if TIF flags are set and notify caller about it.
+ *
+ * IRQs should be disabled.
+ */
+bool sched_core_wait_till_safe(unsigned long ti_check)
+{
+ bool restart = false;
+ struct rq *rq;
+ int cpu;
+
+ /* We clear the thread flag only at the end, so no need to check for it. */
+ ti_check &= ~_TIF_UNSAFE_RET;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Down grade to allow interrupts to prevent stop_machine lockups.. */
+ preempt_disable();
+ local_irq_enable();
+
+ /*
+ * Wait till the core of this HT is not in an unsafe state.
+ *
+ * Pair with smp_store_release() in sched_core_unsafe_exit().
+ */
+ while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
+ cpu_relax();
+ if (READ_ONCE(current_thread_info()->flags) & ti_check) {
+ restart = true;
+ break;
+ }
+ }
+
+ /* Upgrade it back to the expectations of entry code. */
+ local_irq_disable();
+ preempt_enable();
+
+ret:
+ if (!restart)
+ clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ return restart;
+}
+
+/*
+ * Enter the core-wide IRQ state. Sibling will be paused if it is running
+ * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
+ * avoid sending useless IPIs is made. Must be called only from hard IRQ
+ * context.
+ */
+void sched_core_unsafe_enter(void)
+{
+ const struct cpumask *smt_mask;
+ unsigned long flags;
+ struct rq *rq;
+ int i, cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Ensure that on return to user/guest, we check whether to wait. */
+ if (current->core_cookie)
+ set_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
+ rq->core_this_unsafe_nest++;
+
+ /*
+ * Should not nest: enter() should only pair with exit(). Both are done
+ * during the first entry into kernel and the last exit from kernel.
+ * Nested kernel entries (such as nested interrupts) will only trigger
+ * enter() and exit() on the outermost kernel entry and exit.
+ */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ smt_mask = cpu_smt_mask(cpu);
+
+ /*
+ * Contribute this CPU's unsafe_enter() to the core-wide unsafe_enter()
+ * count. The raw_spin_unlock() release semantics pairs with the nest
+ * counter's smp_load_acquire() in sched_core_wait_till_safe().
+ */
+ WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
+
+ if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
+ goto unlock;
+
+ if (irq_work_is_busy(&rq->core_irq_work)) {
+ /*
+ * Do nothing more since we are in an IPI sent from another
+ * sibling to enforce safety. That sibling would have sent IPIs
+ * to all of the HTs.
+ */
+ goto unlock;
+ }
+
+ /*
+ * If we are not the first ones on the core to enter core-wide unsafe
+ * state, do nothing.
+ */
+ if (rq->core->core_unsafe_nest > 1)
+ goto unlock;
+
+ /* Do nothing more if the core is not tagged. */
+ if (!rq->core->core_cookie)
+ goto unlock;
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *srq = cpu_rq(i);
+
+ if (i == cpu || cpu_is_offline(i))
+ continue;
+
+ if (!srq->curr->mm || is_task_rq_idle(srq->curr))
+ continue;
+
+ /* Skip if HT is not running a tagged task. */
+ if (!srq->curr->core_cookie && !srq->core_pick)
+ continue;
+
+ /*
+ * Force sibling into the kernel by IPI. If work was already
+ * pending, no new IPIs are sent. This is Ok since the receiver
+ * would already be in the kernel, or on its way to it.
+ */
+ irq_work_queue_on(&srq->core_irq_work, i);
+ }
+unlock:
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
+/*
+ * Process any work need for either exiting the core-wide unsafe state, or for
+ * waiting on this hyperthread if the core is still in this state.
+ *
+ * @idle: Are we called from the idle loop?
+ */
+void sched_core_unsafe_exit(void)
+{
+ unsigned long flags;
+ unsigned int nest;
+ struct rq *rq;
+ int cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ /* Do nothing if core-sched disabled. */
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /*
+ * Can happen when a process is forked and the first return to user
+ * mode is a syscall exit. Either way, there's nothing to do.
+ */
+ if (rq->core_this_unsafe_nest == 0)
+ goto ret;
+
+ rq->core_this_unsafe_nest--;
+
+ /* enter() should be paired with exit() only. */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ /*
+ * Core-wide nesting counter can never be 0 because we are
+ * still in it on this CPU.
+ */
+ nest = rq->core->core_unsafe_nest;
+ WARN_ON_ONCE(!nest);
+
+ WRITE_ONCE(rq->core->core_unsafe_nest, nest - 1);
+ /*
+ * The raw_spin_unlock release semantics pairs with the nest counter's
+ * smp_load_acquire() in sched_core_wait_till_safe().
+ */
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
// XXX fairness/fwd progress conditions
/*
* Returns
@@ -4991,6 +5232,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
rq = cpu_rq(i);
if (rq->core && rq->core == rq)
core_rq = rq;
+ init_sched_core_irq_work(rq);
}

if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001382bc67f9..20937a5b6272 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1061,6 +1061,8 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
+ struct irq_work core_irq_work; /* To force HT into kernel */
+ unsigned int core_this_unsafe_nest;

/* shared state */
unsigned int core_task_seq;
@@ -1068,6 +1070,7 @@ struct rq {
unsigned long core_cookie;
unsigned char core_forceidle;
unsigned int core_forceidle_seq;
+ unsigned int core_unsafe_nest;
#endif
};

2020-11-16 10:11:59

by Alexandre Chartre

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode


On 11/10/20 11:42 PM, Joel Fernandes wrote:
> On Tue, Nov 10, 2020 at 10:35:17AM +0100, Alexandre Chartre wrote:
> [..]
>>> ---8<-----------------------
>>>
>>> From b2835a587a28405ffdf8fc801e798129a014a8c8 Mon Sep 17 00:00:00 2001
>>> From: "Joel Fernandes (Google)" <[email protected]>
>>> Date: Mon, 27 Jul 2020 17:56:14 -0400
>>> Subject: [PATCH] kernel/entry: Add support for core-wide protection of
>>> kernel-mode
> [..]
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d38e904dd603..fe6f225bfbf9 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>>> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>>> +#ifdef CONFIG_SCHED_CORE
>>> +void sched_core_unsafe_enter(void);
>>> +void sched_core_unsafe_exit(void);
>>> +bool sched_core_wait_till_safe(unsigned long ti_check);
>>> +bool sched_core_kernel_protected(void);
>>> +#else
>>> +#define sched_core_unsafe_enter(ignore) do { } while (0)
>>> +#define sched_core_unsafe_exit(ignore) do { } while (0)
>>> +#define sched_core_wait_till_safe(ignore) do { } while (0)
>>> +#define sched_core_kernel_protected(ignore) do { } while (0)
>>> +#endif
>>> +
>>> #endif
>>> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
>>> index 0a1e20f8d4e8..a18ed60cedea 100644
>>> --- a/kernel/entry/common.c
>>> +++ b/kernel/entry/common.c
>>> @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>>> instrumentation_begin();
>>> trace_hardirqs_off_finish();
>>> + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
>>> + sched_core_unsafe_enter();
>>> instrumentation_end();
>>> }
>>> @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
>>> /* Workaround to allow gradual conversion of architecture code */
>>> void __weak arch_do_signal(struct pt_regs *regs) { }
>>> +unsigned long exit_to_user_get_work(void)
>>
>> Function should be static.
>
> Fixed.
>
>>> +{
>>> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
>>> +
>>> + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
>>> + || !_TIF_UNSAFE_RET)
>>> + return ti_work;
>>> +
>>> +#ifdef CONFIG_SCHED_CORE
>>> + ti_work &= EXIT_TO_USER_MODE_WORK;
>>> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
>>> + sched_core_unsafe_exit();
>>> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
>>> + sched_core_unsafe_enter(); /* not exiting to user yet. */
>>> + }
>>> + }
>>> +
>>> + return READ_ONCE(current_thread_info()->flags);
>>> +#endif
>>> +}
>>> +
>>> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>> unsigned long ti_work)
>>> {
>>> @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>>> * enabled above.
>>> */
>>> local_irq_disable_exit_to_user();
>>> - ti_work = READ_ONCE(current_thread_info()->flags);
>>> + ti_work = exit_to_user_get_work();
>>> }
>>
>> What happens if the task is scheduled out in exit_to_user_mode_loop? (e.g. if it has
>> _TIF_NEED_RESCHED set). It will have called sched_core_unsafe_enter() and forced siblings
>> to wait for it. So shouldn't sched_core_unsafe_exit() be called when the task is
>> scheduled out? (because it won't run anymore) And sched_core_unsafe_enter() when
>> the task is scheduled back in?
>
> No, when the task is scheduled out, the CPU stays in kernel mode and is now
> running the task being scheduled in. That task (being scheduled-in) would have
> already done a sched_core_unsafe_enter(). When that task returns to user mode,
> it will do a sched_core_unsafe_exit(). When all tasks go to sleep, the last
> task that enters the idle loop will do a sched_core_unsafe_exit(). Just to
> note: the "unsafe kernel context" is per-CPU and not per-task. Does that
> answer your question?

Ok, I think I get it: it works because when a task is scheduled out then the
scheduler will schedule in a new tagged task (because we have core scheduling).
So that new task should be accounted for core-wide protection the same way as
the previous one.
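
To make the per-CPU accounting concrete, here is a minimal user-space model of
the two nesting counters (my own sketch, not kernel code: the two-sibling
layout, the unsafe_enter()/unsafe_exit() helpers and the simplified counters
are assumptions made for illustration). It shows that only the outermost kernel
entry and the final return to user mode on each CPU touch the counters, so a
context switch in between changes nothing:

#include <assert.h>
#include <stdio.h>

/* Simplified stand-ins for rq->core_this_unsafe_nest (per CPU) and
 * rq->core->core_unsafe_nest (shared by the whole core). */
static unsigned int this_unsafe_nest[2];	/* two SMT siblings */
static unsigned int core_unsafe_nest;

static void unsafe_enter(int cpu)		/* outermost kernel entry only */
{
	this_unsafe_nest[cpu]++;
	assert(this_unsafe_nest[cpu] == 1);	/* mirrors the WARN_ON_ONCE() */
	core_unsafe_nest++;
}

static void unsafe_exit(int cpu)		/* outermost return to user only */
{
	this_unsafe_nest[cpu]--;
	assert(this_unsafe_nest[cpu] == 0);
	core_unsafe_nest--;
}

int main(void)
{
	unsafe_enter(0);	/* CPU0: task A enters the kernel via a syscall */
	unsafe_enter(1);	/* CPU1: the sibling takes an IRQ */

	/* CPU0 context-switches from task A to task B here: no enter/exit
	 * calls, because the "unsafe" state belongs to the CPU, not the task. */

	unsafe_exit(1);		/* CPU1 returns to user mode */
	unsafe_exit(0);		/* CPU0, now running task B, returns to user mode */

	printf("core_unsafe_nest = %u\n", core_unsafe_nest);	/* prints 0 */
	return 0;
}
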


>>> +static inline void init_sched_core_irq_work(struct rq *rq)
>>> +{
>>> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
>>> +}
>>> +
>>> +/*
>>> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
>>> + * exits the core-wide unsafe state. Obviously the CPU calling this function
>>> + * should not be responsible for the core being in the core-wide unsafe state
>>> + * otherwise it will deadlock.
>>> + *
>>> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
>>> + * the loop if TIF flags are set and notify caller about it.
>>> + *
>>> + * IRQs should be disabled.
>>> + */
>>> +bool sched_core_wait_till_safe(unsigned long ti_check)
>>> +{
>>> + bool restart = false;
>>> + struct rq *rq;
>>> + int cpu;
>>> +
>>> + /* We clear the thread flag only at the end, so need to check for it. */
>>
>> Do you mean "no need to check for it" ?
>
> Fixed.
>
>>> +/*
>>> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
>>> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
>>> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
>>> + * context.
>>> + */
>>> +void sched_core_unsafe_enter(void)
>>> +{
>>> + const struct cpumask *smt_mask;
>>> + unsigned long flags;
>>> + struct rq *rq;
>>> + int i, cpu;
>>> +
>>> + if (!static_branch_likely(&sched_core_protect_kernel))
>>> + return;
>>> +
>>> + /* Ensure that on return to user/guest, we check whether to wait. */
>>> + if (current->core_cookie)
>>> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
>>> +
>>> + local_irq_save(flags);
>>> + cpu = smp_processor_id();
>>> + rq = cpu_rq(cpu);
>>> + if (!sched_core_enabled(rq))
>>> + goto ret;
>>
>> Should we clear TIF_UNSAFE_RET if (!sched_core_enabled(rq))? This would avoid calling
>> sched_core_wait_till_safe().
>
> Ok, or what I'll do is move the set_tsk_thread_flag to after the check for
> sched_core_enabled().
>
>>> +
>>> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
>>> + rq->core_this_unsafe_nest++;
>>> +
>>> + /* Should not nest: enter() should only pair with exit(). */
>>> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
>>> + goto ret;
>>
>> I would be more precise about the nesting comment: we don't nest not only because each
>> enter() is paired with an exit() but because each enter()/exit() is for a user context.
>> We can have nested interrupts but they will be for a kernel context so they won't enter/exit.
>>
>> So I would say something like:
>>
>> /*
>> * Should not nest: each enter() is paired with an exit(), and enter()/exit()
>> * are done when coming from userspace. We can have nested interrupts between
>> * enter()/exit() but they will originate from the kernel so they won't enter()
>> * nor exit().
>> */
>
> Changed it to following, hope its ok with you:
> /*
> * Should not nest: enter() should only pair with exit(). Both are done
> * during the first entry into kernel and the last exit from kernel.
> * Nested kernel entries (such as nested interrupts) will only trigger
> * enter() and exit() on the outer most kernel entry and exit.
> */
>
>>> +
>>> + raw_spin_lock(rq_lockp(rq));
>>> + smt_mask = cpu_smt_mask(cpu);
>>> +
>>> + /* Contribute this CPU's unsafe_enter() to core-wide unsafe_enter() count. */
>>> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
>>
>> We are protected by the rq_lockp(rq) spinlock, but we still need to use WRITE_ONCE()
>> because sched_core_wait_till_safe() checks core_unsafe_next without taking rq_lockp(rq),
>> right?
>
> Yes.
>
>> Shouldn't we be using smp_store_release() like sched_core_unsafe_exit() does?
>>
>> In any case, it is worth having a comment why WRITE_ONCE() or smp_store_release() is
>> used.
>
> The smp_store_release() in exit() ensures that the write to the nesting
> counter happens *after* all prior reads and write accesses done by this CPU
> are seen by the spinning CPU doing the smp_load_acquire() before that
> spinning CPU returns. I did put a comment there.
>
> But, I think I don't need smp_store_release() at all here. The spin_unlock
> that follows already has the required release semantics. I will demote it to
> a WRITE_ONCE() in enter() as well, and add appropriate comments.
>

I think a WRITE_ONCE() is not even useful here. The WRITE_ONCE() will only prevent
some possible compiler optimization in the function wrt rq->core->core_unsafe_nest, but
rq->core->core_unsafe_nest is just updated here, and concurrent changes are protected
by the rq_lockp(rq) spinlock, and the memory barrier is ensured by raw_spin_unlock().

So I think you can just do: rq->core->core_unsafe_nest++;

And in sched_core_unsafe_exit(), you can just do: rq->core->core_unsafe_nest = nest - 1

Also comment in sched_core_wait_till_safe() wrt smp_load_acquire() should be updated,
it should say:

/*
* Wait till the core of this HT is not in an unsafe state.
*
* Pair with raw_spin_unlock(rq_lockp(rq) in sched_core_unsafe_enter/exit()
*/


>>> +
>>> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
>>> + goto unlock;
>>
>> It might be better checking if (rq->core->core_unsafe_nest >= cpumask_weight(smt_mask))
>> because we shouldn't exceed the number of siblings.
>
> I am a bit concerned with the time complexity of cpumask_weight(). It may be
> better not to add overhead. I am not fully sure how it works, but there is a
> loop in bitmap_weight() that goes through the bits of the bitmap. What is your
> opinion on that?

Yes, it's looping through the bitmap, so it's probably not worth adding this overhead here.
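
For reference, below is a rough user-space approximation of the per-word walk
we are talking about (my sketch, assuming a plain array-of-unsigned-long
bitmap; it is not the kernel's bitmap_weight() implementation), just to show
where the per-call cost comes from:

#include <stdio.h>

static unsigned int bitmap_weight(const unsigned long *map, unsigned int nbits)
{
	unsigned int bits_per_long = 8 * sizeof(long);
	unsigned int w = 0, i;

	for (i = 0; i < nbits / bits_per_long; i++)
		w += __builtin_popcountl(map[i]);	/* one pass per word */
	if (nbits % bits_per_long)
		w += __builtin_popcountl(map[i] &
					 ((1UL << (nbits % bits_per_long)) - 1));
	return w;
}

int main(void)
{
	unsigned long smt_mask[1] = { 0x3 };	/* two sibling CPUs set */

	printf("%u\n", bitmap_weight(smt_mask, 8));	/* prints 2 */
	return 0;
}
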

>
> Can I add your Reviewed-by tag to below updated patch? Thanks for review!

Reviewed-by: Alexandre Chartre <[email protected]>


alex.

>
> ---8<---
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bd1a5b87a5e2..a36f08d74e09 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4678,6 +4678,15 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> + sched_core_protect_kernel=
> + [SCHED_CORE] Pause SMT siblings of a core running in
> + user mode, if at least one of the siblings of the core
> + is running in kernel mode. This is to guarantee that
> + kernel data is not leaked to tasks which are not trusted
> + by the kernel. A value of 0 disables protection, 1
> + enables protection. The default is 1. Note that protection
> + depends on the arch defining the _TIF_UNSAFE_RET flag.
> +
> sched_debug [KNL] Enables verbose scheduler debug messages.
>
> schedstats= [KNL,X86] Enable or disable scheduled statistics.
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 474f29638d2c..62278c5b3b5f 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -33,6 +33,10 @@
> # define _TIF_PATCH_PENDING (0)
> #endif
>
> +#ifndef _TIF_UNSAFE_RET
> +# define _TIF_UNSAFE_RET (0)
> +#endif
> +
> #ifndef _TIF_UPROBE
> # define _TIF_UPROBE (0)
> #endif
> @@ -69,7 +73,7 @@
>
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> ARCH_EXIT_TO_USER_MODE_WORK)
>
> /**
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d38e904dd603..fe6f225bfbf9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +#ifdef CONFIG_SCHED_CORE
> +void sched_core_unsafe_enter(void);
> +void sched_core_unsafe_exit(void);
> +bool sched_core_wait_till_safe(unsigned long ti_check);
> +bool sched_core_kernel_protected(void);
> +#else
> +#define sched_core_unsafe_enter(ignore) do { } while (0)
> +#define sched_core_unsafe_exit(ignore) do { } while (0)
> +#define sched_core_wait_till_safe(ignore) do { } while (0)
> +#define sched_core_kernel_protected(ignore) do { } while (0)
> +#endif
> +
> #endif
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 2b8366693d5c..d5d88e735d55 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
>
> instrumentation_begin();
> trace_hardirqs_off_finish();
> + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> + sched_core_unsafe_enter();
> instrumentation_end();
> }
>
> @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal(struct pt_regs *regs) { }
>
> +static unsigned long exit_to_user_get_work(void)
> +{
> + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +
> + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> + || !_TIF_UNSAFE_RET)
> + return ti_work;
> +
> +#ifdef CONFIG_SCHED_CORE
> + ti_work &= EXIT_TO_USER_MODE_WORK;
> + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> + sched_core_unsafe_exit();
> + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> + sched_core_unsafe_enter(); /* not exiting to user yet. */
> + }
> + }
> +
> + return READ_ONCE(current_thread_info()->flags);
> +#endif
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -174,7 +197,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> * enabled above.
> */
> local_irq_disable_exit_to_user();
> - ti_work = READ_ONCE(current_thread_info()->flags);
> + ti_work = exit_to_user_get_work();
> }
>
> /* Return the latest work state for arch_exit_to_user_mode() */
> @@ -183,9 +206,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> static void exit_to_user_mode_prepare(struct pt_regs *regs)
> {
> - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> + unsigned long ti_work;
>
> lockdep_assert_irqs_disabled();
> + ti_work = exit_to_user_get_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fa68941998e3..429f9b8ca38e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
>
> #ifdef CONFIG_SCHED_CORE
>
> +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> +static int __init set_sched_core_protect_kernel(char *str)
> +{
> + unsigned long val = 0;
> +
> + if (!str)
> + return 0;
> +
> + if (!kstrtoul(str, 0, &val) && !val)
> + static_branch_disable(&sched_core_protect_kernel);
> +
> + return 1;
> +}
> +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> +
> +/* Is the kernel protected by core scheduling? */
> +bool sched_core_kernel_protected(void)
> +{
> + return static_branch_likely(&sched_core_protect_kernel);
> +}
> +
> DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
>
> /* kernel prio, less is more */
> @@ -4596,6 +4617,226 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> return a->core_cookie == b->core_cookie;
> }
>
> +/*
> + * Handler to attempt to enter kernel. It does nothing because the exit to
> + * usermode or guest mode will do the actual work (of waiting if needed).
> + */
> +static void sched_core_irq_work(struct irq_work *work)
> +{
> + return;
> +}
> +
> +static inline void init_sched_core_irq_work(struct rq *rq)
> +{
> + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> +}
> +
> +/*
> + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> + * exits the core-wide unsafe state. Obviously the CPU calling this function
> + * should not be responsible for the core being in the core-wide unsafe state
> + * otherwise it will deadlock.
> + *
> + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> + * the loop if TIF flags are set and notify caller about it.
> + *
> + * IRQs should be disabled.
> + */
> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> + bool restart = false;
> + struct rq *rq;
> + int cpu;
> +
> + /* We clear the thread flag only at the end, so no need to check for it. */
> + ti_check &= ~_TIF_UNSAFE_RET;
> +
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> + preempt_disable();
> + local_irq_enable();
> +
> + /*
> + * Wait till the core of this HT is not in an unsafe state.
> + *
> + * Pair with smp_store_release() in sched_core_unsafe_exit().
> + */
> + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> + cpu_relax();
> + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> + restart = true;
> + break;
> + }
> + }
> +
> + /* Upgrade it back to the expectations of entry code. */
> + local_irq_disable();
> + preempt_enable();
> +
> +ret:
> + if (!restart)
> + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + return restart;
> +}
> +
> +/*
> + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> + * context.
> + */
> +void sched_core_unsafe_enter(void)
> +{
> + const struct cpumask *smt_mask;
> + unsigned long flags;
> + struct rq *rq;
> + int i, cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /* Ensure that on return to user/guest, we check whether to wait. */
> + if (current->core_cookie)
> + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> + rq->core_this_unsafe_nest++;
> +
> + /*
> + * Should not nest: enter() should only pair with exit(). Both are done
> + * during the first entry into kernel and the last exit from kernel.
> + * Nested kernel entries (such as nested interrupts) will only trigger
> + * enter() and exit() on the outer most kernel entry and exit.
> + */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + smt_mask = cpu_smt_mask(cpu);
> +
> + /*
> + * Contribute this CPU's unsafe_enter() to the core-wide unsafe_enter()
> + * count. The raw_spin_unlock() release semantics pairs with the nest
> + * counter's smp_load_acquire() in sched_core_wait_till_safe().
> + */
> + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> +
> + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> + goto unlock;
> +
> + if (irq_work_is_busy(&rq->core_irq_work)) {
> + /*
> + * Do nothing more since we are in an IPI sent from another
> + * sibling to enforce safety. That sibling would have sent IPIs
> + * to all of the HTs.
> + */
> + goto unlock;
> + }
> +
> + /*
> + * If we are not the first ones on the core to enter core-wide unsafe
> + * state, do nothing.
> + */
> + if (rq->core->core_unsafe_nest > 1)
> + goto unlock;
> +
> + /* Do nothing more if the core is not tagged. */
> + if (!rq->core->core_cookie)
> + goto unlock;
> +
> + for_each_cpu(i, smt_mask) {
> + struct rq *srq = cpu_rq(i);
> +
> + if (i == cpu || cpu_is_offline(i))
> + continue;
> +
> + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> + continue;
> +
> + /* Skip if HT is not running a tagged task. */
> + if (!srq->curr->core_cookie && !srq->core_pick)
> + continue;
> +
> + /*
> + * Force sibling into the kernel by IPI. If work was already
> + * pending, no new IPIs are sent. This is Ok since the receiver
> + * would already be in the kernel, or on its way to it.
> + */
> + irq_work_queue_on(&srq->core_irq_work, i);
> + }
> +unlock:
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> +/*
> + * Process any work need for either exiting the core-wide unsafe state, or for
> + * waiting on this hyperthread if the core is still in this state.
> + *
> + * @idle: Are we called from the idle loop?
> + */
> +void sched_core_unsafe_exit(void)
> +{
> + unsigned long flags;
> + unsigned int nest;
> + struct rq *rq;
> + int cpu;
> +
> + if (!static_branch_likely(&sched_core_protect_kernel))
> + return;
> +
> + local_irq_save(flags);
> + cpu = smp_processor_id();
> + rq = cpu_rq(cpu);
> +
> + /* Do nothing if core-sched disabled. */
> + if (!sched_core_enabled(rq))
> + goto ret;
> +
> + /*
> + * Can happen when a process is forked and the first return to user
> + * mode is a syscall exit. Either way, there's nothing to do.
> + */
> + if (rq->core_this_unsafe_nest == 0)
> + goto ret;
> +
> + rq->core_this_unsafe_nest--;
> +
> + /* enter() should be paired with exit() only. */
> + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> + goto ret;
> +
> + raw_spin_lock(rq_lockp(rq));
> + /*
> + * Core-wide nesting counter can never be 0 because we are
> + * still in it on this CPU.
> + */
> + nest = rq->core->core_unsafe_nest;
> + WARN_ON_ONCE(!nest);
> +
> + WRITE_ONCE(rq->core->core_unsafe_nest, nest - 1);
> + /*
> + * The raw_spin_unlock release semantics pairs with the nest counter's
> + * smp_load_acquire() in sched_core_wait_till_safe().
> + */
> + raw_spin_unlock(rq_lockp(rq));
> +ret:
> + local_irq_restore(flags);
> +}
> +
> // XXX fairness/fwd progress conditions
> /*
> * Returns
> @@ -4991,6 +5232,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
> rq = cpu_rq(i);
> if (rq->core && rq->core == rq)
> core_rq = rq;
> + init_sched_core_irq_work(rq);
> }
>
> if (!core_rq)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 001382bc67f9..20937a5b6272 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1061,6 +1061,8 @@ struct rq {
> unsigned int core_enabled;
> unsigned int core_sched_seq;
> struct rb_root core_tree;
> + struct irq_work core_irq_work; /* To force HT into kernel */
> + unsigned int core_this_unsafe_nest;
>
> /* shared state */
> unsigned int core_task_seq;
> @@ -1068,6 +1070,7 @@ struct rq {
> unsigned long core_cookie;
> unsigned char core_forceidle;
> unsigned int core_forceidle_seq;
> + unsigned int core_unsafe_nest;
> #endif
> };
>
>

2020-11-16 14:53:50

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Mon, Nov 16, 2020 at 11:08:25AM +0100, Alexandre Chartre wrote:
[..]
> > > > static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > > > unsigned long ti_work)
> > > > {
> > > > @@ -175,7 +198,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > > > * enabled above.
> > > > */
> > > > local_irq_disable_exit_to_user();
> > > > - ti_work = READ_ONCE(current_thread_info()->flags);
> > > > + ti_work = exit_to_user_get_work();
> > > > }
> > >
> > > What happens if the task is scheduled out in exit_to_user_mode_loop? (e.g. if it has
> > > _TIF_NEED_RESCHED set). It will have called sched_core_unsafe_enter() and forced siblings
> > > to wait for it. So shouldn't sched_core_unsafe_exit() be called when the task is
> > > scheduled out? (because it won't run anymore) And sched_core_unsafe_enter() when
> > > the task is scheduled back in?
> >
> > No, when the task is scheduled out, the CPU stays in kernel mode and is now
> > running the task being scheduled in. That task (being scheduled-in) would have
> > already done a sched_core_unsafe_enter(). When that task returns to user mode,
> > it will do a sched_core_unsafe_exit(). When all tasks go to sleep, the last
> > task that enters the idle loop will do a sched_core_unsafe_exit(). Just to
> > note: the "unsafe kernel context" is per-CPU and not per-task. Does that
> > answer your question?
>
> Ok, I think I get it: it works because when a task is scheduled out then the
> scheduler will schedule in a new tagged task (because we have core scheduling).
> So that new task should be accounted for core-wide protection the same way as
> the previous one.

Exactly!

> > > Shouldn't we be using smp_store_release() like sched_core_unsafe_exit() does?
> > >
> > > In any case, it is worth having a comment why WRITE_ONCE() or smp_store_release() is
> > > used.
> >
> > The smp_store_release() in exit() ensures that the write to the nesting
> > counter happens *after* all prior reads and write accesses done by this CPU
> > are seen by the spinning CPU doing the smp_load_acquire() before that
> > spinning CPU returns. I did put a comment there.
> >
> > But, I think I don't need smp_store_release() at all here. The spin_unlock
> > that follows already has the required release semantics. I will demote it to
> > a WRITE_ONCE() in enter() as well, and add appropriate comments.
> >
>
> I think a WRITE_ONCE() is not even useful here. The WRITE_ONCE() will only prevent
> some possible compiler optimization in the function wrt rq->core->core_unsafe_nest, but
> rq->core->core_unsafe_nest is just updated here, and concurrent changes are protected
> by the rq_lockp(rq) spinlock, and the memory barrier is ensured by raw_spin_unlock().
>
> So I think you can just do: rq->core->core_unsafe_nest++;
>
> And in sched_core_unsafe_exit(), you can just do: rq->core->core_unsafe_nest = nest - 1

Hmm, I believe KCSAN will flag this as a data race though. Even though the
variable is modified under the lock, it is still locklessly read in
wait_till_safe(). Though I agree that in practice it may not be that useful,
because we are only checking whether the variable is > 0. If it's OK with you, I
will leave it as WRITE_ONCE for now.
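
As an aside, for readers less familiar with the tooling side of the argument,
here is a user-space analog (my sketch; ThreadSanitizer stands in for KCSAN and
all names are made up). Every writer holds the lock, yet the spinning reader
does not, so with a plain "unsigned int" in place of the atomic, a build with
-fsanitize=thread will typically report a data race; with the relaxed/acquire
atomic accesses, which roughly play the role of WRITE_ONCE()/smp_load_acquire(),
it does not:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint nest;	/* a plain unsigned int here would be flagged */

static void *writer(void *arg)
{
	(void)arg;

	pthread_mutex_lock(&lock);	/* all writers serialize on the lock */
	atomic_fetch_add_explicit(&nest, 1, memory_order_relaxed);
	pthread_mutex_unlock(&lock);

	pthread_mutex_lock(&lock);
	atomic_fetch_sub_explicit(&nest, 1, memory_order_relaxed);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);

	/* Lockless spin, like sched_core_wait_till_safe(): this read races
	 * with the locked writes, which is why both sides must be marked. */
	while (atomic_load_explicit(&nest, memory_order_acquire) > 0)
		;

	pthread_join(t, NULL);
	printf("nest = %u\n", atomic_load_explicit(&nest, memory_order_relaxed));
	return 0;
}
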

> Also comment in sched_core_wait_till_safe() wrt smp_load_acquire() should be updated,
> it should say:
>
> /*
> * Wait till the core of this HT is not in an unsafe state.
> *
> * Pair with raw_spin_unlock(rq_lockp(rq) in sched_core_unsafe_enter/exit()
> */

Ah right, fixed. Thanks.

> > Can I add your Reviewed-by tag to below updated patch? Thanks for review!
>
> Reviewed-by: Alexandre Chartre <[email protected]>

Will add, thanks!

- Joel

>
> >
> > ---8<---
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index bd1a5b87a5e2..a36f08d74e09 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -4678,6 +4678,15 @@
> > sbni= [NET] Granch SBNI12 leased line adapter
> > + sched_core_protect_kernel=
> > + [SCHED_CORE] Pause SMT siblings of a core running in
> > + user mode, if at least one of the siblings of the core
> > + is running in kernel mode. This is to guarantee that
> > + kernel data is not leaked to tasks which are not trusted
> > + by the kernel. A value of 0 disables protection, 1
> > + enables protection. The default is 1. Note that protection
> > + depends on the arch defining the _TIF_UNSAFE_RET flag.
> > +
> > sched_debug [KNL] Enables verbose scheduler debug messages.
> > schedstats= [KNL,X86] Enable or disable scheduled statistics.
> > diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> > index 474f29638d2c..62278c5b3b5f 100644
> > --- a/include/linux/entry-common.h
> > +++ b/include/linux/entry-common.h
> > @@ -33,6 +33,10 @@
> > # define _TIF_PATCH_PENDING (0)
> > #endif
> > +#ifndef _TIF_UNSAFE_RET
> > +# define _TIF_UNSAFE_RET (0)
> > +#endif
> > +
> > #ifndef _TIF_UPROBE
> > # define _TIF_UPROBE (0)
> > #endif
> > @@ -69,7 +73,7 @@
> > #define EXIT_TO_USER_MODE_WORK \
> > (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> > - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
> > + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
> > ARCH_EXIT_TO_USER_MODE_WORK)
> > /**
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d38e904dd603..fe6f225bfbf9 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
> > const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
> > +#ifdef CONFIG_SCHED_CORE
> > +void sched_core_unsafe_enter(void);
> > +void sched_core_unsafe_exit(void);
> > +bool sched_core_wait_till_safe(unsigned long ti_check);
> > +bool sched_core_kernel_protected(void);
> > +#else
> > +#define sched_core_unsafe_enter(ignore) do { } while (0)
> > +#define sched_core_unsafe_exit(ignore) do { } while (0)
> > +#define sched_core_wait_till_safe(ignore) do { } while (0)
> > +#define sched_core_kernel_protected(ignore) do { } while (0)
> > +#endif
> > +
> > #endif
> > diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> > index 2b8366693d5c..d5d88e735d55 100644
> > --- a/kernel/entry/common.c
> > +++ b/kernel/entry/common.c
> > @@ -28,6 +28,8 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
> > instrumentation_begin();
> > trace_hardirqs_off_finish();
> > + if (_TIF_UNSAFE_RET) /* Kernel protection depends on arch defining the flag. */
> > + sched_core_unsafe_enter();
> > instrumentation_end();
> > }
> > @@ -137,6 +139,27 @@ static __always_inline void exit_to_user_mode(void)
> > /* Workaround to allow gradual conversion of architecture code */
> > void __weak arch_do_signal(struct pt_regs *regs) { }
> > +static unsigned long exit_to_user_get_work(void)
> > +{
> > + unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +
> > + if ((IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
> > + || !_TIF_UNSAFE_RET)
> > + return ti_work;
> > +
> > +#ifdef CONFIG_SCHED_CORE
> > + ti_work &= EXIT_TO_USER_MODE_WORK;
> > + if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
> > + sched_core_unsafe_exit();
> > + if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
> > + sched_core_unsafe_enter(); /* not exiting to user yet. */
> > + }
> > + }
> > +
> > + return READ_ONCE(current_thread_info()->flags);
> > +#endif
> > +}
> > +
> > static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > unsigned long ti_work)
> > {
> > @@ -174,7 +197,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > * enabled above.
> > */
> > local_irq_disable_exit_to_user();
> > - ti_work = READ_ONCE(current_thread_info()->flags);
> > + ti_work = exit_to_user_get_work();
> > }
> > /* Return the latest work state for arch_exit_to_user_mode() */
> > @@ -183,9 +206,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > static void exit_to_user_mode_prepare(struct pt_regs *regs)
> > {
> > - unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > + unsigned long ti_work;
> > lockdep_assert_irqs_disabled();
> > + ti_work = exit_to_user_get_work();
> > if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> > ti_work = exit_to_user_mode_loop(regs, ti_work);
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index fa68941998e3..429f9b8ca38e 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -76,6 +76,27 @@ __read_mostly int scheduler_running;
> > #ifdef CONFIG_SCHED_CORE
> > +DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
> > +static int __init set_sched_core_protect_kernel(char *str)
> > +{
> > + unsigned long val = 0;
> > +
> > + if (!str)
> > + return 0;
> > +
> > + if (!kstrtoul(str, 0, &val) && !val)
> > + static_branch_disable(&sched_core_protect_kernel);
> > +
> > + return 1;
> > +}
> > +__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
> > +
> > +/* Is the kernel protected by core scheduling? */
> > +bool sched_core_kernel_protected(void)
> > +{
> > + return static_branch_likely(&sched_core_protect_kernel);
> > +}
> > +
> > DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
> > /* kernel prio, less is more */
> > @@ -4596,6 +4617,226 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> > return a->core_cookie == b->core_cookie;
> > }
> > +/*
> > + * Handler to attempt to enter kernel. It does nothing because the exit to
> > + * usermode or guest mode will do the actual work (of waiting if needed).
> > + */
> > +static void sched_core_irq_work(struct irq_work *work)
> > +{
> > + return;
> > +}
> > +
> > +static inline void init_sched_core_irq_work(struct rq *rq)
> > +{
> > + init_irq_work(&rq->core_irq_work, sched_core_irq_work);
> > +}
> > +
> > +/*
> > + * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
> > + * exits the core-wide unsafe state. Obviously the CPU calling this function
> > + * should not be responsible for the core being in the core-wide unsafe state
> > + * otherwise it will deadlock.
> > + *
> > + * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
> > + * the loop if TIF flags are set and notify caller about it.
> > + *
> > + * IRQs should be disabled.
> > + */
> > +bool sched_core_wait_till_safe(unsigned long ti_check)
> > +{
> > + bool restart = false;
> > + struct rq *rq;
> > + int cpu;
> > +
> > + /* We clear the thread flag only at the end, so no need to check for it. */
> > + ti_check &= ~_TIF_UNSAFE_RET;
> > +
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > +
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /* Down grade to allow interrupts to prevent stop_machine lockups.. */
> > + preempt_disable();
> > + local_irq_enable();
> > +
> > + /*
> > + * Wait till the core of this HT is not in an unsafe state.
> > + *
> > + * Pair with smp_store_release() in sched_core_unsafe_exit().
> > + */
> > + while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> > + cpu_relax();
> > + if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> > + restart = true;
> > + break;
> > + }
> > + }
> > +
> > + /* Upgrade it back to the expectations of entry code. */
> > + local_irq_disable();
> > + preempt_enable();
> > +
> > +ret:
> > + if (!restart)
> > + clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> > +
> > + return restart;
> > +}
> > +
> > +/*
> > + * Enter the core-wide IRQ state. Sibling will be paused if it is running
> > + * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
> > + * avoid sending useless IPIs is made. Must be called only from hard IRQ
> > + * context.
> > + */
> > +void sched_core_unsafe_enter(void)
> > +{
> > + const struct cpumask *smt_mask;
> > + unsigned long flags;
> > + struct rq *rq;
> > + int i, cpu;
> > +
> > + if (!static_branch_likely(&sched_core_protect_kernel))
> > + return;
> > +
> > + local_irq_save(flags);
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /* Ensure that on return to user/guest, we check whether to wait. */
> > + if (current->core_cookie)
> > + set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> > +
> > + /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> > + rq->core_this_unsafe_nest++;
> > +
> > + /*
> > + * Should not nest: enter() should only pair with exit(). Both are done
> > + * during the first entry into kernel and the last exit from kernel.
> > + * Nested kernel entries (such as nested interrupts) will only trigger
> > + * enter() and exit() on the outer most kernel entry and exit.
> > + */
> > + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> > + goto ret;
> > +
> > + raw_spin_lock(rq_lockp(rq));
> > + smt_mask = cpu_smt_mask(cpu);
> > +
> > + /*
> > + * Contribute this CPU's unsafe_enter() to the core-wide unsafe_enter()
> > + * count. The raw_spin_unlock() release semantics pairs with the nest
> > + * counter's smp_load_acquire() in sched_core_wait_till_safe().
> > + */
> > + WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> > +
> > + if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> > + goto unlock;
> > +
> > + if (irq_work_is_busy(&rq->core_irq_work)) {
> > + /*
> > + * Do nothing more since we are in an IPI sent from another
> > + * sibling to enforce safety. That sibling would have sent IPIs
> > + * to all of the HTs.
> > + */
> > + goto unlock;
> > + }
> > +
> > + /*
> > + * If we are not the first ones on the core to enter core-wide unsafe
> > + * state, do nothing.
> > + */
> > + if (rq->core->core_unsafe_nest > 1)
> > + goto unlock;
> > +
> > + /* Do nothing more if the core is not tagged. */
> > + if (!rq->core->core_cookie)
> > + goto unlock;
> > +
> > + for_each_cpu(i, smt_mask) {
> > + struct rq *srq = cpu_rq(i);
> > +
> > + if (i == cpu || cpu_is_offline(i))
> > + continue;
> > +
> > + if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> > + continue;
> > +
> > + /* Skip if HT is not running a tagged task. */
> > + if (!srq->curr->core_cookie && !srq->core_pick)
> > + continue;
> > +
> > + /*
> > + * Force sibling into the kernel by IPI. If work was already
> > + * pending, no new IPIs are sent. This is Ok since the receiver
> > + * would already be in the kernel, or on its way to it.
> > + */
> > + irq_work_queue_on(&srq->core_irq_work, i);
> > + }
> > +unlock:
> > + raw_spin_unlock(rq_lockp(rq));
> > +ret:
> > + local_irq_restore(flags);
> > +}
> > +
> > +/*
> > + * Process any work need for either exiting the core-wide unsafe state, or for
> > + * waiting on this hyperthread if the core is still in this state.
> > + *
> > + * @idle: Are we called from the idle loop?
> > + */
> > +void sched_core_unsafe_exit(void)
> > +{
> > + unsigned long flags;
> > + unsigned int nest;
> > + struct rq *rq;
> > + int cpu;
> > +
> > + if (!static_branch_likely(&sched_core_protect_kernel))
> > + return;
> > +
> > + local_irq_save(flags);
> > + cpu = smp_processor_id();
> > + rq = cpu_rq(cpu);
> > +
> > + /* Do nothing if core-sched disabled. */
> > + if (!sched_core_enabled(rq))
> > + goto ret;
> > +
> > + /*
> > + * Can happen when a process is forked and the first return to user
> > + * mode is a syscall exit. Either way, there's nothing to do.
> > + */
> > + if (rq->core_this_unsafe_nest == 0)
> > + goto ret;
> > +
> > + rq->core_this_unsafe_nest--;
> > +
> > + /* enter() should be paired with exit() only. */
> > + if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
> > + goto ret;
> > +
> > + raw_spin_lock(rq_lockp(rq));
> > + /*
> > + * Core-wide nesting counter can never be 0 because we are
> > + * still in it on this CPU.
> > + */
> > + nest = rq->core->core_unsafe_nest;
> > + WARN_ON_ONCE(!nest);
> > +
> > + WRITE_ONCE(rq->core->core_unsafe_nest, nest - 1);
> > + /*
> > + * The raw_spin_unlock release semantics pairs with the nest counter's
> > + * smp_load_acquire() in sched_core_wait_till_safe().
> > + */
> > + raw_spin_unlock(rq_lockp(rq));
> > +ret:
> > + local_irq_restore(flags);
> > +}
> > +
> > // XXX fairness/fwd progress conditions
> > /*
> > * Returns
> > @@ -4991,6 +5232,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
> > rq = cpu_rq(i);
> > if (rq->core && rq->core == rq)
> > core_rq = rq;
> > + init_sched_core_irq_work(rq);
> > }
> > if (!core_rq)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 001382bc67f9..20937a5b6272 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -1061,6 +1061,8 @@ struct rq {
> > unsigned int core_enabled;
> > unsigned int core_sched_seq;
> > struct rb_root core_tree;
> > + struct irq_work core_irq_work; /* To force HT into kernel */
> > + unsigned int core_this_unsafe_nest;
> > /* shared state */
> > unsigned int core_task_seq;
> > @@ -1068,6 +1070,7 @@ struct rq {
> > unsigned long core_cookie;
> > unsigned char core_forceidle;
> > unsigned int core_forceidle_seq;
> > + unsigned int core_unsafe_nest;
> > #endif
> > };
> >

2020-11-16 15:48:05

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

On Mon, Nov 16, 2020 at 09:50:37AM -0500, Joel Fernandes wrote:

> > > Can I add your Reviewed-by tag to below updated patch? Thanks for review!
> >
> > Reviewed-by: Alexandre Chartre <[email protected]>
>
> Will add, thanks!
>
> - Joel

Alexandre, there was one more trivial fixup I had to make. Just fyi:
https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git/commit/?h=coresched&id=06a302df95f3a235e2680102ec4e5da10c9b87a0

Basically, I need to conditionally call sched_core_unsafe_exit() depending on
the value of the sched_core_protect_kernel= option and on CONFIG_SCHED_CORE. Below is
updated patch:

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bd1a5b87a5e2..a36f08d74e09 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,15 @@

sbni= [NET] Granch SBNI12 leased line adapter

+ sched_core_protect_kernel=
+ [SCHED_CORE] Pause SMT siblings of a core running in
+ user mode, if at least one of the siblings of the core
+ is running in kernel mode. This is to guarantee that
+ kernel data is not leaked to tasks which are not trusted
+ by the kernel. A value of 0 disables protection, 1
+ enables protection. The default is 1. Note that protection
+ depends on the arch defining the _TIF_UNSAFE_RET flag.
+
sched_debug [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 1a128baf3628..022e1f114157 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -33,6 +33,10 @@
# define _TIF_PATCH_PENDING (0)
#endif

+#ifndef _TIF_UNSAFE_RET
+# define _TIF_UNSAFE_RET (0)
+#endif
+
#ifndef _TIF_UPROBE
# define _TIF_UPROBE (0)
#endif
@@ -74,7 +78,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
- ARCH_EXIT_TO_USER_MODE_WORK)
+ _TIF_UNSAFE_RET | ARCH_EXIT_TO_USER_MODE_WORK)

/**
* arch_check_user_regs - Architecture specific sanity check for user mode regs
@@ -444,4 +448,10 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
*/
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);

+/* entry_kernel_protected - Is kernel protection on entry/exit into kernel supported? */
+static inline bool entry_kernel_protected(void)
+{
+ return IS_ENABLED(CONFIG_SCHED_CORE) && sched_core_kernel_protected()
+ && _TIF_UNSAFE_RET != 0;
+}
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d38e904dd603..fe6f225bfbf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);

const struct cpumask *sched_trace_rd_span(struct root_domain *rd);

+#ifdef CONFIG_SCHED_CORE
+void sched_core_unsafe_enter(void);
+void sched_core_unsafe_exit(void);
+bool sched_core_wait_till_safe(unsigned long ti_check);
+bool sched_core_kernel_protected(void);
+#else
+#define sched_core_unsafe_enter(ignore) do { } while (0)
+#define sched_core_unsafe_exit(ignore) do { } while (0)
+#define sched_core_wait_till_safe(ignore) do { } while (0)
+#define sched_core_kernel_protected(ignore) do { } while (0)
+#endif
+
#endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bc75c114c1b3..9d9d926f2a1c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -28,6 +28,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)

instrumentation_begin();
trace_hardirqs_off_finish();
+
+ if (entry_kernel_protected())
+ sched_core_unsafe_enter();
instrumentation_end();
}

@@ -145,6 +148,26 @@ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

+static unsigned long exit_to_user_get_work(void)
+{
+ unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+ if (!entry_kernel_protected())
+ return ti_work;
+
+#ifdef CONFIG_SCHED_CORE
+ ti_work &= EXIT_TO_USER_MODE_WORK;
+ if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
+ sched_core_unsafe_exit();
+ if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
+ sched_core_unsafe_enter(); /* not exiting to user yet. */
+ }
+ }
+
+ return READ_ONCE(current_thread_info()->flags);
+#endif
+}
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -182,7 +205,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* enabled above.
*/
local_irq_disable_exit_to_user();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = exit_to_user_get_work();
}

/* Return the latest work state for arch_exit_to_user_mode() */
@@ -191,9 +214,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work;

lockdep_assert_irqs_disabled();
+ ti_work = exit_to_user_get_work();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5e04078ba5d..56d6a382e3ff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -76,6 +76,27 @@ __read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

+DEFINE_STATIC_KEY_TRUE(sched_core_protect_kernel);
+static int __init set_sched_core_protect_kernel(char *str)
+{
+ unsigned long val = 0;
+
+ if (!str)
+ return 0;
+
+ if (!kstrtoul(str, 0, &val) && !val)
+ static_branch_disable(&sched_core_protect_kernel);
+
+ return 1;
+}
+__setup("sched_core_protect_kernel=", set_sched_core_protect_kernel);
+
+/* Is the kernel protected by core scheduling? */
+bool sched_core_kernel_protected(void)
+{
+ return static_branch_likely(&sched_core_protect_kernel);
+}
+
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
@@ -4596,6 +4617,226 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}

+/*
+ * Handler to attempt to enter kernel. It does nothing because the exit to
+ * usermode or guest mode will do the actual work (of waiting if needed).
+ */
+static void sched_core_irq_work(struct irq_work *work)
+{
+ return;
+}
+
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+ init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
+/*
+ * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
+ * exits the core-wide unsafe state. Obviously the CPU calling this function
+ * should not be responsible for the core being in the core-wide unsafe state
+ * otherwise it will deadlock.
+ *
+ * @ti_check: We spin here with IRQ enabled and preempt disabled. Break out of
+ * the loop if TIF flags are set and notify caller about it.
+ *
+ * IRQs should be disabled.
+ */
+bool sched_core_wait_till_safe(unsigned long ti_check)
+{
+ bool restart = false;
+ struct rq *rq;
+ int cpu;
+
+ /* We clear the thread flag only at the end, so no need to check for it. */
+ ti_check &= ~_TIF_UNSAFE_RET;
+
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Down grade to allow interrupts to prevent stop_machine lockups.. */
+ preempt_disable();
+ local_irq_enable();
+
+ /*
+ * Wait till the core of this HT is not in an unsafe state.
+ *
+ * Pair with raw_spin_lock/unlock() in sched_core_unsafe_enter/exit().
+ */
+ while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
+ cpu_relax();
+ if (READ_ONCE(current_thread_info()->flags) & ti_check) {
+ restart = true;
+ break;
+ }
+ }
+
+ /* Upgrade it back to the expectations of entry code. */
+ local_irq_disable();
+ preempt_enable();
+
+ret:
+ if (!restart)
+ clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ return restart;
+}
+
+/*
+ * Enter the core-wide IRQ state. Sibling will be paused if it is running
+ * 'untrusted' code, until sched_core_unsafe_exit() is called. Every attempt to
+ * avoid sending useless IPIs is made. Must be called only from hard IRQ
+ * context.
+ */
+void sched_core_unsafe_enter(void)
+{
+ const struct cpumask *smt_mask;
+ unsigned long flags;
+ struct rq *rq;
+ int i, cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /* Ensure that on return to user/guest, we check whether to wait. */
+ if (current->core_cookie)
+ set_tsk_thread_flag(current, TIF_UNSAFE_RET);
+
+ /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
+ rq->core_this_unsafe_nest++;
+
+ /*
+ * Should not nest: enter() should only pair with exit(). Both are done
+ * during the first entry into kernel and the last exit from kernel.
+ * Nested kernel entries (such as nested interrupts) will only trigger
+ * enter() and exit() on the outer most kernel entry and exit.
+ */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ smt_mask = cpu_smt_mask(cpu);
+
+ /*
+ * Contribute this CPU's unsafe_enter() to the core-wide unsafe_enter()
+ * count. The raw_spin_unlock() release semantics pairs with the nest
+ * counter's smp_load_acquire() in sched_core_wait_till_safe().
+ */
+ WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
+
+ if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
+ goto unlock;
+
+ if (irq_work_is_busy(&rq->core_irq_work)) {
+ /*
+ * Do nothing more since we are in an IPI sent from another
+ * sibling to enforce safety. That sibling would have sent IPIs
+ * to all of the HTs.
+ */
+ goto unlock;
+ }
+
+ /*
+ * If we are not the first ones on the core to enter core-wide unsafe
+ * state, do nothing.
+ */
+ if (rq->core->core_unsafe_nest > 1)
+ goto unlock;
+
+ /* Do nothing more if the core is not tagged. */
+ if (!rq->core->core_cookie)
+ goto unlock;
+
+ for_each_cpu(i, smt_mask) {
+ struct rq *srq = cpu_rq(i);
+
+ if (i == cpu || cpu_is_offline(i))
+ continue;
+
+ if (!srq->curr->mm || is_task_rq_idle(srq->curr))
+ continue;
+
+ /* Skip if HT is not running a tagged task. */
+ if (!srq->curr->core_cookie && !srq->core_pick)
+ continue;
+
+ /*
+ * Force sibling into the kernel by IPI. If work was already
+ * pending, no new IPIs are sent. This is Ok since the receiver
+ * would already be in the kernel, or on its way to it.
+ */
+ irq_work_queue_on(&srq->core_irq_work, i);
+ }
+unlock:
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
+/*
+ * Process any work need for either exiting the core-wide unsafe state, or for
+ * waiting on this hyperthread if the core is still in this state.
+ *
+ * @idle: Are we called from the idle loop?
+ */
+void sched_core_unsafe_exit(void)
+{
+ unsigned long flags;
+ unsigned int nest;
+ struct rq *rq;
+ int cpu;
+
+ if (!static_branch_likely(&sched_core_protect_kernel))
+ return;
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+
+ /* Do nothing if core-sched disabled. */
+ if (!sched_core_enabled(rq))
+ goto ret;
+
+ /*
+ * Can happen when a process is forked and the first return to user
+ * mode is a syscall exit. Either way, there's nothing to do.
+ */
+ if (rq->core_this_unsafe_nest == 0)
+ goto ret;
+
+ rq->core_this_unsafe_nest--;
+
+ /* enter() should be paired with exit() only. */
+ if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 0))
+ goto ret;
+
+ raw_spin_lock(rq_lockp(rq));
+ /*
+ * Core-wide nesting counter can never be 0 because we are
+ * still in it on this CPU.
+ */
+ nest = rq->core->core_unsafe_nest;
+ WARN_ON_ONCE(!nest);
+
+ WRITE_ONCE(rq->core->core_unsafe_nest, nest - 1);
+ /*
+ * The raw_spin_unlock release semantics pairs with the nest counter's
+ * smp_load_acquire() in sched_core_wait_till_safe().
+ */
+ raw_spin_unlock(rq_lockp(rq));
+ret:
+ local_irq_restore(flags);
+}
+
// XXX fairness/fwd progress conditions
/*
* Returns
@@ -4991,6 +5232,7 @@ static inline void sched_core_cpu_starting(unsigned int cpu)
rq = cpu_rq(i);
if (rq->core && rq->core == rq)
core_rq = rq;
+ init_sched_core_irq_work(rq);
}

if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cd74cc41c8da..acf187c36fc4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1063,6 +1063,8 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
+ struct irq_work core_irq_work; /* To force HT into kernel */
+ unsigned int core_this_unsafe_nest;

/* shared state */
unsigned int core_task_seq;
@@ -1070,6 +1072,7 @@ struct rq {
unsigned long core_cookie;
unsigned char core_forceidle;
unsigned int core_forceidle_seq;
+ unsigned int core_unsafe_nest;
#endif
};