2022-02-02 14:23:51

by Mathieu Desnoyers

Subject: [RFC PATCH 1/3] Introduce per thread group current virtual cpu id

This feature allows the scheduler to expose a current virtual cpu id
to user-space. This virtual cpu id is within the possible cpus range,
and is temporarily (and uniquely) assigned while threads are actively
running within a thread group. If a thread group has fewer threads than
cores, or is limited to running on a few cores concurrently through sched
affinity or cgroup cpusets, the virtual cpu ids will be values close
to 0, thus allowing efficient use of user-space memory for per-cpu
data structures.

This feature is meant to be exposed by a new rseq thread area field.
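
For illustration (not part of this patch): because the virtual cpu ids stay
packed near 0, user-space can size per-vcpu data by the possible-cpus count
rather than by thread count and still get mostly uncontended slots. A minimal
sketch, assuming a hypothetical rseq_current_tg_vcpu_id() accessor for the
field added later in this series:

#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

struct percpu_counter {
	intptr_t count;
} __attribute__((aligned(128)));	/* keep slots on separate cache lines */

static struct percpu_counter *counters;

static int counters_init(void)
{
	/* Approximation of the possible-cpus range used by the kernel. */
	long nr_cpus = sysconf(_SC_NPROCESSORS_CONF);

	counters = calloc(nr_cpus, sizeof(*counters));
	return counters ? 0 : -1;
}

static void counter_inc(void)
{
	/*
	 * Indexing only; a real per-vcpu update would sit inside an rseq
	 * critical section so it can be aborted and retried if the thread
	 * is preempted and its vcpu id changes.
	 */
	counters[rseq_current_tg_vcpu_id()].count++;
}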

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
fs/exec.c | 4 +++
include/linux/sched.h | 4 +++
include/linux/sched/signal.h | 49 ++++++++++++++++++++++++++++++++++++
init/Kconfig | 14 +++++++++++
kernel/sched/core.c | 2 ++
5 files changed, 73 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..bc9a8c5f17f4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1153,6 +1153,10 @@ static int de_thread(struct task_struct *tsk)
sig->group_exec_task = NULL;
sig->notify_count = 0;

+ /* Release possibly high vcpu id, get vcpu id 0. */
+ tg_vcpu_put(tsk);
+ tg_vcpu_get(tsk);
+
no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 838c9e0b4cae..0f199daed26a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1300,6 +1300,10 @@ struct task_struct {
unsigned long rseq_event_mask;
#endif

+#ifdef CONFIG_SCHED_THREAD_GROUP_VCPU
+ int tg_vcpu; /* Current vcpu in thread group */
+#endif
+
struct tlbflush_unmap_batch tlb_ubc;

union {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index b6ecb9fc4cd2..c87e7ad5a1ea 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -244,6 +244,12 @@ struct signal_struct {
* and may have inconsistent
* permissions.
*/
+#ifdef CONFIG_SCHED_THREAD_GROUP_VCPU
+ /*
+ * Mask of allocated vcpu ids within the thread group.
+ */
+ cpumask_t vcpu_mask;
+#endif
} __randomize_layout;

/*
@@ -742,4 +748,47 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}

+#ifdef CONFIG_SCHED_THREAD_GROUP_VCPU
+static inline void tg_vcpu_get(struct task_struct *t)
+{
+ struct cpumask *cpumask = &t->signal->vcpu_mask;
+ unsigned int vcpu;
+
+ if (t->flags & PF_KTHREAD)
+ return;
+ /* Atomically reserve lowest available vcpu number. */
+ do {
+ vcpu = cpumask_first_zero(cpumask);
+ WARN_ON_ONCE(vcpu >= nr_cpu_ids);
+ } while (cpumask_test_and_set_cpu(vcpu, cpumask));
+ t->tg_vcpu = vcpu;
+}
+
+static inline void tg_vcpu_put(struct task_struct *t)
+{
+ if (t->flags & PF_KTHREAD)
+ return;
+ cpumask_clear_cpu(t->tg_vcpu, &t->signal->vcpu_mask);
+ t->tg_vcpu = 0;
+}
+
+static inline int task_tg_vcpu_id(struct task_struct *t)
+{
+ return t->tg_vcpu;
+}
+#else
+static inline void tg_vcpu_get(struct task_struct *t) { }
+static inline void tg_vcpu_put(struct task_struct *t) { }
+static inline int task_tg_vcpu_id(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the thread group vcpu
+ * feature is disabled. This provides functional per-cpu data structure
+ * accesses in user-space, although it won't provide the memory usage
+ * benefits.
+ */
+ return raw_smp_processor_id();
+}
+#endif
+
#endif /* _LINUX_SCHED_SIGNAL_H */
diff --git a/init/Kconfig b/init/Kconfig
index e9119bf54b1f..5f72b4212a33 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1023,6 +1023,20 @@ config RT_GROUP_SCHED

endif #CGROUP_SCHED

+config SCHED_THREAD_GROUP_VCPU
+ bool "Provide per-thread-group virtual cpu id"
+ depends on SMP
+ default n
+ help
+ This feature allows the scheduler to expose a current virtual cpu id
+ to user-space. This virtual cpu id is within the possible cpus range,
+ and is temporarily (and uniquely) assigned while threads are actively
+ running within a thread group. If a thread group has fewer threads than
+ cores, or is limited to running on a few cores concurrently through sched
+ affinity or cgroup cpusets, the virtual cpu ids will be values close
+ to 0, thus allowing efficient use of user-space memory for per-cpu
+ data structures.
+
config UCLAMP_TASK_GROUP
bool "Utilization clamping per group of tasks"
depends on CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e4ae00e52d1..2690e80977b1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4795,6 +4795,8 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ tg_vcpu_put(prev);
+ tg_vcpu_get(next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
--
2.17.1


2022-02-02 16:18:05

by Mathieu Desnoyers

Subject: Re: [RFC PATCH 1/3] Introduce per thread group current virtual cpu id

----- On Feb 2, 2022, at 6:23 AM, Peter Zijlstra [email protected] wrote:

> On Tue, Feb 01, 2022 at 02:25:38PM -0500, Mathieu Desnoyers wrote:
>
>> +static inline void tg_vcpu_get(struct task_struct *t)
>> +{
>> + struct cpumask *cpumask = &t->signal->vcpu_mask;
>> + unsigned int vcpu;
>> +
>> + if (t->flags & PF_KTHREAD)
>> + return;
>> + /* Atomically reserve lowest available vcpu number. */
>> + do {
>> + vcpu = cpumask_first_zero(cpumask);
>> + WARN_ON_ONCE(vcpu >= nr_cpu_ids);
>> + } while (cpumask_test_and_set_cpu(vcpu, cpumask));
>> + t->tg_vcpu = vcpu;
>> +}
>> +
>> +static inline void tg_vcpu_put(struct task_struct *t)
>> +{
>> + if (t->flags & PF_KTHREAD)
>> + return;
>> + cpumask_clear_cpu(t->tg_vcpu, &t->signal->vcpu_mask);
>> + t->tg_vcpu = 0;
>> +}
>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 2e4ae00e52d1..2690e80977b1 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -4795,6 +4795,8 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
>> sched_info_switch(rq, prev, next);
>> perf_event_task_sched_out(prev, next);
>> rseq_preempt(prev);
>> + tg_vcpu_put(prev);
>> + tg_vcpu_get(next);
>
>
> URGGHHH!!! that's *2* atomics extra on the context switch path. Worse,
> that's on a line that's trivially contended with a few threads.

There is one obvious optimization that just begs to be done here: when
switching between threads belonging to the same process, we can simply
take the vcpu_id tag of the prev thread and use it for next,
without requiring any atomic operation.
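
A minimal sketch of that fast path (hypothetical tg_vcpu_switch() helper; it
reuses tg_vcpu_get()/tg_vcpu_put() from the patch and would replace the
put/get pair in prepare_task_switch()):

static inline void tg_vcpu_switch(struct task_struct *prev,
				  struct task_struct *next)
{
	if (!(prev->flags & PF_KTHREAD) && !(next->flags & PF_KTHREAD) &&
	    prev->signal == next->signal) {
		/*
		 * Same thread group: hand the vcpu id from prev to next
		 * without touching the shared vcpu_mask, so no atomics
		 * on this path.
		 */
		next->tg_vcpu = prev->tg_vcpu;
		return;
	}
	/* Cross-process (or kthread) switch: atomic release/reserve pair. */
	tg_vcpu_put(prev);
	tg_vcpu_get(next);
}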

This only leaves the overhead of added atomics when scheduling between
threads which belong to different processes. Does it matter as much?
If it does, then we should really scratch our heads a little more
to come up with improvements.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

2022-02-02 22:44:07

by Mathieu Desnoyers

Subject: [RFC PATCH 3/3] selftests/rseq: Implement rseq tg_vcpu_id field support

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
tools/testing/selftests/rseq/rseq-abi.h | 15 +++++++++++++++
tools/testing/selftests/rseq/rseq.c | 6 +++---
2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index 850827e8d089..929183c2b3c0 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -169,6 +169,21 @@ struct rseq_abi {
* rseq_len. Use the offset immediately after the node_id field as
* rseq_len.
*/
+
+ /*
+ * Restartable sequences tg_vcpu_id field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current thread's virtual CPU ID
+ * (allocated uniquely within thread group).
+ */
+ __u32 tg_vcpu_id;
+
+ /*
+ * This is a valid end of rseq ABI for the purpose of rseq registration
+ * rseq_len. Use the offset immediately after the tg_vcpu_id field as
+ * rseq_len.
+ */
} __attribute__((aligned(4 * sizeof(__u64))));

#endif /* _RSEQ_ABI_H */
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 4b0e68051db8..c8d30e770d59 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -88,7 +88,7 @@ int rseq_register_current_thread(void)
/* Treat libc's ownership as a successful registration. */
return 0;
}
- rc = sys_rseq(&__rseq_abi, rseq_offsetofend(struct rseq_abi, node_id), 0, RSEQ_SIG);
+ rc = sys_rseq(&__rseq_abi, rseq_offsetofend(struct rseq_abi, tg_vcpu_id), 0, RSEQ_SIG);
if (rc)
return -1;
assert(rseq_current_cpu_raw() >= 0);
@@ -103,7 +103,7 @@ int rseq_unregister_current_thread(void)
/* Treat libc's ownership as a successful unregistration. */
return 0;
}
- rc = sys_rseq(&__rseq_abi, rseq_offsetofend(struct rseq_abi, node_id), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
+ rc = sys_rseq(&__rseq_abi, rseq_offsetofend(struct rseq_abi, tg_vcpu_id), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
if (rc)
return -1;
return 0;
@@ -126,7 +126,7 @@ void rseq_init(void)
return;
rseq_ownership = 1;
rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
- rseq_size = rseq_offsetofend(struct rseq_abi, node_id);
+ rseq_size = rseq_offsetofend(struct rseq_abi, tg_vcpu_id);
rseq_flags = 0;
}

--
2.17.1
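
For reference (not part of the patch), reading the new field from user-space
could follow the same pattern as rseq_current_cpu_raw(); this assumes the
rseq_get_abi() and RSEQ_READ_ONCE() helpers already used by the selftests:

static inline __u32 rseq_current_tg_vcpu_id(void)
{
	/* Single-copy atomic read of the kernel-updated field. */
	return RSEQ_READ_ONCE(rseq_get_abi()->tg_vcpu_id);
}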

2022-02-05 01:17:57

by Peter Zijlstra

Subject: Re: [RFC PATCH 1/3] Introduce per thread group current virtual cpu id

On Tue, Feb 01, 2022 at 02:25:38PM -0500, Mathieu Desnoyers wrote:

> +static inline void tg_vcpu_get(struct task_struct *t)
> +{
> + struct cpumask *cpumask = &t->signal->vcpu_mask;
> + unsigned int vcpu;
> +
> + if (t->flags & PF_KTHREAD)
> + return;
> + /* Atomically reserve lowest available vcpu number. */
> + do {
> + vcpu = cpumask_first_zero(cpumask);
> + WARN_ON_ONCE(vcpu >= nr_cpu_ids);
> + } while (cpumask_test_and_set_cpu(vcpu, cpumask));
> + t->tg_vcpu = vcpu;
> +}
> +
> +static inline void tg_vcpu_put(struct task_struct *t)
> +{
> + if (t->flags & PF_KTHREAD)
> + return;
> + cpumask_clear_cpu(t->tg_vcpu, &t->signal->vcpu_mask);
> + t->tg_vcpu = 0;
> +}

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2e4ae00e52d1..2690e80977b1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4795,6 +4795,8 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
> sched_info_switch(rq, prev, next);
> perf_event_task_sched_out(prev, next);
> rseq_preempt(prev);
> + tg_vcpu_put(prev);
> + tg_vcpu_get(next);


URGGHHH!!! that's *2* atomics extra on the context switch path. Worse,
that's on a line that's trivially contended with a few threads.