Export the rseq feature size supported by the kernel as well as the
required allocation alignment for the rseq per-thread area to user-space
through ELF auxiliary vector entries.
This is part of the extensible rseq ABI.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
fs/binfmt_elf.c | 5 +++++
include/uapi/linux/auxvec.h | 2 ++
include/uapi/linux/rseq.h | 5 +++++
3 files changed, 12 insertions(+)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 605017eb9349..77776582e76d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,6 +46,7 @@
#include <linux/cred.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
+#include <linux/rseq.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -286,6 +287,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
if (bprm->have_execfd) {
NEW_AUX_ENT(AT_EXECFD, bprm->execfd);
}
+#ifdef CONFIG_RSEQ
+ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
+ NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
+#endif
#undef NEW_AUX_ENT
/* AT_NULL is zero; clear the rest too */
memset(elf_info, 0, (char *)mm->saved_auxv +
diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index c7e502bf5a6f..6991c4b8ab18 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -30,6 +30,8 @@
* differ from AT_PLATFORM. */
#define AT_RANDOM 25 /* address of 16 random bytes */
#define AT_HWCAP2 26 /* extension of AT_HWCAP */
+#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
+#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */
#define AT_EXECFN 31 /* filename of program */
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 77ee207623a9..05d3c4cdeb40 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -130,6 +130,11 @@ struct rseq {
* this thread.
*/
__u32 flags;
+
+ /*
+ * Flexible array member at end of structure, after last feature field.
+ */
+ char end[];
} __attribute__((aligned(4 * sizeof(__u64))));
#endif /* _UAPI_LINUX_RSEQ_H */
--
2.17.1
Introduce the extensible rseq ABI, where the feature size supported by
the kernel and the required alignment are communicated to user-space
through ELF auxiliary vectors.
This allows user-space to call rseq registration with a rseq_len of
either 32 bytes for the original struct rseq size (which includes
padding), or larger.
If rseq_len is larger than 32 bytes, then it must be large enough to
contain the feature size communicated to user-space through ELF
auxiliary vectors.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/sched.h | 4 ++++
kernel/ptrace.c | 2 +-
kernel/rseq.c | 33 +++++++++++++++++++++++++++------
3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 508b91d57470..838c9e0b4cae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1291,6 +1291,7 @@ struct task_struct {
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
+ u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
@@ -2260,10 +2261,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
} else {
t->rseq = current->rseq;
+ t->rseq_len = current->rseq_len;
t->rseq_sig = current->rseq_sig;
t->rseq_event_mask = current->rseq_event_mask;
}
@@ -2272,6 +2275,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
static inline void rseq_execve(struct task_struct *t)
{
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..f5edde5b7805 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -800,7 +800,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97ac20b4f738..46dc5c2ce2b7 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
@@ -86,10 +89,15 @@ static int rseq_update_cpu_id(struct task_struct *t)
u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -116,6 +124,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}
@@ -336,7 +349,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -345,6 +358,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}
@@ -357,7 +371,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -366,15 +380,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}
/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len != ORIG_RSEQ_SIZE && rseq_len < offsetof(struct rseq, end)))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
--
2.17.1
----- On Feb 6, 2022, at 4:49 PM, Peter Zijlstra [email protected] wrote:
> On Thu, Feb 03, 2022 at 02:38:51PM -0500, Mathieu Desnoyers wrote:
>
>> @@ -286,6 +287,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct
>> elfhdr *exec,
>> if (bprm->have_execfd) {
>> NEW_AUX_ENT(AT_EXECFD, bprm->execfd);
>> }
>> +#ifdef CONFIG_RSEQ
>> + NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
>
> I've gotta ask, what's up with offsetof(, end) vs sizeof() ?
sizeof() includes the 12 bytes of padding at the end of struct rseq, for a
total of 32 bytes (currently). offsetof(, end) is currently 20 bytes, which
is the offset exactly after the last field.
For the "feature size" (meaning the populated fields), we really want a size
that excludes padding.
Thanks,
Mathieu
>
>> + NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
> > +#endif
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
Adding the NUMA node id to struct rseq is a straightforward thing to do,
and a good way to figure out if anything in the user-space ecosystem
prevents extending struct rseq.
This NUMA node id field allows memory allocators such as tcmalloc to
take advantage of fast access to the current NUMA node id to perform
NUMA-aware memory allocation.
It can also be useful for implementing fast-paths for NUMA-aware
user-space mutexes.
It also allows implementing getcpu(2) purely in user-space.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/trace/events/rseq.h | 4 +++-
include/uapi/linux/rseq.h | 8 ++++++++
kernel/rseq.c | 19 +++++++++++++------
3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
index a04a64bc1a00..6bd442697354 100644
--- a/include/trace/events/rseq.h
+++ b/include/trace/events/rseq.h
@@ -16,13 +16,15 @@ TRACE_EVENT(rseq_update,
TP_STRUCT__entry(
__field(s32, cpu_id)
+ __field(s32, node_id)
),
TP_fast_assign(
__entry->cpu_id = raw_smp_processor_id();
+ __entry->node_id = cpu_to_node(raw_smp_processor_id());
),
- TP_printk("cpu_id=%d", __entry->cpu_id)
+ TP_printk("cpu_id=%d node_id=%d", __entry->cpu_id, __entry->node_id)
);
TRACE_EVENT(rseq_ip_fixup,
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 05d3c4cdeb40..1cb90a435c5c 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -131,6 +131,14 @@ struct rseq {
*/
__u32 flags;
+ /*
+ * Restartable sequences node_id field. Updated by the kernel. Read by
+ * user-space with single-copy atomicity semantics. This field should
+ * only be read by the thread which registered this data structure.
+ * Aligned on 32-bit. Contains the current NUMA node ID.
+ */
+ __u32 node_id;
+
/*
* Flexible array member at end of structure, after last feature field.
*/
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 46dc5c2ce2b7..cb7d8a5afc82 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -84,15 +84,17 @@
* F1. <failure>
*/
-static int rseq_update_cpu_id(struct task_struct *t)
+static int rseq_update_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;
+ u32 cpu_id = raw_smp_processor_id();
+ u32 node_id = cpu_to_node(cpu_id);
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_put_user(node_id, &rseq->node_id, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
@@ -108,9 +110,9 @@ static int rseq_update_cpu_id(struct task_struct *t)
return -EFAULT;
}
-static int rseq_reset_rseq_cpu_id(struct task_struct *t)
+static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+ u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0;
/*
* Reset cpu_id_start to its initial state (0).
@@ -124,6 +126,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Reset node_id to its initial state (0).
+ */
+ if (put_user(node_id, &t->rseq->node_id))
+ return -EFAULT;
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
@@ -306,7 +313,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(ret < 0))
goto error;
}
- if (unlikely(rseq_update_cpu_id(t)))
+ if (unlikely(rseq_update_cpu_node_id(t)))
goto error;
return;
@@ -353,7 +360,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_id(current);
+ ret = rseq_reset_rseq_cpu_node_id(current);
if (ret)
return ret;
current->rseq = NULL;
--
2.17.1
On Thu, Feb 03, 2022 at 02:38:51PM -0500, Mathieu Desnoyers wrote:
> @@ -286,6 +287,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> if (bprm->have_execfd) {
> NEW_AUX_ENT(AT_EXECFD, bprm->execfd);
> }
> +#ifdef CONFIG_RSEQ
> + NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
I've gotta ask, what's up with offsetof(, end) vs sizeof() ?
> + NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
> +#endif