2022-07-29 19:16:40

by Mathieu Desnoyers

[permalink] [raw]
Subject: [PATCH v3 02/23] rseq: Introduce extensible rseq ABI

Introduce the extensible rseq ABI, where the feature size supported by
the kernel and the required alignment are communicated to user-space
through ELF auxiliary vectors.

This allows user-space to call rseq registration with a rseq_len of
either 32 bytes for the original struct rseq size (which includes
padding), or larger.

If rseq_len is larger than 32 bytes, then it must be large enough to
contain the feature size communicated to user-space through ELF
auxiliary vectors.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/sched.h | 4 ++++
kernel/ptrace.c | 2 +-
kernel/rseq.c | 33 +++++++++++++++++++++++++++------
3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8911b1f35aa..68b23937b4a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1290,6 +1290,7 @@ struct task_struct {

#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
+ u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
@@ -2282,10 +2283,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
} else {
t->rseq = current->rseq;
+ t->rseq_len = current->rseq_len;
t->rseq_sig = current->rseq_sig;
t->rseq_event_mask = current->rseq_event_mask;
}
@@ -2294,6 +2297,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
static inline void rseq_execve(struct task_struct *t)
{
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6149ca5e0e14..390c71e9e573 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -817,7 +817,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97ac20b4f738..46dc5c2ce2b7 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)

@@ -86,10 +89,15 @@ static int rseq_update_cpu_id(struct task_struct *t)
u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;

- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -116,6 +124,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}

@@ -336,7 +349,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -345,6 +358,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}

@@ -357,7 +371,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -366,15 +380,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}

/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communicated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len != ORIG_RSEQ_SIZE && rseq_len < offsetof(struct rseq, end)))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
--
2.17.1


2022-08-10 07:03:07

by Florian Weimer

[permalink] [raw]
Subject: Re: [PATCH v3 02/23] rseq: Introduce extensible rseq ABI

* Mathieu Desnoyers:

> Introduce the extensible rseq ABI, where the feature size supported by
> the kernel and the required alignment are communicated to user-space
> through ELF auxiliary vectors.
>
> This allows user-space to call rseq registration with a rseq_len of
> either 32 bytes for the original struct rseq size (which includes
> padding), or larger.
>
> If rseq_len is larger than 32 bytes, then it must be large enough to
> contain the feature size communicated to user-space through ELF
> auxiliary vectors.

I don't think this works with the glibc extension mechanism because
__rseq_size does not change until the padding is exhausted.

I think you'll need to add the suggested flags to the auxiliary vector,
and then we can use that during registration and also communicate these
flags via __rseq_flags.

Size and alignment can be stored in a single auxiliary vector entry.

Thanks,
Florian

2022-08-10 14:20:29

by Mathieu Desnoyers

[permalink] [raw]
Subject: Re: [PATCH v3 02/23] rseq: Introduce extensible rseq ABI

----- On Aug 10, 2022, at 2:33 AM, Florian Weimer [email protected] wrote:

> * Mathieu Desnoyers:
>
>> Introduce the extensible rseq ABI, where the feature size supported by
>> the kernel and the required alignment are communicated to user-space
>> through ELF auxiliary vectors.
>>
>> This allows user-space to call rseq registration with a rseq_len of
>> either 32 bytes for the original struct rseq size (which includes
>> padding), or larger.
>>
>> If rseq_len is larger than 32 bytes, then it must be large enough to
>> contain the feature size communicated to user-space through ELF
>> auxiliary vectors.
>
> I don't think this works with the glibc extension mechanism because
> __rseq_size does not change until the padding is exhausted.

In order to deal with this existing discrepancy between the feature set
and the size, the proposed extension scheme would require that glibc expose
a new __rseq_feature_size symbol, which would give us:

/*
* Size of the registered rseq area. 0 if the registration was
* unsuccessful.
*/
unsigned int __rseq_size = -1U;

/* Flags used during rseq registration. */
unsigned int __rseq_flags;

/*
* rseq feature size supported by the kernel. 0 if the registration was
* unsuccessful.
*/
unsigned int __rseq_feature_size = -1U;

> I think you'll need to add the suggested flags to the auxiliary vector,
> and then we can use that during registration and also communicate these
> flags via __rseq_flags.

For the struct rseq extension, with the "__rseq_feature_size" symbol
I don't think we need to suggest rseq registration flags through
auxiliary vectors.

However, the kernel could expose, through auxiliary vectors, the set of
"supported flags" which can be passed as the rseq flags argument. Is
that what you have in mind?

This can be useful to ensure we don't require userspace to rely on
rseq returning -1 with errno set to EINVAL to detect supported feature flags.

>
> Size and alignment can be stored in a single auxiliary vector entry.

getauxval returns a single "unsigned long". I'm not sure how to extract
size and alignment other than using bitwise operations. Is that what you
have in mind? Are there other auxval entries that use this kind of
bitwise scheme?

And by "size", do you mean "supported feature size" or "allocation size"?
I ask because the allocation size will typically be aligned to the next power
of two, but the "supported feature size" will not. AFAIU though, the kernel only
needs to express the supported feature size and the allocation alignment
through auxv. The rest can be figured out from userspace.

Thanks,

Mathieu

>
> Thanks,
> Florian

--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com