2022-02-19 23:06:02

by Mathieu Desnoyers

[permalink] [raw]
Subject: [RFC PATCH v2 00/11] RSEQ node id and virtual cpu id extensions

Hi,

I'm sending this series for feedback. There appears to be a lot of
interest in the virtual cpu id feature for use in user-space memory
allocators (e.g. glibc malloc), so I am sending this out as RFC.

The most interesting patch in here is "sched: Introduce per memory space
current virtual cpu id". So if you want to jump to the meat, go
immediately to that patch.

This series is based on the tip tree core/sched branch [1] at commit
ed3b362d54f0 ("sched/isolation: Split housekeeping cpumask per isolation
features").

Feedback is welcome!

Thanks,

Mathieu

[1] https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/

Mathieu Desnoyers (11):
rseq: Introduce feature size and alignment ELF auxiliary vector
entries
rseq: Introduce extensible rseq ABI
rseq: extend struct rseq with numa node id
selftests/rseq: Use ELF auxiliary vector for extensible rseq
selftests/rseq: Implement rseq numa node id field selftest
lib: invert _find_next_bit source arguments
lib: implement find_{first,next}_{zero,one}_and_zero_bit
cpumask: implement cpumask_{first,next}_{zero,one}_and_zero
sched: Introduce per memory space current virtual cpu id
rseq: extend struct rseq with per memory space vcpu id
selftests/rseq: Implement rseq vm_vcpu_id field support

fs/binfmt_elf.c | 5 +
fs/exec.c | 4 +
include/linux/cpumask.h | 94 ++++++
include/linux/find.h | 123 +++++++-
include/linux/mm.h | 25 ++
include/linux/mm_types.h | 111 +++++++
include/linux/sched.h | 9 +
include/trace/events/rseq.h | 4 +-
include/uapi/linux/auxvec.h | 2 +
include/uapi/linux/rseq.h | 22 ++
init/Kconfig | 4 +
kernel/fork.c | 15 +-
kernel/ptrace.c | 2 +-
kernel/rseq.c | 60 +++-
kernel/sched/core.c | 82 +++++
kernel/sched/deadline.c | 3 +
kernel/sched/debug.c | 13 +
kernel/sched/fair.c | 1 +
kernel/sched/rt.c | 2 +
kernel/sched/sched.h | 364 ++++++++++++++++++++++
kernel/sched/stats.c | 16 +-
lib/find_bit.c | 17 +-
tools/include/linux/find.h | 9 +-
tools/lib/find_bit.c | 17 +-
tools/testing/selftests/rseq/basic_test.c | 5 +
tools/testing/selftests/rseq/rseq-abi.h | 22 ++
tools/testing/selftests/rseq/rseq.c | 86 ++++-
tools/testing/selftests/rseq/rseq.h | 46 ++-
28 files changed, 1106 insertions(+), 57 deletions(-)

--
2.17.1


2022-02-20 07:08:46

by Mathieu Desnoyers

[permalink] [raw]
Subject: [RFC PATCH v2 08/11] cpumask: implement cpumask_{first,next}_{zero,one}_and_zero

Allow finding the first or next bit within two input cpumasks which is
either:

- both zero and zero,
- respectively one and zero.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/cpumask.h | 94 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 94 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 64dae70d31f5..040476134557 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -134,6 +134,18 @@ static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
return 0;
}

+static inline unsigned int cpumask_first_one_and_zero(const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return 0;
+}
+
+static inline unsigned int cpumask_first_zero_and_zero(const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return 0;
+}
+
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return 0;
@@ -157,6 +169,20 @@ static inline unsigned int cpumask_next_and(int n,
return n+1;
}

+static inline unsigned int cpumask_next_one_and_zero(int n,
+ const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return n+1;
+}
+
+static inline unsigned int cpumask_next_zero_and_zero(int n,
+ const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return n+1;
+}
+
static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
int start, bool wrap)
{
@@ -230,6 +256,36 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
}

+/**
+ * cpumask_first_one_and_zero - return the first cpu from *srcp1 & ~*srcp2
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus match in both.
+ */
+static inline
+unsigned int cpumask_first_one_and_zero(const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return find_first_one_and_zero_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits);
+}
+
+/**
+ * cpumask_first_zero_and_zero - return the first cpu from ~*srcp1 & ~*srcp2
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus match in both.
+ */
+static inline
+unsigned int cpumask_first_zero_and_zero(const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ return find_first_zero_and_zero_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits);
+}
+
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
@@ -258,6 +314,44 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
}

+/**
+ * cpumask_next_one_and_zero - return the next cpu from *srcp1 & ~*srcp2
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus match in both.
+ */
+static inline
+unsigned int cpumask_next_one_and_zero(int n, const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1)
+ cpumask_check(n);
+ return find_next_one_and_zero_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits, n+1);
+}
+
+/**
+ * cpumask_next_zero_and_zero - return the next cpu from ~*srcp1 & ~*srcp2
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Returns >= nr_cpu_ids if no cpus match in both.
+ */
+static inline
+unsigned int cpumask_next_zero_and_zero(int n, const struct cpumask *srcp1,
+ const struct cpumask *srcp2)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1)
+ cpumask_check(n);
+ return find_next_zero_and_zero_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ nr_cpumask_bits, n+1);
+}
+
int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
--
2.17.1

2022-02-20 09:59:40

by Mathieu Desnoyers

[permalink] [raw]
Subject: [RFC PATCH v2 01/11] rseq: Introduce feature size and alignment ELF auxiliary vector entries

Export the rseq feature size supported by the kernel as well as the
required allocation alignment for the rseq per-thread area to user-space
through ELF auxiliary vector entries.

This is part of the extensible rseq ABI.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
fs/binfmt_elf.c | 5 +++++
include/uapi/linux/auxvec.h | 2 ++
include/uapi/linux/rseq.h | 5 +++++
3 files changed, 12 insertions(+)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 605017eb9349..77776582e76d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,6 +46,7 @@
#include <linux/cred.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
+#include <linux/rseq.h>
#include <asm/param.h>
#include <asm/page.h>

@@ -286,6 +287,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
if (bprm->have_execfd) {
NEW_AUX_ENT(AT_EXECFD, bprm->execfd);
}
+#ifdef CONFIG_RSEQ
+ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
+ NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
+#endif
#undef NEW_AUX_ENT
/* AT_NULL is zero; clear the rest too */
memset(elf_info, 0, (char *)mm->saved_auxv +
diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index c7e502bf5a6f..6991c4b8ab18 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -30,6 +30,8 @@
* differ from AT_PLATFORM. */
#define AT_RANDOM 25 /* address of 16 random bytes */
#define AT_HWCAP2 26 /* extension of AT_HWCAP */
+#define AT_RSEQ_FEATURE_SIZE 27 /* rseq supported feature size */
+#define AT_RSEQ_ALIGN 28 /* rseq allocation alignment */

#define AT_EXECFN 31 /* filename of program */

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 77ee207623a9..05d3c4cdeb40 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -130,6 +130,11 @@ struct rseq {
* this thread.
*/
__u32 flags;
+
+ /*
+ * Flexible array member at end of structure, after last feature field.
+ */
+ char end[];
} __attribute__((aligned(4 * sizeof(__u64))));

#endif /* _UAPI_LINUX_RSEQ_H */
--
2.17.1

2022-02-21 09:24:17

by Mathieu Desnoyers

[permalink] [raw]
Subject: [RFC PATCH v2 06/11] lib: invert _find_next_bit source arguments

Apply bit-invert operations before the AND operation in _find_next_bit.
Allows AND operations on combined bitmasks in which we search either for
one or zero, e.g.: find first bit which is both zero in one bitmask AND
one in the second bitmask.

The existing use for find first zero bit does not use the second
argument, so whether the inversion is performed before or after the AND
operator does not matter.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/find.h | 13 +++++++------
lib/find_bit.c | 17 ++++++++---------
tools/include/linux/find.h | 9 +++++----
tools/lib/find_bit.c | 17 ++++++++---------
4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/include/linux/find.h b/include/linux/find.h
index 5bb6db213bcb..41941cb9cad7 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -10,7 +10,8 @@

extern unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le);
+ unsigned long start, unsigned long invert_src1,
+ unsigned long src2, unsigned long le);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
@@ -41,7 +42,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
return val ? __ffs(val) : size;
}

- return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0UL, 0);
}
#endif

@@ -71,7 +72,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
return val ? __ffs(val) : size;
}

- return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+ return _find_next_bit(addr1, addr2, size, offset, 0UL, 0UL, 0);
}
#endif

@@ -99,7 +100,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
return val == ~0UL ? size : ffz(val);
}

- return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0UL, 0);
}
#endif

@@ -247,7 +248,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned
return val == ~0UL ? size : ffz(val);
}

- return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0UL, 1);
}
#endif

@@ -266,7 +267,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned
return val ? __ffs(val) : size;
}

- return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0UL, 1);
}
#endif

diff --git a/lib/find_bit.c b/lib/find_bit.c
index 1b8e4b2a9cba..73e78565e691 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -25,23 +25,23 @@
/*
* This is a common helper function for find_next_bit, find_next_zero_bit, and
* find_next_and_bit. The differences are:
- * - The "invert" argument, which is XORed with each fetched word before
- * searching it for one bits.
+ * - The "invert_src1" and "invert_src2" arguments, which are XORed to
+ * each source word before applying the 'and' operator.
* - The optional "addr2", which is anded with "addr1" if present.
*/
unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le)
+ unsigned long start, unsigned long invert_src1,
+ unsigned long invert_src2, unsigned long le)
{
unsigned long tmp, mask;

if (unlikely(start >= nbits))
return nbits;

- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert_src1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert_src2;

/* Handle 1st word. */
mask = BITMAP_FIRST_WORD_MASK(start);
@@ -57,10 +57,9 @@ unsigned long _find_next_bit(const unsigned long *addr1,
if (start >= nbits)
return nbits;

- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert_src1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert_src2;
}

if (le)
diff --git a/tools/include/linux/find.h b/tools/include/linux/find.h
index 47e2bd6c5174..5ab0c95086ad 100644
--- a/tools/include/linux/find.h
+++ b/tools/include/linux/find.h
@@ -10,7 +10,8 @@

extern unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le);
+ unsigned long start, unsigned long invert_src1,
+ unsigned long src2, unsigned long le);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
@@ -41,7 +42,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
return val ? __ffs(val) : size;
}

- return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0UL, 0);
}
#endif

@@ -71,7 +72,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
return val ? __ffs(val) : size;
}

- return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+ return _find_next_bit(addr1, addr2, size, offset, 0UL, 0UL, 0);
}
#endif

@@ -99,7 +100,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
return val == ~0UL ? size : ffz(val);
}

- return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0UL, 0);
}
#endif

diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c
index ba4b8d94e004..4176232de7f9 100644
--- a/tools/lib/find_bit.c
+++ b/tools/lib/find_bit.c
@@ -24,13 +24,14 @@
/*
* This is a common helper function for find_next_bit, find_next_zero_bit, and
* find_next_and_bit. The differences are:
- * - The "invert" argument, which is XORed with each fetched word before
- * searching it for one bits.
+ * - The "invert_src1" and "invert_src2" arguments, which are XORed to
+ * each source word before applying the 'and' operator.
* - The optional "addr2", which is anded with "addr1" if present.
*/
unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le)
+ unsigned long start, unsigned long invert_src1,
+ unsigned long invert_src2, unsigned long le)
{
unsigned long tmp, mask;
(void) le;
@@ -38,10 +39,9 @@ unsigned long _find_next_bit(const unsigned long *addr1,
if (unlikely(start >= nbits))
return nbits;

- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert_src1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert_src2;

/* Handle 1st word. */
mask = BITMAP_FIRST_WORD_MASK(start);
@@ -64,10 +64,9 @@ unsigned long _find_next_bit(const unsigned long *addr1,
if (start >= nbits)
return nbits;

- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert_src1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert_src2;
}

#if (0)
--
2.17.1

2022-02-21 09:28:20

by Mathieu Desnoyers

[permalink] [raw]
Subject: [RFC PATCH v2 02/11] rseq: Introduce extensible rseq ABI

Introduce the extensible rseq ABI, where the feature size supported by
the kernel and the required alignment are communicated to user-space
through ELF auxiliary vectors.

This allows user-space to call rseq registration with a rseq_len of
either 32 bytes for the original struct rseq size (which includes
padding), or larger.

If rseq_len is larger than 32 bytes, then it must be large enough to
contain the feature size communicated to user-space through ELF
auxiliary vectors.

Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/sched.h | 4 ++++
kernel/ptrace.c | 2 +-
kernel/rseq.c | 33 +++++++++++++++++++++++++++------
3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 508b91d57470..838c9e0b4cae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1291,6 +1291,7 @@ struct task_struct {

#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
+ u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
@@ -2260,10 +2261,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
} else {
t->rseq = current->rseq;
+ t->rseq_len = current->rseq_len;
t->rseq_sig = current->rseq_sig;
t->rseq_event_mask = current->rseq_event_mask;
}
@@ -2272,6 +2275,7 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
static inline void rseq_execve(struct task_struct *t)
{
t->rseq = NULL;
+ t->rseq_len = 0;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..f5edde5b7805 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -800,7 +800,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
{
struct ptrace_rseq_configuration conf = {
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = sizeof(*task->rseq),
+ .rseq_abi_size = task->rseq_len,
.signature = task->rseq_sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97ac20b4f738..46dc5c2ce2b7 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,6 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)

@@ -86,10 +89,15 @@ static int rseq_update_cpu_id(struct task_struct *t)
u32 cpu_id = raw_smp_processor_id();
struct rseq __user *rseq = t->rseq;

- if (!user_write_access_begin(rseq, sizeof(*rseq)))
+ if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally updated only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
user_write_access_end();
trace_rseq_update(t);
return 0;
@@ -116,6 +124,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
*/
if (put_user(cpu_id, &t->rseq->cpu_id))
return -EFAULT;
+ /*
+ * Additional feature fields added after ORIG_RSEQ_SIZE
+ * need to be conditionally reset only if
+ * t->rseq_len != ORIG_RSEQ_SIZE.
+ */
return 0;
}

@@ -336,7 +349,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (rseq_len != sizeof(*rseq))
+ if (rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -345,6 +358,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return ret;
current->rseq = NULL;
current->rseq_sig = 0;
+ current->rseq_len = 0;
return 0;
}

@@ -357,7 +371,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != sizeof(*rseq))
+ if (current->rseq != rseq || rseq_len != current->rseq_len)
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -366,15 +380,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
}

/*
- * If there was no rseq previously registered,
- * ensure the provided rseq is properly aligned and valid.
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
*/
if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
- rseq_len != sizeof(*rseq))
+ rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len != ORIG_RSEQ_SIZE && rseq_len < offsetof(struct rseq, end)))
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
+ current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
--
2.17.1