Validating the abort_ip field of rseq_cs ensures that the kernel don't
return to an invalid address when returning to userspace after an abort.
I don't fully trust each architecture code to cleanly deal with invalid
return addresses.
Validating the range [ start_ip, start_ip + post_commit_offset ] is an
extra validation step ensuring that userspace provides valid values to
describe the critical section.
If validation fails, the process is killed with a segmentation fault.
Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip
fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather
that ignoring the 32 upper bits on 32-bit kernels. This ensures we have
a consistent behavior for a 32-bit binary executed on 32-bit kernels and
in compat mode on 64-bit kernels.
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: "Paul E. McKenney" <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Paul Turner <[email protected]>
CC: Thomas Gleixner <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Andi Kleen <[email protected]>
CC: Dave Watson <[email protected]>
CC: Chris Lameter <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Ben Maurer <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Josh Triplett <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Russell King <[email protected]>
CC: Catalin Marinas <[email protected]>
CC: Will Deacon <[email protected]>
CC: Michael Kerrisk <[email protected]>
CC: Boqun Feng <[email protected]>
CC: [email protected]
---
include/uapi/linux/rseq.h | 6 +++---
kernel/rseq.c | 7 +++++--
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index d620fa43756c..519ad6e176d1 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -52,10 +52,10 @@ struct rseq_cs {
__u32 version;
/* enum rseq_cs_flags */
__u32 flags;
- LINUX_FIELD_u32_u64(start_ip);
+ __u64 start_ip;
/* Offset from start_ip. */
- LINUX_FIELD_u32_u64(post_commit_offset);
- LINUX_FIELD_u32_u64(abort_ip);
+ __u64 post_commit_offset;
+ __u64 abort_ip;
} __attribute__((aligned(4 * sizeof(__u64))));
/*
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 22b6acf1ad63..4ba582046fcd 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -128,7 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
return 0;
}
urseq_cs = (struct rseq_cs __user *)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
+ if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)) ||
+ rseq_cs->abort_ip >= TASK_SIZE ||
+ rseq_cs->start_ip >= TASK_SIZE ||
+ rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE)
return -EFAULT;
if (rseq_cs->version > 0)
return -EINVAL;
@@ -137,7 +140,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
return -EINVAL;
- usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32));
+ usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
ret = get_user(sig, usig);
if (ret)
return ret;
--
2.11.0
On 32-bit kernels, the rseq->rseq_cs_padding field is never read by the
kernel. However, 64-bit kernels dealing with 32-bit compat tasks read the
full 64-bit in its entirety, and terminates the offending process with
a segmentation fault if the upper 32 bits are set due to failure of
copy_from_user().
Ensure that both 32-bit and 64-bit kernels dealing with 32-bit tasks end
up terminating offending tasks with a segmentation fault if the upper
32-bit padding bits (rseq->rseq_cs_padding) are set by adding an explicit
check that padding is zero on 32-bit kernels.
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: "Paul E. McKenney" <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Paul Turner <[email protected]>
CC: Thomas Gleixner <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Andi Kleen <[email protected]>
CC: Dave Watson <[email protected]>
CC: Chris Lameter <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Ben Maurer <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Josh Triplett <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Andrew Morton <[email protected]>
CC: Russell King <[email protected]>
CC: Catalin Marinas <[email protected]>
CC: Will Deacon <[email protected]>
CC: Michael Kerrisk <[email protected]>
CC: Boqun Feng <[email protected]>
CC: [email protected]
---
kernel/rseq.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 4ba582046fcd..b038f35a60d6 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -112,6 +112,29 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
return 0;
}
+#ifndef __LP64__
+/*
+ * Ensure that padding is zero.
+ */
+static int check_rseq_cs_padding(struct task_struct *t)
+{
+ unsigned long pad;
+ int ret;
+
+ ret = __get_user(pad, &t->rseq->rseq_cs_padding);
+ if (ret)
+ return ret;
+ if (pad)
+ return -EFAULT;
+ return 0;
+}
+#else
+static int check_rseq_cs_padding(struct task_struct *t)
+{
+ return 0;
+}
+#endif
+
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
struct rseq_cs __user *urseq_cs;
@@ -123,6 +146,8 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
ret = __get_user(ptr, &t->rseq->rseq_cs);
if (ret)
return ret;
+ if (check_rseq_cs_padding(t))
+ return -EFAULT;
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
--
2.11.0
Hi Mathieu,
On Thu, Jun 28, 2018 at 12:23:59PM -0400, Mathieu Desnoyers wrote:
> On 32-bit kernels, the rseq->rseq_cs_padding field is never read by the
> kernel. However, 64-bit kernels dealing with 32-bit compat tasks read the
> full 64-bit in its entirety, and terminates the offending process with
> a segmentation fault if the upper 32 bits are set due to failure of
> copy_from_user().
>
> Ensure that both 32-bit and 64-bit kernels dealing with 32-bit tasks end
> up terminating offending tasks with a segmentation fault if the upper
> 32-bit padding bits (rseq->rseq_cs_padding) are set by adding an explicit
> check that padding is zero on 32-bit kernels.
>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
> CC: "Paul E. McKenney" <[email protected]>
> CC: Peter Zijlstra <[email protected]>
> CC: Paul Turner <[email protected]>
> CC: Thomas Gleixner <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Andi Kleen <[email protected]>
> CC: Dave Watson <[email protected]>
> CC: Chris Lameter <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Ben Maurer <[email protected]>
> CC: Steven Rostedt <[email protected]>
> CC: Josh Triplett <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Andrew Morton <[email protected]>
> CC: Russell King <[email protected]>
> CC: Catalin Marinas <[email protected]>
> CC: Will Deacon <[email protected]>
> CC: Michael Kerrisk <[email protected]>
> CC: Boqun Feng <[email protected]>
> CC: [email protected]
> ---
> kernel/rseq.c | 25 +++++++++++++++++++++++++
> 1 file changed, 25 insertions(+)
>
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 4ba582046fcd..b038f35a60d6 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -112,6 +112,29 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
> return 0;
> }
>
> +#ifndef __LP64__
> +/*
> + * Ensure that padding is zero.
> + */
> +static int check_rseq_cs_padding(struct task_struct *t)
> +{
> + unsigned long pad;
> + int ret;
> +
> + ret = __get_user(pad, &t->rseq->rseq_cs_padding);
> + if (ret)
> + return ret;
> + if (pad)
> + return -EFAULT;
> + return 0;
> +}
> +#else
> +static int check_rseq_cs_padding(struct task_struct *t)
> +{
> + return 0;
> +}
> +#endif
I'm still not sure how this works with a 64-bit kernel and a compat (32-bit)
task. The check_rseq_cs_padding() will return 0 regardless of the upper bits
of the rseq_cs field, whereas a native 32-bit kernel would actually go and
check them.
What am I missing here?
Will
----- On Jun 28, 2018, at 12:53 PM, Will Deacon [email protected] wrote:
> Hi Mathieu,
>
> On Thu, Jun 28, 2018 at 12:23:59PM -0400, Mathieu Desnoyers wrote:
>> On 32-bit kernels, the rseq->rseq_cs_padding field is never read by the
>> kernel. However, 64-bit kernels dealing with 32-bit compat tasks read the
>> full 64-bit in its entirety, and terminates the offending process with
>> a segmentation fault if the upper 32 bits are set due to failure of
>> copy_from_user().
>>
>> Ensure that both 32-bit and 64-bit kernels dealing with 32-bit tasks end
>> up terminating offending tasks with a segmentation fault if the upper
>> 32-bit padding bits (rseq->rseq_cs_padding) are set by adding an explicit
>> check that padding is zero on 32-bit kernels.
>>
>> Signed-off-by: Mathieu Desnoyers <[email protected]>
>> CC: "Paul E. McKenney" <[email protected]>
>> CC: Peter Zijlstra <[email protected]>
>> CC: Paul Turner <[email protected]>
>> CC: Thomas Gleixner <[email protected]>
>> CC: Andy Lutomirski <[email protected]>
>> CC: Andi Kleen <[email protected]>
>> CC: Dave Watson <[email protected]>
>> CC: Chris Lameter <[email protected]>
>> CC: Ingo Molnar <[email protected]>
>> CC: "H. Peter Anvin" <[email protected]>
>> CC: Ben Maurer <[email protected]>
>> CC: Steven Rostedt <[email protected]>
>> CC: Josh Triplett <[email protected]>
>> CC: Linus Torvalds <[email protected]>
>> CC: Andrew Morton <[email protected]>
>> CC: Russell King <[email protected]>
>> CC: Catalin Marinas <[email protected]>
>> CC: Will Deacon <[email protected]>
>> CC: Michael Kerrisk <[email protected]>
>> CC: Boqun Feng <[email protected]>
>> CC: [email protected]
>> ---
>> kernel/rseq.c | 25 +++++++++++++++++++++++++
>> 1 file changed, 25 insertions(+)
>>
>> diff --git a/kernel/rseq.c b/kernel/rseq.c
>> index 4ba582046fcd..b038f35a60d6 100644
>> --- a/kernel/rseq.c
>> +++ b/kernel/rseq.c
>> @@ -112,6 +112,29 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)
>> return 0;
>> }
>>
>> +#ifndef __LP64__
>> +/*
>> + * Ensure that padding is zero.
>> + */
>> +static int check_rseq_cs_padding(struct task_struct *t)
>> +{
>> + unsigned long pad;
>> + int ret;
>> +
>> + ret = __get_user(pad, &t->rseq->rseq_cs_padding);
>> + if (ret)
>> + return ret;
>> + if (pad)
>> + return -EFAULT;
>> + return 0;
>> +}
>> +#else
>> +static int check_rseq_cs_padding(struct task_struct *t)
>> +{
>> + return 0;
>> +}
>> +#endif
>
> I'm still not sure how this works with a 64-bit kernel and a compat (32-bit)
> task. The check_rseq_cs_padding() will return 0 regardless of the upper bits
> of the rseq_cs field, whereas a native 32-bit kernel would actually go and
> check them.
>
> What am I missing here?
With a 64-bit kernel, we end up in the #else, which means check_rseq_cs_padding()
always returns 0.
On that 64-bit kernel, all 64 bits of rseq->rseq_cs are read, including the
padding. Therefore, all those bits are contained in the pointer passed as
argument to copy_from_user(), which will cause copy_from_user() to accurately
fail on an invalid user-space address.
Therefore, 64-bit kernels already check those padding bits by means of trying to use
that pointer to access user-space data with copy_from_user, which does an access_ok
check.
So both 32-bit and 64-bit kernels will end up killing the process with segmentation
fault if a 32-bit userland populates those padding bits with anything other than
0.
Does it seem acceptable ?
Thanks,
Mathieu
>
> Will
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
----- On Jun 28, 2018, at 4:22 PM, Andy Lutomirski [email protected] wrote:
> On Thu, Jun 28, 2018 at 9:23 AM, Mathieu Desnoyers
> <[email protected]> wrote:
>> Validating the abort_ip field of rseq_cs ensures that the kernel don't
>> return to an invalid address when returning to userspace after an abort.
>> I don't fully trust each architecture code to cleanly deal with invalid
>> return addresses.
>>
>> Validating the range [ start_ip, start_ip + post_commit_offset ] is an
>> extra validation step ensuring that userspace provides valid values to
>> describe the critical section.
>>
>> If validation fails, the process is killed with a segmentation fault.
>>
>> Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip
>> fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather
>> that ignoring the 32 upper bits on 32-bit kernels. This ensures we have
>> a consistent behavior for a 32-bit binary executed on 32-bit kernels and
>> in compat mode on 64-bit kernels.
>
> This is okay with me for a fix outside the merge window. Can you do a
> followup for the next merge window that fixes it better, though? In
> particular, TASK_SIZE is generally garbage. I think a better fix
> would be something like adding a new arch-overridable helper like:
>
> static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
>
> and overriding it on x86 as something like:
>
> static inline unsigned long current_max_user_addr(void) {
> #ifdef CONFIG_IA32_EMULATION
> return user_64bit_mode(current_pt_regs()) ? TASK_SIZE_MAX : (1UL << 32) - 1;
> #else
> return TASK_SIZE_MAX;
> }
>
> TASK_SIZE really needs to die.
Sure, I'll put it in my backlog.
Thanks!
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Thu, Jun 28, 2018 at 1:23 PM Andy Lutomirski <[email protected]> wrote:
>
> This is okay with me for a fix outside the merge window. Can you do a
> followup for the next merge window that fixes it better, though? In
> particular, TASK_SIZE is generally garbage. I think a better fix
> would be something like adding a new arch-overridable helper like:
>
> static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
We already have that. It's called "user_addr_max()".
It's the limit we use for user accesses.
That said, I don't see why we should even check the IP. It's not like
that's done by signal handling either.
Linus
On Thu, Jun 28, 2018 at 9:23 AM, Mathieu Desnoyers
<[email protected]> wrote:
> Validating the abort_ip field of rseq_cs ensures that the kernel don't
> return to an invalid address when returning to userspace after an abort.
> I don't fully trust each architecture code to cleanly deal with invalid
> return addresses.
>
> Validating the range [ start_ip, start_ip + post_commit_offset ] is an
> extra validation step ensuring that userspace provides valid values to
> describe the critical section.
>
> If validation fails, the process is killed with a segmentation fault.
>
> Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip
> fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather
> that ignoring the 32 upper bits on 32-bit kernels. This ensures we have
> a consistent behavior for a 32-bit binary executed on 32-bit kernels and
> in compat mode on 64-bit kernels.
This is okay with me for a fix outside the merge window. Can you do a
followup for the next merge window that fixes it better, though? In
particular, TASK_SIZE is generally garbage. I think a better fix
would be something like adding a new arch-overridable helper like:
static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
and overriding it on x86 as something like:
static inline unsigned long current_max_user_addr(void) {
#ifdef CONFIG_IA32_EMULATION
return user_64bit_mode(current_pt_regs()) ? TASK_SIZE_MAX : (1UL << 32) - 1;
#else
return TASK_SIZE_MAX;
}
TASK_SIZE really needs to die.
----- On Jun 28, 2018, at 5:22 PM, Linus Torvalds [email protected] wrote:
> On Thu, Jun 28, 2018 at 1:23 PM Andy Lutomirski <[email protected]> wrote:
>>
>> This is okay with me for a fix outside the merge window. Can you do a
>> followup for the next merge window that fixes it better, though? In
>> particular, TASK_SIZE is generally garbage. I think a better fix
>> would be something like adding a new arch-overridable helper like:
>>
>> static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
>
> We already have that. It's called "user_addr_max()".
>
> It's the limit we use for user accesses.
Good point. I will simply replace TASK_SIZE with user_addr_max() then.
> That said, I don't see why we should even check the IP. It's not like
> that's done by signal handling either.
The goal stated by Andy and Thomas is to design rseq so it
does not need any compat syscall whatsoever. Considering
that this is a new system call, we should be able to do that.
One thing we want is to provide a consistent behavior when
a 32-bit binary is executed on native 32-bit kernel or on
64-bit kernel in compat mode, even if userspace chooses to
put garbage in the upper 32-bit of padding within 64-bit
fields.
Now let's look at the comparison with signals. If we look
at ia32_setup_frame() for instance, it sets:
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
where the sa_handler pointer has been received as input
as a 32-bit pointer by the compat syscall rt_sigaction
through struct compat_sigaction. I agree with you that
it does not appear to validate that it's below
user_addr_max(), but at least it's guaranteed to never be
over 32-bit.
The first big difference between rseq and signals: rseq
does not have a compat structure. The layout is the same
for both 32-bit and 64-bit userspace (see include/uapi/linux/rseq.h).
Another difference between rseq and signals is that rseq
only registers the TLS struct rseq for each thread. Then,
it's up to user-space to update the rseq_cs field of
struct rseq to indicate that it enters a critical section.
So the actual content of (struct rseq *)->rseq_cs is updated
with single-copy atomicity after registration of rseq.
Therefore, the current critical section pointed to by the current
rseq_cs user-space pointer also changes after registration. So we
cannot validate the content of the rseq_cs field, nor of the fields
contained within every possible struct rseq_cs descriptor when
registering rseq through sys_rseq: those need to be read when
returning to user-space. Not just on return from system call, but
also on return from interrupt/trap after a preemption.
This is very much different from registering a sigaction,
where the kernel can validate or truncate the content of
sa_handler at will.
Without validation of the content of e.g. rseq_cs->abort_ip
(read as a 64-bit integer by a 64-bit kernel), we end up setting
the return IP to that address on abort, even though user-space may
have put garbage in the high bits:
instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
without any validation or truncation whatsoever.
I'm concerned that some architecture code may not deal so well without
prior validation or truncation of the IP register content upper 32 bits
when returning to a 32-bit compat task.
This is why I'm considering comparison of abort_ip against user_addr_max()
to ensure we're not provided with an incorrect user input.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Thu, Jun 28, 2018 at 2:22 PM, Linus Torvalds
<[email protected]> wrote:
> On Thu, Jun 28, 2018 at 1:23 PM Andy Lutomirski <[email protected]> wrote:
>>
>> This is okay with me for a fix outside the merge window. Can you do a
>> followup for the next merge window that fixes it better, though? In
>> particular, TASK_SIZE is generally garbage. I think a better fix
>> would be something like adding a new arch-overridable helper like:
>>
>> static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
>
> We already have that. It's called "user_addr_max()".
Nah, that one is more or less equivalent to TASK_SIZE_MAX, except that
it's different if set_fs() is used.
>
> It's the limit we use for user accesses.
>
> That said, I don't see why we should even check the IP. It's not like
> that's done by signal handling either.
The idea is that, if someone screws up and sticks a number like
0xbaadf00d00045678 into their rseq abort_ip in a 32-bit x86 program
(when they actually mean 0x00045678), we want to something consistent.
On a 32-bit kernel, presumably it gets cast to u32 somewhere and it
works. On a 64-bit kernel, we end up shoving 0xbaadf00d00045678 into
regs->ip, and then the entry code will do, um, something. If I had to
guess, I would guess that at least IRET is likely to truncate if we're
returning to a 32-bit CS. But I really don't want to start promising
that we won't segfault if a different path gets invoked on some future
kernel on some future CPU of if we're on an AMD CPU using their
utterly braindead SYSRETL microcode, etc.
So I think we're much better off if we either promise that rseq
truncates the address for 32-bit users or that it segfaults if high
bits are set for 32-bit users.
TASK_SIZE is a super shitty way to do this. The correct thing is to
either add some check to the exit-to-usermode slowpath that rseq can
trigger or if we add some reasonable way for rseq to say "is this
address a legitimate addressable virtual address for the current
task's user space operating mode." We don't have such a thing right
now.
On Thu, Jun 28, 2018 at 4:30 PM Andy Lutomirski <[email protected]> wrote:
>
> The idea is that, if someone screws up and sticks a number like
> 0xbaadf00d00045678 into their rseq abort_ip in a 32-bit x86 program
> (when they actually mean 0x00045678), we want to something consistent.
I think the "something consistent" is perfectly fine with just "it won't work".
Make it do
if (rseq_cs->abort_ip != (unsigned long)rseq_cs->abort_ip)
return -EINVAL;
at abort time.
Done.
If it's a 32-bit kernel, the above will reject the thing, and if it's
a 64-bit kernel, it will be a no-op, but the abort won't work in a
32-bit caller.
Problem solved.
Linus
----- On Jun 28, 2018, at 8:18 PM, Linus Torvalds [email protected] wrote:
> On Thu, Jun 28, 2018 at 4:30 PM Andy Lutomirski <[email protected]> wrote:
>>
>> The idea is that, if someone screws up and sticks a number like
>> 0xbaadf00d00045678 into their rseq abort_ip in a 32-bit x86 program
>> (when they actually mean 0x00045678), we want to something consistent.
>
> I think the "something consistent" is perfectly fine with just "it won't work".
>
> Make it do
>
> if (rseq_cs->abort_ip != (unsigned long)rseq_cs->abort_ip)
> return -EINVAL;
>
> at abort time.
>
> Done.
>
> If it's a 32-bit kernel, the above will reject the thing, and if it's
> a 64-bit kernel, it will be a no-op, but the abort won't work in a
> 32-bit caller.
>
> Problem solved.
This assumes a 64-bit kernel returning to a 32-bit compat task with
garbage it the upper 32 bits of regs->ip behaves correctly (e.g.
kill the offending process rather than crash the kernel) on all
architectures.
Is this something we can rely on ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
> On Jun 28, 2018, at 5:18 PM, Linus Torvalds <[email protected]> wrote:
>
>> On Thu, Jun 28, 2018 at 4:30 PM Andy Lutomirski <[email protected]> wrote:
>>
>> The idea is that, if someone screws up and sticks a number like
>> 0xbaadf00d00045678 into their rseq abort_ip in a 32-bit x86 program
>> (when they actually mean 0x00045678), we want to something consistent.
>
> I think the "something consistent" is perfectly fine with just "it won't work".
>
> Make it do
>
> if (rseq_cs->abort_ip != (unsigned long)rseq_cs->abort_ip)
> return -EINVAL;
>
> at abort time.
You sure? Because, unless I remember wrong, a 32-bit user program on a 64-bit kernel will actually work at least most of the time even if high bits are set. I’m okay with straight-up promising “will always work” or “will never work”, but “sometimes” is bad.
>
> Done.
>
> If it's a 32-bit kernel, the above will reject the thing, and if it's
> a 64-bit kernel, it will be a no-op, but the abort won't work in a
> 32-bit caller.
>
> Problem solved.
>
> Linus
----- On Jun 29, 2018, at 10:02 AM, Linus Torvalds [email protected] wrote:
> On Thu, Jun 28, 2018 at 6:08 PM Andy Lutomirski <[email protected]> wrote:
>> > On Jun 28, 2018, at 5:18 PM, Linus Torvalds <[email protected]>
>> > wrote:
>> >
>> >
>> > Make it do
>> >
>> > if (rseq_cs->abort_ip != (unsigned long)rseq_cs->abort_ip)
>> > return -EINVAL;
>> >
>> > at abort time.
>>
>> You sure? Because, unless I remember wrong, a 32-bit user program on a 64-bit
>> kernel will actually work at least most of the time even if high bits are set.
>
> Sure.
>
> If you run a 32-bit binary on a 64-bit kernel,. you will have access
> to the 0xc0000000 - 0xffffffff area that you wouldn't have had access
> to if it ran on a 32-bit kernel.
>
> But exactly *because* you have access to that area, those addresses
> are actually valid addresses for the 32-bit case, so they shouldn't be
> considered bad. They can't happen on a native 32-bit kerne, but a
> 32-bit program doesn't even care. If it has user memory mapped in that
> area, it should work.
>
> And if it *doesn't* have user memory mapped in that area, then it will
> fail when the trying to execute the (non-existent) abort sequence.
>
> After all, depending on configuration, a native 32-bit kernel might
> limit user space even more (ie some vendors had a 2G:2G split instead
> of the traditional 3G:1G split.
>
> Was that the case you were thinking of, or was it something else?
What I'm worried about is setting regs->ip of a compat 32-bit task to
addresses in the range 0x100000000-0xFFFFFFFFFFFFFFFF.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Fri, Jun 29, 2018 at 7:05 AM Mathieu Desnoyers
<[email protected]> wrote:
>
> What I'm worried about is setting regs->ip of a compat 32-bit task to
> addresses in the range 0x100000000-0xFFFFFFFFFFFFFFFF.
Well, they won't have anything mapped in that range, so it really
shouldn't matter.
Linus
----- On Jun 28, 2018, at 7:29 PM, Andy Lutomirski [email protected] wrote:
> On Thu, Jun 28, 2018 at 2:22 PM, Linus Torvalds
> <[email protected]> wrote:
>> On Thu, Jun 28, 2018 at 1:23 PM Andy Lutomirski <[email protected]> wrote:
>>>
>>> This is okay with me for a fix outside the merge window. Can you do a
>>> followup for the next merge window that fixes it better, though? In
>>> particular, TASK_SIZE is generally garbage. I think a better fix
>>> would be something like adding a new arch-overridable helper like:
>>>
>>> static inline unsigned long current_max_user_addr(void) { return TASK_SIZE; }
>>
>> We already have that. It's called "user_addr_max()".
>
> Nah, that one is more or less equivalent to TASK_SIZE_MAX, except that
> it's different if set_fs() is used.
So which one would be right in this case ? AFAIU we want to ensure we don't
populate regs->ip with a bogus address that would make SYSRET or other return
to userspace instructions explode.
Is that guaranteed by TASK_SIZE or TASK_SIZE_MAX (aliased by user_addr_max()) ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Thu, Jun 28, 2018 at 6:08 PM Andy Lutomirski <[email protected]> wrote:
> > On Jun 28, 2018, at 5:18 PM, Linus Torvalds <[email protected]> wrote:
> >
> >
> > Make it do
> >
> > if (rseq_cs->abort_ip != (unsigned long)rseq_cs->abort_ip)
> > return -EINVAL;
> >
> > at abort time.
>
> You sure? Because, unless I remember wrong, a 32-bit user program on a 64-bit kernel will actually work at least most of the time even if high bits are set.
Sure.
If you run a 32-bit binary on a 64-bit kernel,. you will have access
to the 0xc0000000 - 0xffffffff area that you wouldn't have had access
to if it ran on a 32-bit kernel.
But exactly *because* you have access to that area, those addresses
are actually valid addresses for the 32-bit case, so they shouldn't be
considered bad. They can't happen on a native 32-bit kerne, but a
32-bit program doesn't even care. If it has user memory mapped in that
area, it should work.
And if it *doesn't* have user memory mapped in that area, then it will
fail when the trying to execute the (non-existent) abort sequence.
After all, depending on configuration, a native 32-bit kernel might
limit user space even more (ie some vendors had a 2G:2G split instead
of the traditional 3G:1G split.
Was that the case you were thinking of, or was it something else?
Linus
On Fri, Jun 29, 2018 at 8:27 AM Linus Torvalds
<[email protected]> wrote:
>
> You simply can't have it both ways.
Put another way.
This is ok in the native path:
if ((unsigned long) rseq_cs->abort_ip != rseq_cs->abort_ip)
return -EINVAL;
because it's checking that the value fits in the native register size
(and it also ends up being a no-op if the native size is the same size
as abort_ip).
And this is very much ok in a compat syscall:
if (rseq_cs->abort_ip & ~(unsigned long)-1u)
return -EINVAL;
because it's checking that the pointer doesn't have (invalid in
compat) high bits set.
But it is NOT OK to say "the rseq system call doesn't have any compat
syscall, but we'll do that compat check in the native case, because we
worry about compat issues".
See what I'm saying? Either you worry about compat issues (and have a
compat syscall), or you don't.
The whole "let's not do a compat syscall, but then check compat issues
at run-time in the native system call because compat processes will
use it" is braindamage.
Linus
On Fri, Jun 29, 2018 at 8:27 AM, Linus Torvalds
<[email protected]> wrote:
>
>
> On Fri, Jun 29, 2018, 08:03 Mathieu Desnoyers
> <[email protected]> wrote:
>>
>>
>> Considering those inconsistencies between architectures (either
>> the task gets killed, or the top bits are silently cleared), I'm
>> very much tempted to be restrictive in the inputs accepted by
>> rseq, and not rely on architectures as providing consistent
>> validation of the return IP.
>>
>> Thoughts ?
>
>
> Then you need to make it a compat system call, since clearly you and Andy
> want the 32-bit case to do something different from the 64-bit case.
I personally would like the compat and non-compat cases to do exactly
the same thing. If abort_ip is the address (as a u64) of a valid
executable instruction and an abort happens, then that instruction
should get executed. If abort_ip does not point to user-executable
memory, then the process should get a signal.
The problem isn't with rseq per se -- it's with the daft way that the
x86 return-to-userspace instructions work. If I apply this patch:
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b81918..26e4ba44e87b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -346,6 +346,8 @@ __visible void do_int80_syscall_32(struct pt_regs *regs)
{
enter_from_user_mode();
local_irq_enable();
+ if (!user_64bit_mode(regs))
+ regs->ip |= (1UL << 32);
do_syscall_32_irqs_on(regs);
}
the kernel *still works*. But this unconditionally uses the IRET
path, and I don't even want to speculate as to what the hell happens
if we exit using SYSRETL, or LRET, or SYSEXITL, or if Intel or AMD
ever gives us a new mode that gets rid of the espfix shite. IOW, I
don't think that the x86 entry code should make a promise that it will
continue ignoring the high bits of regs->ip when
!user_64bit_mode(regs) on a 64-bit kernel.
The problem with rseq as it stands is that this oddity gets
accidentally exposed all the way to userspace. If we're not careful,
it'll be possible for a slightly buggy user program to goof up the
code that generates the data structure that supplies abort_ip such
that the high bits are garbage (0xcccccccc due to padding, for
example), and the kernel will do exactly what the user code requested,
and we'll get regs->ip = 0xcccccccc00000000 | (the actual intended
abort_ip), and we'll pass that crap value all the way to IRET, and
IRET will truncate it back down to the correct value. And then some
CPU will add new behavior or we'll invoke SYSRETL or whatever on some
weird CPU, and the program will crash. And we'll be sad.
I suppose we could handle this in the entry code by coming up with a
way to reject out-of-bounds regs->ip for 32-bit tasks, but that's
going to be a bit messy and will slow down normal code that doesn't
use rseq.
Other than rseq, I don't think that there's any real issue. The only
ways to get regs->ip >= 2^32 in a 32-bit task involve ptrace or manual
fiddling with signal contexts, and I don't expect to ever have any
real software depend on precisely what happens.
On Fri, Jun 29, 2018 at 9:07 AM Mathieu Desnoyers
<[email protected]> wrote:
>
> This code is not invoked from syscalls, but rather on return from
> interrupt/trap after a preemption.
But when we register the rseq, we could easily check that the top bits
of the IP is clear, no?
Sure, user space can change it after the fact, but at that point it's
literally "user space is being intentionally stupid".
The real worry is that 32-bit compat code never initializes those bits
at all, no?
Linus
----- On Jun 29, 2018, at 10:17 AM, Linus Torvalds [email protected] wrote:
> On Fri, Jun 29, 2018 at 7:05 AM Mathieu Desnoyers
> <[email protected]> wrote:
>>
>> What I'm worried about is setting regs->ip of a compat 32-bit task to
>> addresses in the range 0x100000000-0xFFFFFFFFFFFFFFFF.
>
> Well, they won't have anything mapped in that range, so it really
> shouldn't matter.
It appears that arm64 simply clears the top bits of regs->ip when
returning to 32-bit compat userspace. So this would be inconsistent
between 32-bit kernel and 64-bit kernel with a 32-bit compat task:
a 32-bit kernel would kill the process, but a 64-bit kernel would
silently clear the top bits.
Considering those inconsistencies between architectures (either
the task gets killed, or the top bits are silently cleared), I'm
very much tempted to be restrictive in the inputs accepted by
rseq, and not rely on architectures as providing consistent
validation of the return IP.
Thoughts ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
----- On Jun 29, 2018, at 11:54 AM, Linus Torvalds [email protected] wrote:
> On Fri, Jun 29, 2018 at 8:27 AM Linus Torvalds
> <[email protected]> wrote:
>>
>> You simply can't have it both ways.
>
> Put another way.
>
> This is ok in the native path:
>
> if ((unsigned long) rseq_cs->abort_ip != rseq_cs->abort_ip)
> return -EINVAL;
>
> because it's checking that the value fits in the native register size
> (and it also ends up being a no-op if the native size is the same size
> as abort_ip).
>
> And this is very much ok in a compat syscall:
>
> if (rseq_cs->abort_ip & ~(unsigned long)-1u)
> return -EINVAL;
>
> because it's checking that the pointer doesn't have (invalid in
> compat) high bits set.
>
> But it is NOT OK to say "the rseq system call doesn't have any compat
> syscall, but we'll do that compat check in the native case, because we
> worry about compat issues".
>
> See what I'm saying? Either you worry about compat issues (and have a
> compat syscall), or you don't.
>
> The whole "let's not do a compat syscall, but then check compat issues
> at run-time in the native system call because compat processes will
> use it" is braindamage.
This code is not invoked from syscalls, but rather on return from
interrupt/trap after a preemption.
So a compat system call does not solve it. Unless we grab the "compat"
state on rseq registration, save it in a rseq_compat flag within the
task struct, and then use it on return from interrupt/trap/syscall.
Otherwise we need to figure out whether we are dealing with a compat
task when interrupt and trap context return to userspace. We had
is_compat_task() for that before, but now it has vanished from x86.
We could use user_64bit_mode(struct pt_regs *) on x86, but it does not
exist on other architectures.
One possibility is to introduce a new API that calls user_64bit_mode()
on x86, and is_compat_task() on other archs.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
----- On Jun 29, 2018, at 1:03 PM, Linus Torvalds [email protected] wrote:
> On Fri, Jun 29, 2018 at 9:07 AM Mathieu Desnoyers
> <[email protected]> wrote:
>>
>> This code is not invoked from syscalls, but rather on return from
>> interrupt/trap after a preemption.
>
> But when we register the rseq, we could easily check that the top bits
> of the IP is clear, no?
When a thread registers rseq, it registers a pointer to a user-space
address where a struct rseq is located.
That struct rseq is typically in a TLS area. It contains a pointer
to the current "struct rseq_cs": the content of rseq_cs describes the
current rseq critical section.
So when we register rseq, the rseq->rseq_cs pointer value is typically
NULL, because there is no currently active critical section. It's after
return from sys_rseq registration that user-space eventually sets the
pointer to a non-NULL value when it enters a critical section.
So at rseq registration, there is no point in validating the value of
the rseq_cs pointer, nor of any fields in the struct rseq_cs that would
be currently pointed to by that rseq_cs pointer, because those all change
after registration.
> Sure, user space can change it after the fact, but at that point it's
> literally "user space is being intentionally stupid".
User-space can be either stupid, or really clever and trying to attack
the kernel.
> The real worry is that 32-bit compat code never initializes those bits
> at all, no?
There are two aspects I'm concerned about here:
1) security: we don't want 32-bit user-space to feed a 64-bit value over 4GB
as abort_ip that may end up causing OOPSes on architectures that would
lack proper validation of those values on return to userspace.
2) behavior consistency of 32-bit userspace on both native 32-bit and 32-bit
compat on 64-bit kernel:
- for testing: having repeatable behavior on native and compat deployments
ensures that testing results are the same. This is the difference between
having "undefined behavior" when the upper bits are set or "defined behavior:
the process is terminated with sigsegv",
- for security: if the behavior differs between 32-bit compat and native
32-bit, this leaks information about which specific architecture the kernel
is running on, which facilitates attacks on the kernel.
But perhaps I'm caring too much about those aspects ? Maybe they matter less
than I presume.
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Fri, Jun 29, 2018 at 12:48 PM, Mathieu Desnoyers
<[email protected]> wrote:
> There are two aspects I'm concerned about here:
>
> 1) security: we don't want 32-bit user-space to feed a 64-bit value over 4GB
> as abort_ip that may end up causing OOPSes on architectures that would
> lack proper validation of those values on return to userspace.
I'm not too worried about this. As long as you're doing it from
signal-delivery context (which you are AFAICT) you're fine.
But I re-read the code and I think I have a really straightforward
solution. Two choices:
(1) Change instruction_pointer_set() to return an error code if the
address passed in is garbage in a way that could cause unexpected
behavior (like >=2^32 on x86_64 if regs->cs is 32-bit). It has very
very few callers.
(2) Add instruction_pointer_validate() to go along with
instruction_pointer_set().
That should be enough to solve the problem, right?
----- On Jun 29, 2018, at 4:39 PM, Andy Lutomirski [email protected] wrote:
> On Fri, Jun 29, 2018 at 12:48 PM, Mathieu Desnoyers
> <[email protected]> wrote:
>> There are two aspects I'm concerned about here:
>>
>> 1) security: we don't want 32-bit user-space to feed a 64-bit value over 4GB
>> as abort_ip that may end up causing OOPSes on architectures that would
>> lack proper validation of those values on return to userspace.
>
> I'm not too worried about this. As long as you're doing it from
> signal-delivery context (which you are AFAICT) you're fine.
No, it's not just signal-delivery context. It's _also_ called from
return to usermode loop, which can by called on return from
interrupt/trap/syscall.
>
> But I re-read the code and I think I have a really straightforward
> solution. Two choices:
>
> (1) Change instruction_pointer_set() to return an error code if the
> address passed in is garbage in a way that could cause unexpected
> behavior (like >=2^32 on x86_64 if regs->cs is 32-bit). It has very
> very few callers.
This would take care of my security concern wrt abort_ip, but would not
provide consistent behavior for the other fields. Also, perhaps this
kind of change should aim the next merge window ?
>
> (2) Add instruction_pointer_validate() to go along with
> instruction_pointer_set().
>
> That should be enough to solve the problem, right?
This would only handle the "security" part of the matter, which
is specifically related to rseq->rseq_cs->abort_ip.
What is left is ensuring that we have consistent behavior for
other fields:
[ Note: we have introduced this helper macro: LINUX_FIELD_u32_u64
which defines a field which is 64-bit for 64-bit processes, and 32-bit
with 32-bit of padding for 32-bit processes. ]
* rseq->rseq_cs: (userspace pointer to user-space, updated by user-space
with single-copy atomicity): current type: LINUX_FIELD_u32_u64,
cannot be changed to __u64 due to single-copy atomicity requirement,
* rseq->rseq_cs->start_ip: currently a LINUX_FIELD_u32_u64,
could become a __u64,
* rseq->rseq_cs->post_commit_ip: currently a LINUX_FIELD_u32_u64,
could become a __u64,
* rseq->rseq_cs->abort_ip: currently a LINUX_FIELD_u32_u64,
could become a __u64,
For abort_ip, changing the type to __u64 and using the
instruction_pointer_validate() approach you propose would work.
For start_ip and post_commit_ip, we need to decide whether we
want to kill a 32-bit process setting the high bits or if we just
accept and use the full __u64 content on both 32-bit and 64-bit
kernels. Those two fields are only used for arithmetic comparison.
Using the full __u64 content means using 64-bit arithmetic on
32-bit native kernels though.
If we decide to kill the offending process (whether the field type is
__u64 or LINUX_FIELD_u32_u64), we need to be able to figure out if
the process is a compat task when called from signal delivery and from
return to usermode loop (return from irq/trap/syscall).
Comparison with TASK_SIZE solves the issue for abort_ip, post_commit_ip
and start_ip, although nobody seems to like that approach very much.
For rseq->rseq_cs, we cannot use __u64 due to single-copy atomicity
update requirement for 32-bit processes. However, we are using this
field in a copy_from_user(), so it will EFAULT if the high-bits are
set by a compat 32-bit task on a 64-bit kernel. We can therefore check
that the padding is zeroed explicitly on a native 32-bit kernel to
provide a consistent behavior. Specifically because rseq->rseq_cs is
checked with access_ok(), it is therefore enough to check the padding
when __LP64__ is not defined by the preprocessor.
But rather than trying to play games with input validation, I would
favor an approach that would allow rseq to validate all its inputs
straightforwardly. Introducing user_64bit_mode(struct pt_regs *)
across all architectures would allow doing just that. rseq signal
delivery and return to usermode code could then ensure that high bits are
cleared by 32-bit tasks for all fields and thus provide a consistent
behavior for 32-bit tasks running on 32-bit and 64-bit kernels.
Thoughts ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
----- On Jul 2, 2018, at 10:32 AM, Mathieu Desnoyers [email protected] wrote:
[...]
>
> But rather than trying to play games with input validation, I would
> favor an approach that would allow rseq to validate all its inputs
> straightforwardly. Introducing user_64bit_mode(struct pt_regs *)
> across all architectures would allow doing just that. rseq signal
> delivery and return to usermode code could then ensure that high bits are
> cleared by 32-bit tasks for all fields and thus provide a consistent
> behavior for 32-bit tasks running on 32-bit and 64-bit kernels.
AFAIU this could be achieved by re-introducing is_compat_task() on x86 as:
#ifdef CONFIG_COMPAT
static bool is_compat_task(void)
{
return user_64bit_mode(current_pt_regs()));
}
#else
static bool is_compat_task(void) { return false; };
#endif
Or am I missing something ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Mon, Jul 2, 2018 at 7:32 AM Mathieu Desnoyers
<[email protected]> wrote:
>
> ----- On Jun 29, 2018, at 4:39 PM, Andy Lutomirski [email protected] wrote:
>
> > On Fri, Jun 29, 2018 at 12:48 PM, Mathieu Desnoyers
> > <[email protected]> wrote:
> >> There are two aspects I'm concerned about here:
> >>
> >> 1) security: we don't want 32-bit user-space to feed a 64-bit value over 4GB
> >> as abort_ip that may end up causing OOPSes on architectures that would
> >> lack proper validation of those values on return to userspace.
> >
> > I'm not too worried about this. As long as you're doing it from
> > signal-delivery context (which you are AFAICT) you're fine.
>
> No, it's not just signal-delivery context. It's _also_ called from
> return to usermode loop, which can by called on return from
> interrupt/trap/syscall.
>
TIF_NOTIFY_RESUME context in the exit slowpath is fine, too.
> >
> > But I re-read the code and I think I have a really straightforward
> > solution. Two choices:
> >
> > (1) Change instruction_pointer_set() to return an error code if the
> > address passed in is garbage in a way that could cause unexpected
> > behavior (like >=2^32 on x86_64 if regs->cs is 32-bit). It has very
> > very few callers.
>
> This would take care of my security concern wrt abort_ip, but would not
> provide consistent behavior for the other fields. Also, perhaps this
> kind of change should aim the next merge window ?
It's not about security. The idea is that instruction_pointer_set()
should return some indication of whether it actually set the
instruction pointer to the requested value. On x86, if you have
!user_64bit_mode(regs) and you call instruction_pointer_set() to set
ip to 0xbaadc0de12345678, then you end up with a state where we will
probably execute user code at the address 0x12345678. Conversely, if
you have user_64bit_mode(regs) == true and you set ip to
0xbaadc0de12345678, then you will end up sending a signal to the task
because 0xbaadc0de12345678 is not executable (and, in fact, is highly
likely to be noncanonical).
So I would argue that the semantics *should* be:
/*
* Attempts to modify @regs such that the next user instruction to be
executed is
* the instruction at @addr. instruction_pointer_set() may return
false to indicate
* that addr was invalid in the sense that the next user instruction executed
* might be some other address instead. The most likely cause is that
* regs refers to a 32-bit compat context, addr != (u32)addr, and the
architecture
* might silently truncate the address on the next return to user code.
*
* instruction_pointer_set() must only be called from a context in
which the architecture
* allows arbitrary modifications of @regs.
*
* Architecture implementations promise that calling
instruction_pointer_set() will not
* crash or otherwise corrupt the kernel when called from a valid
context, regardless
* of what value is passed in @addr.
*/
bool instruction_pointer_set(struct pt_regs *regs, unsigned long addr);
>
> >
> > (2) Add instruction_pointer_validate() to go along with
> > instruction_pointer_set().
> >
> > That should be enough to solve the problem, right?
>
> This would only handle the "security" part of the matter, which
> is specifically related to rseq->rseq_cs->abort_ip.
>
> What is left is ensuring that we have consistent behavior for
> other fields:
>
> [ Note: we have introduced this helper macro: LINUX_FIELD_u32_u64
> which defines a field which is 64-bit for 64-bit processes, and 32-bit
> with 32-bit of padding for 32-bit processes. ]
>
> * rseq->rseq_cs: (userspace pointer to user-space, updated by user-space
> with single-copy atomicity): current type: LINUX_FIELD_u32_u64,
> cannot be changed to __u64 due to single-copy atomicity requirement,
>
> * rseq->rseq_cs->start_ip: currently a LINUX_FIELD_u32_u64,
> could become a __u64,
>
> * rseq->rseq_cs->post_commit_ip: currently a LINUX_FIELD_u32_u64,
> could become a __u64,
>
> * rseq->rseq_cs->abort_ip: currently a LINUX_FIELD_u32_u64,
> could become a __u64,
>
> For abort_ip, changing the type to __u64 and using the
> instruction_pointer_validate() approach you propose would work.
>
> For start_ip and post_commit_ip, we need to decide whether we
> want to kill a 32-bit process setting the high bits or if we just
> accept and use the full __u64 content on both 32-bit and 64-bit
> kernels. Those two fields are only used for arithmetic comparison.
> Using the full __u64 content means using 64-bit arithmetic on
> 32-bit native kernels though.
Just use the 64-bit values, I think. I see no point in killing the task.
>
> For rseq->rseq_cs, we cannot use __u64 due to single-copy atomicity
> update requirement for 32-bit processes. However, we are using this
> field in a copy_from_user(), so it will EFAULT if the high-bits are
> set by a compat 32-bit task on a 64-bit kernel. We can therefore check
> that the padding is zeroed explicitly on a native 32-bit kernel to
> provide a consistent behavior. Specifically because rseq->rseq_cs is
> checked with access_ok(), it is therefore enough to check the padding
> when __LP64__ is not defined by the preprocessor.
Agreed.
>
> But rather than trying to play games with input validation, I would
> favor an approach that would allow rseq to validate all its inputs
> straightforwardly. Introducing user_64bit_mode(struct pt_regs *)
> across all architectures would allow doing just that.
I would be okay with that, too, but I think it would have to be
user_64bit_mode(task, regs), since
sane architectures would have the task bitness somewhere other than in
regs. x86 is IMO rather
weird in this regard. When I added user_64bit_mode(), I didn't
envision its use outside x86 arch code.
> AFAIU this could be achieved by re-introducing is_compat_task() on x86 as:
>
> #ifdef CONFIG_COMPAT
> static bool is_compat_task(void)
> {
> return user_64bit_mode(current_pt_regs()));
> }
> #else
> static bool is_compat_task(void) { return false; };
> #endif
>
> Or am I missing something ?
is_compat_task() historically literally meant "am I in a compat system
call". It never worked consistently on x86 outside of syscall
context. While I do have fundamental objections to having a generic
concept of "is this a compat task?" on Linux, that's not why I removed
is_compat_task(). I removed it because it didn't do what the name
suggested.
Unfortunately, while it's gone from generic code, it's still there on
non-x86 arches, and it probably still has inconsistent semantics. So
I don't want to re-add it.
But I think that the limited solution of changing
instruction_pointer_set() really is a sufficient
architecture-dependent change to fully solve your problem.
----- On Jul 2, 2018, at 1:11 PM, Andy Lutomirski [email protected] wrote:
>
> But I think that the limited solution of changing
> instruction_pointer_set() really is a sufficient
> architecture-dependent change to fully solve your problem.
So let me recap with the changes I gather for 4.18 and 4.19:
4.18:
* Change struct rseq_cs field types from LINUX_FIELD_u32_u64() to __u64 in
uapi/linux/rseq.h,
* Compare rseq->rseq_cs->abort_ip with TASK_SIZE before using it. Kill offending
process if its value is over TASK_SIZE,
* Explicitly check that padding of rseq->rseq_cs is zero on 32-bit kernels
(#ifndef __LP64__).
4.19:
* Introduce instruction_pointer_set() with input validation, use it when setting
IP to abort_ip in rseq. This replaces the comparison of abort_ip with TASK_SIZE.
Is that consistent with what you have in mind ?
Thanks,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
On Mon, Jul 2, 2018 at 12:00 PM, Mathieu Desnoyers
<[email protected]> wrote:
> ----- On Jul 2, 2018, at 1:11 PM, Andy Lutomirski [email protected] wrote:
>>
>> But I think that the limited solution of changing
>> instruction_pointer_set() really is a sufficient
>> architecture-dependent change to fully solve your problem.
>
> So let me recap with the changes I gather for 4.18 and 4.19:
>
> 4.18:
>
> * Change struct rseq_cs field types from LINUX_FIELD_u32_u64() to __u64 in
> uapi/linux/rseq.h,
> * Compare rseq->rseq_cs->abort_ip with TASK_SIZE before using it. Kill offending
> process if its value is over TASK_SIZE,
> * Explicitly check that padding of rseq->rseq_cs is zero on 32-bit kernels
> (#ifndef __LP64__).
>
> 4.19:
>
> * Introduce instruction_pointer_set() with input validation, use it when setting
> IP to abort_ip in rseq. This replaces the comparison of abort_ip with TASK_SIZE.
>
> Is that consistent with what you have in mind ?
>
Works for me. Linus, any objection?
On Mon, Jul 2, 2018 at 12:02 PM Andy Lutomirski <[email protected]> wrote:
>
> Works for me. Linus, any objection?
I think the 4.19 stage may be overkill, but I don't hate it, so no
real objections.
If the main reason for this is that we silently clear the upper bits
when returning to compat mode, I actually think that a better fix
would be to just fix that. We shouldn't silently ignore bogus data in
the return path.
But I don't care enough, I think.
Linus
On Mon, Jul 2, 2018 at 12:31 PM, Linus Torvalds
<[email protected]> wrote:
> On Mon, Jul 2, 2018 at 12:02 PM Andy Lutomirski <[email protected]> wrote:
>>
>> Works for me. Linus, any objection?
>
> I think the 4.19 stage may be overkill, but I don't hate it, so no
> real objections.
>
> If the main reason for this is that we silently clear the upper bits
> when returning to compat mode, I actually think that a better fix
> would be to just fix that. We shouldn't silently ignore bogus data in
> the return path.
>
> But I don't care enough, I think.
Like this:
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b81918..ec40223c8856 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -170,6 +170,26 @@ static void exit_to_usermode_loop(struct pt_regs
*regs, u32 cached_flags)
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
+ if (unlikely(!user_64bit_mode(regs) &&
+ (regs->ip & 0xffffffff00000000ull))) {
+ siginfo_t info;
+ struct task_struct *tsk = current;
+
+ /* I haven't thought about this *that* hard. */
+ clear_siginfo(&info);
+ tsk->thread.cr2 = regs->ip;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_INSTR;
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)regs->ip;
+ /* si_addr_lsb? */
+ force_sig_info(SIGSEGV, &info, tsk);
+
+ /* And we'll go through the loop again. */
+ }
+
/* Disable IRQs and retry */
local_irq_disable();
It's whitespace damaged and barely tested, but it seems to at least
not be completely busted. I don't really love doing this, though.
On Mon, Jul 2, 2018 at 1:13 PM Andy Lutomirski <[email protected]> wrote:
>
> Like this:
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 3b2490b81918..ec40223c8856 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -170,6 +170,26 @@ static void exit_to_usermode_loop(struct pt_regs
> *regs, u32 cached_flags)
> if (cached_flags & _TIF_USER_RETURN_NOTIFY)
> fire_user_return_notifiers();
>
> + if (unlikely(!user_64bit_mode(regs) &&
> + (regs->ip & 0xffffffff00000000ull))) {
I'd be afraid that code generation is atrocious. So more something like this:
static noinline send_sigsegv(..) { }
...
if (unlikely(!user_64bit_mode(regs)) {
if (unlikely(*(1+(u32 *)®s->ip)))
send_sigsegv(tsk);
to make sure it doesn't do crazy big constants in the normal path, and
doesn't allocate silly stack frames.
But as mentioned, I'm not entirely convinced this is worth it. But I
wasn't sure it's worth it for rseq.
So basically, *if* we do these kinds of checks, I'd personally rather
do a *generic* "we don't return to garbage 64-bit values in compat
mode" than have special case code that is only for rseq and is truly
irrelevant to all normal cases.
The generic case might even be worth a test-case. And if we do that,
we should probably check that we don't do something odd like
sign-extending the %rip value we load from stack for signal return
etc, that just happens to work because nobody cared about the upper
bits.
So there's a lot of these kinds of small details that are of
questionable importance, but might in theory be worth it.
Linus