arch_prctl(ARCH_X86_CET_STATUS, u64 *args)
Get CET feature status.
The parameter 'args' is a pointer to a user buffer. The kernel returns
the following information:
*args = shadow stack/IBT status
*(args + 1) = shadow stack base address
*(args + 2) = shadow stack size
arch_prctl(ARCH_X86_CET_DISABLE, u64 features)
Disable CET features specified in 'features'. Return -EPERM if CET is
locked.
arch_prctl(ARCH_X86_CET_LOCK)
Lock in CET features.
arch_prctl(ARCH_X86_CET_MMAP_SHSTK, u64 *args)
Allocate a new shadow stack.
The parameter 'args' is a pointer to a user buffer.
*args = desired size
*(args + 1) = MAP_32BIT or MAP_POPULATE
On returning, *args is the allocated shadow stack address.
Also change do_arch_prctl_common()'s parameter 'cpuid_enabled' to
'arg2', as it is now also passed to prctl_cet().
Signed-off-by: Yu-cheng Yu <[email protected]>
---
v11:
- Check input for invalid features.
- Fix prctl_cet() return values.
- Change ARCH_X86_CET_ALLOC_SHSTK to ARCH_X86_CET_MMAP_SHSTK to take
MAP_32BIT, MAP_POPULATE as inputs.
v10:
- Verify CET is enabled before handling arch_prctl.
- Change input parameters from unsigned long to u64, to make it clear they
are 64-bit.
arch/x86/include/asm/cet.h | 4 +
arch/x86/include/uapi/asm/prctl.h | 5 ++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/cet.c | 26 +++++++
arch/x86/kernel/cet_prctl.c | 98 +++++++++++++++++++++++++
arch/x86/kernel/process.c | 6 +-
tools/arch/x86/include/uapi/asm/prctl.h | 5 ++
7 files changed, 142 insertions(+), 4 deletions(-)
create mode 100644 arch/x86/kernel/cet_prctl.c
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 71dc92acd2f2..f7eb197998ad 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -14,16 +14,20 @@ struct sc_ext;
struct cet_status {
unsigned long shstk_base;
unsigned long shstk_size;
+ unsigned int locked:1;
};
#ifdef CONFIG_X86_INTEL_CET
+int prctl_cet(int option, u64 arg2);
int cet_setup_shstk(void);
int cet_setup_thread_shstk(struct task_struct *p);
+unsigned long cet_alloc_shstk(unsigned long size, int flags);
void cet_disable_free_shstk(struct task_struct *p);
int cet_verify_rstor_token(bool ia32, unsigned long ssp, unsigned long *new_ssp);
void cet_restore_signal(struct sc_ext *sc);
int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
#else
+static inline int prctl_cet(int option, u64 arg2) { return -EINVAL; }
static inline int cet_setup_thread_shstk(struct task_struct *p) { return 0; }
static inline void cet_disable_free_shstk(struct task_struct *p) {}
static inline void cet_restore_signal(struct sc_ext *sc) { return; }
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..3aaac13cdc87 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,9 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_X86_CET_STATUS 0x3001
+#define ARCH_X86_CET_DISABLE 0x3002
+#define ARCH_X86_CET_LOCK 0x3003
+#define ARCH_X86_CET_MMAP_SHSTK 0x3004
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 76f27f518266..97556e4204d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -145,7 +145,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_INTEL_CET) += cet.o
+obj-$(CONFIG_X86_INTEL_CET) += cet.o cet_prctl.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index b30c61a66c8e..2bf1a6b6abb6 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -148,6 +148,32 @@ static int create_rstor_token(bool ia32, unsigned long ssp,
return 0;
}
+unsigned long cet_alloc_shstk(unsigned long len, int flags)
+{
+ unsigned long token;
+ unsigned long addr, ssp;
+
+ addr = alloc_shstk(round_up(len, PAGE_SIZE), flags);
+
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ /* Restore token is 8 bytes and aligned to 8 bytes */
+ ssp = addr + len;
+ token = ssp;
+
+ if (!in_ia32_syscall())
+ token |= TOKEN_MODE_64;
+ ssp -= 8;
+
+ if (write_user_shstk_64(ssp, token)) {
+ vm_munmap(addr, len);
+ return -EINVAL;
+ }
+
+ return addr;
+}
+
int cet_setup_shstk(void)
{
unsigned long addr, size;
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index 000000000000..cc49eef08ab0
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+#include <linux/prctl.h>
+#include <linux/compat.h>
+#include <linux/mman.h>
+#include <linux/elfcore.h>
+#include <asm/processor.h>
+#include <asm/prctl.h>
+#include <asm/cet.h>
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int copy_status_to_user(struct cet_status *cet, u64 arg2)
+{
+ u64 buf[3] = {0, 0, 0};
+
+ if (cet->shstk_size) {
+ buf[0] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+ buf[1] = (u64)cet->shstk_base;
+ buf[2] = (u64)cet->shstk_size;
+ }
+
+ return copy_to_user((u64 __user *)arg2, buf, sizeof(buf));
+}
+
+static int handle_mmap_shstk(u64 arg2)
+{
+ u64 buf[3];
+ unsigned long addr, size;
+ int allowed_flags;
+
+ if (copy_from_user(buf, (unsigned long __user *)arg2, sizeof(buf)))
+ return -EFAULT;
+
+ size = buf[0];
+
+ /*
+ * Check invalid flags
+ */
+ allowed_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT | MAP_POPULATE;
+
+ if (buf[1] & ~allowed_flags)
+ return -EINVAL;
+
+ addr = cet_alloc_shstk(size, buf[1]);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ if (put_user(addr, (u64 __user *)arg2)) {
+ vm_munmap(addr, size);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+int prctl_cet(int option, u64 arg2)
+{
+ struct cet_status *cet;
+
+ /*
+ * GLIBC's ENOTSUPP == EOPNOTSUPP == 95, and it does not recognize
+ * the kernel's ENOTSUPP (524). So return EOPNOTSUPP here.
+ */
+ if (!IS_ENABLED(CONFIG_X86_INTEL_CET))
+ return -EOPNOTSUPP;
+
+ cet = ¤t->thread.cet;
+
+ if (option == ARCH_X86_CET_STATUS)
+ return copy_status_to_user(cet, arg2);
+
+ if (!static_cpu_has(X86_FEATURE_SHSTK))
+ return -EOPNOTSUPP;
+
+ switch (option) {
+ case ARCH_X86_CET_DISABLE:
+ if (cet->locked)
+ return -EPERM;
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_INVAL)
+ return -EINVAL;
+ if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+ cet_disable_free_shstk(current);
+ return 0;
+
+ case ARCH_X86_CET_LOCK:
+ cet->locked = 1;
+ return 0;
+
+ case ARCH_X86_CET_MMAP_SHSTK:
+ return handle_mmap_shstk(arg2);
+
+ default:
+ return -ENOSYS;
+ }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6fe5b061841..5a657a9774dc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -980,14 +980,14 @@ unsigned long get_wchan(struct task_struct *p)
}
long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled)
+ unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
- return set_cpuid_mode(task, cpuid_enabled);
+ return set_cpuid_mode(task, arg2);
}
- return -EINVAL;
+ return prctl_cet(option, arg2);
}
diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..3aaac13cdc87 100644
--- a/tools/arch/x86/include/uapi/asm/prctl.h
+++ b/tools/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,9 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_X86_CET_STATUS 0x3001
+#define ARCH_X86_CET_DISABLE 0x3002
+#define ARCH_X86_CET_LOCK 0x3003
+#define ARCH_X86_CET_MMAP_SHSTK 0x3004
+
#endif /* _ASM_X86_PRCTL_H */
--
2.21.0
On Mon, Aug 24, 2020 at 5:30 PM Yu-cheng Yu <[email protected]> wrote:
> arch_prctl(ARCH_X86_CET_MMAP_SHSTK, u64 *args)
> Allocate a new shadow stack.
>
> The parameter 'args' is a pointer to a user buffer.
>
> *args = desired size
> *(args + 1) = MAP_32BIT or MAP_POPULATE
>
> On returning, *args is the allocated shadow stack address.
This is hideous. Would this be better as a new syscall?
--Andy
On 8/24/2020 5:36 PM, Andy Lutomirski wrote:
> On Mon, Aug 24, 2020 at 5:30 PM Yu-cheng Yu <[email protected]> wrote:
>
>> arch_prctl(ARCH_X86_CET_MMAP_SHSTK, u64 *args)
>> Allocate a new shadow stack.
>>
>> The parameter 'args' is a pointer to a user buffer.
>>
>> *args = desired size
>> *(args + 1) = MAP_32BIT or MAP_POPULATE
>>
>> On returning, *args is the allocated shadow stack address.
>
> This is hideous. Would this be better as a new syscall?
Could you point out why this is hideous, so that I can modify the
arch_prctl?
I think this is more arch-specific. Even if it becomes a new syscall,
we still need to pass the same parameters.
Yu-cheng
On 8/25/20 11:43 AM, Yu, Yu-cheng wrote:
>>> arch_prctl(ARCH_X86_CET_MMAP_SHSTK, u64 *args)
>>> Allocate a new shadow stack.
>>>
>>> The parameter 'args' is a pointer to a user buffer.
>>>
>>> *args = desired size
>>> *(args + 1) = MAP_32BIT or MAP_POPULATE
>>>
>>> On returning, *args is the allocated shadow stack address.
>>
>> This is hideous. Would this be better as a new syscall?
>
> Could you point out why this is hideous, so that I can modify the
> arch_prctl?
Passing values in memory is hideous when we don't have to. A syscall
would let you have separate arguments for size and flags and would also
let you have a nice return value instead of needing to do that in memory
too.
> I think this is more arch-specific. Even if it becomes a new syscall,
> we still need to pass the same parameters.
Right, but without the copying in and out of memory.
On 8/25/2020 12:19 PM, Dave Hansen wrote:
> On 8/25/20 11:43 AM, Yu, Yu-cheng wrote:
>>>> arch_prctl(ARCH_X86_CET_MMAP_SHSTK, u64 *args)
>>>> Allocate a new shadow stack.
>>>>
>>>> The parameter 'args' is a pointer to a user buffer.
>>>>
>>>> *args = desired size
>>>> *(args + 1) = MAP_32BIT or MAP_POPULATE
>>>>
>>>> On returning, *args is the allocated shadow stack address.
>>>
>>> This is hideous. Would this be better as a new syscall?
>>
>> Could you point out why this is hideous, so that I can modify the
>> arch_prctl?
>
> Passing values in memory is hideous when we don't have to. A syscall
> would let you have separate arguments for size and flags and would also
> let you have a nice return value instead of needing to do that in memory
> too.
That is a good justification.
>
>> I think this is more arch-specific. Even if it becomes a new syscall,
>> we still need to pass the same parameters.
>
> Right, but without the copying in and out of memory.
>
Linux-api is already on the Cc list. Do we need to add more people to
get some agreements for the syscall?
On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
>>> I think this is more arch-specific. Even if it becomes a new syscall,
>>> we still need to pass the same parameters.
>>
>> Right, but without the copying in and out of memory.
>>
> Linux-api is already on the Cc list. Do we need to add more people to
> get some agreements for the syscall?
What kind of agreement are you looking for? I'd suggest just coding it
up and posting the patches. Adding syscalls really is really pretty
straightforward and isn't much code at all.
On 8/25/2020 4:20 PM, Dave Hansen wrote:
> On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
>>>> I think this is more arch-specific. Even if it becomes a new syscall,
>>>> we still need to pass the same parameters.
>>>
>>> Right, but without the copying in and out of memory.
>>>
>> Linux-api is already on the Cc list. Do we need to add more people to
>> get some agreements for the syscall?
> What kind of agreement are you looking for? I'd suggest just coding it
> up and posting the patches. Adding syscalls really is really pretty
> straightforward and isn't much code at all.
>
Sure, I will do that.
Thanks,
Yu-cheng
On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
> On 8/25/2020 4:20 PM, Dave Hansen wrote:
> >On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
> >>>>I think this is more arch-specific.? Even if it becomes a new syscall,
> >>>>we still need to pass the same parameters.
> >>>
> >>>Right, but without the copying in and out of memory.
> >>>
> >>Linux-api is already on the Cc list.? Do we need to add more people to
> >>get some agreements for the syscall?
> >What kind of agreement are you looking for? I'd suggest just coding it
> >up and posting the patches. Adding syscalls really is really pretty
> >straightforward and isn't much code at all.
> >
>
> Sure, I will do that.
Alternatively, would a regular prctl() work here?
arch_prctl() feels like a historical weirdness for x86 -- other arches
all seem to be using regular prctl(), which allows for 4 args. I don't
know the history behind the difference here.
(Since prctl() and arch_prctl() use non-clashing command numbers, I had
wondered whether it would be worth just merging the x86 calls in with
the rest and making the two calls aliases. That's one for later,
though...)
Cheers
---Dave
* Dave Martin:
> On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
>> On 8/25/2020 4:20 PM, Dave Hansen wrote:
>> >On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
>> >>>>I think this is more arch-specific. Even if it becomes a new syscall,
>> >>>>we still need to pass the same parameters.
>> >>>
>> >>>Right, but without the copying in and out of memory.
>> >>>
>> >>Linux-api is already on the Cc list. Do we need to add more people to
>> >>get some agreements for the syscall?
>> >What kind of agreement are you looking for? I'd suggest just coding it
>> >up and posting the patches. Adding syscalls really is really pretty
>> >straightforward and isn't much code at all.
>> >
>>
>> Sure, I will do that.
>
> Alternatively, would a regular prctl() work here?
Is this something appliation code has to call, or just the dynamic
loader?
prctl in glibc is a variadic function, so if there's a mismatch between
the kernel/userspace syscall convention and the userspace calling
convention (for variadic functions) for specific types, it can't be made
to work in a generic way.
The loader can use inline assembly for system calls and does not have
this issue, but applications would be implcated by it.
Thanks,
Florian
On Wed, Aug 26, 2020 at 9:52 AM Florian Weimer <[email protected]> wrote:
>
> * Dave Martin:
>
> > On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
> >> On 8/25/2020 4:20 PM, Dave Hansen wrote:
> >> >On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
> >> >>>>I think this is more arch-specific. Even if it becomes a new syscall,
> >> >>>>we still need to pass the same parameters.
> >> >>>
> >> >>>Right, but without the copying in and out of memory.
> >> >>>
> >> >>Linux-api is already on the Cc list. Do we need to add more people to
> >> >>get some agreements for the syscall?
> >> >What kind of agreement are you looking for? I'd suggest just coding it
> >> >up and posting the patches. Adding syscalls really is really pretty
> >> >straightforward and isn't much code at all.
> >> >
> >>
> >> Sure, I will do that.
> >
> > Alternatively, would a regular prctl() work here?
>
> Is this something appliation code has to call, or just the dynamic
> loader?
>
> prctl in glibc is a variadic function, so if there's a mismatch between
> the kernel/userspace syscall convention and the userspace calling
> convention (for variadic functions) for specific types, it can't be made
> to work in a generic way.
>
> The loader can use inline assembly for system calls and does not have
> this issue, but applications would be implcated by it.
>
I would expect things like Go and various JITs to call it directly.
If we wanted to be fancy and add a potentially more widely useful
syscall, how about:
mmap_special(void *addr, size_t length, int prot, int flags, int type);
Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
this is really just mmap() except that we want to map something a bit
magical, and we don't want to require opening a device node to do it.
--Andy
On Wed, Aug 26, 2020 at 06:51:48PM +0200, Florian Weimer wrote:
> * Dave Martin:
>
> > On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
> >> On 8/25/2020 4:20 PM, Dave Hansen wrote:
> >> >On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
> >> >>>>I think this is more arch-specific.? Even if it becomes a new syscall,
> >> >>>>we still need to pass the same parameters.
> >> >>>
> >> >>>Right, but without the copying in and out of memory.
> >> >>>
> >> >>Linux-api is already on the Cc list.? Do we need to add more people to
> >> >>get some agreements for the syscall?
> >> >What kind of agreement are you looking for? I'd suggest just coding it
> >> >up and posting the patches. Adding syscalls really is really pretty
> >> >straightforward and isn't much code at all.
> >> >
> >>
> >> Sure, I will do that.
> >
> > Alternatively, would a regular prctl() work here?
>
> Is this something appliation code has to call, or just the dynamic
> loader?
>
> prctl in glibc is a variadic function, so if there's a mismatch between
> the kernel/userspace syscall convention and the userspace calling
> convention (for variadic functions) for specific types, it can't be made
> to work in a generic way.
>
> The loader can use inline assembly for system calls and does not have
> this issue, but applications would be implcated by it.
To the extent that this is a problem, libc's prctl() wrapper has to
handle it already. New prctl() calls tend to demand precisely 4
arguments and require unused arguments to be 0, but this is more down to
policy rather than because anything breaks otherwise.
You're right that this has implications: for i386, libc probably pulls
more arguments off the stack than are really there in some situations.
This isn't a new problem though. There are already generic prctls with
fewer than 4 args that are used on x86.
Merging the actual prctl() and arch_prctl() syscalls doesn't acutally
stop libc from retaining separate wrappers if they have different
argument marshaling requirements in some corner cases.
There might be some underlying reason by x86 has its own call and nobody
else followed the same model, but I don't know what it is.
Cheers
---Dave
On 8/26/2020 10:04 AM, Andy Lutomirski wrote:
> On Wed, Aug 26, 2020 at 9:52 AM Florian Weimer <[email protected]> wrote:
>>
>> * Dave Martin:
>>
>>> On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
>>>> On 8/25/2020 4:20 PM, Dave Hansen wrote:
>>>>> On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
>>>>>>>> I think this is more arch-specific. Even if it becomes a new syscall,
>>>>>>>> we still need to pass the same parameters.
>>>>>>>
>>>>>>> Right, but without the copying in and out of memory.
>>>>>>>
>>>>>> Linux-api is already on the Cc list. Do we need to add more people to
>>>>>> get some agreements for the syscall?
>>>>> What kind of agreement are you looking for? I'd suggest just coding it
>>>>> up and posting the patches. Adding syscalls really is really pretty
>>>>> straightforward and isn't much code at all.
>>>>>
>>>>
>>>> Sure, I will do that.
>>>
>>> Alternatively, would a regular prctl() work here?
>>
>> Is this something appliation code has to call, or just the dynamic
>> loader?
>>
>> prctl in glibc is a variadic function, so if there's a mismatch between
>> the kernel/userspace syscall convention and the userspace calling
>> convention (for variadic functions) for specific types, it can't be made
>> to work in a generic way.
>>
>> The loader can use inline assembly for system calls and does not have
>> this issue, but applications would be implcated by it.
>>
>
> I would expect things like Go and various JITs to call it directly.
>
> If we wanted to be fancy and add a potentially more widely useful
> syscall, how about:
>
> mmap_special(void *addr, size_t length, int prot, int flags, int type);
>
> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
> this is really just mmap() except that we want to map something a bit
> magical, and we don't want to require opening a device node to do it.
>
One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
Does ARM have similar needs for memory mapping, Dave?
Yu-cheng
On Wed, Aug 26, 2020 at 11:49 AM Yu, Yu-cheng <[email protected]> wrote:
>
> On 8/26/2020 10:04 AM, Andy Lutomirski wrote:
> > On Wed, Aug 26, 2020 at 9:52 AM Florian Weimer <[email protected]> wrote:
> >>
> >> * Dave Martin:
> >>
> >>> On Tue, Aug 25, 2020 at 04:34:27PM -0700, Yu, Yu-cheng wrote:
> >>>> On 8/25/2020 4:20 PM, Dave Hansen wrote:
> >>>>> On 8/25/20 2:04 PM, Yu, Yu-cheng wrote:
> >>>>>>>> I think this is more arch-specific. Even if it becomes a new syscall,
> >>>>>>>> we still need to pass the same parameters.
> >>>>>>>
> >>>>>>> Right, but without the copying in and out of memory.
> >>>>>>>
> >>>>>> Linux-api is already on the Cc list. Do we need to add more people to
> >>>>>> get some agreements for the syscall?
> >>>>> What kind of agreement are you looking for? I'd suggest just coding it
> >>>>> up and posting the patches. Adding syscalls really is really pretty
> >>>>> straightforward and isn't much code at all.
> >>>>>
> >>>>
> >>>> Sure, I will do that.
> >>>
> >>> Alternatively, would a regular prctl() work here?
> >>
> >> Is this something appliation code has to call, or just the dynamic
> >> loader?
> >>
> >> prctl in glibc is a variadic function, so if there's a mismatch between
> >> the kernel/userspace syscall convention and the userspace calling
> >> convention (for variadic functions) for specific types, it can't be made
> >> to work in a generic way.
> >>
> >> The loader can use inline assembly for system calls and does not have
> >> this issue, but applications would be implcated by it.
> >>
> >
> > I would expect things like Go and various JITs to call it directly.
> >
> > If we wanted to be fancy and add a potentially more widely useful
> > syscall, how about:
> >
> > mmap_special(void *addr, size_t length, int prot, int flags, int type);
> >
> > Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
> > this is really just mmap() except that we want to map something a bit
> > magical, and we don't want to require opening a device node to do it.
> >
>
> One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
> Does ARM have similar needs for memory mapping, Dave?
>
arch/arm64/include/uapi/asm/mman.h:
#define PROT_BTI 0x10 /* BTI guarded page */
--
H.J.
On 8/26/20 11:49 AM, Yu, Yu-cheng wrote:
>> I would expect things like Go and various JITs to call it directly.
>>
>> If we wanted to be fancy and add a potentially more widely useful
>> syscall, how about:
>>
>> mmap_special(void *addr, size_t length, int prot, int flags, int type);
>>
>> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
>> this is really just mmap() except that we want to map something a bit
>> magical, and we don't want to require opening a device node to do it.
>
> One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
> Does ARM have similar needs for memory mapping, Dave?
No idea.
But, mmap_special() is *basically* mmap2() with extra-big flags space.
I suspect it will grow some more uses on top of shadow stacks. It could
have, for instance, been used to allocate MPX bounds tables.
* Dave Martin:
> You're right that this has implications: for i386, libc probably pulls
> more arguments off the stack than are really there in some situations.
> This isn't a new problem though. There are already generic prctls with
> fewer than 4 args that are used on x86.
As originally posted, glibc prctl would have to know that it has to pull
an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
then the u64 argument is a problem for arch_prctl as well.
Thanks,
Florian
* H. J. Lu:
> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>>
>> * Dave Martin:
>>
>> > You're right that this has implications: for i386, libc probably pulls
>> > more arguments off the stack than are really there in some situations.
>> > This isn't a new problem though. There are already generic prctls with
>> > fewer than 4 args that are used on x86.
>>
>> As originally posted, glibc prctl would have to know that it has to pull
>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
>> then the u64 argument is a problem for arch_prctl as well.
>>
>
> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
The commit message and the C source say otherwise, I think (not sure
about the C source, not a kernel hacker).
Thanks,
Florian
On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>
> * Dave Martin:
>
> > You're right that this has implications: for i386, libc probably pulls
> > more arguments off the stack than are really there in some situations.
> > This isn't a new problem though. There are already generic prctls with
> > fewer than 4 args that are used on x86.
>
> As originally posted, glibc prctl would have to know that it has to pull
> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> then the u64 argument is a problem for arch_prctl as well.
>
Argument of ARCH_X86_CET_DISABLE is int and passed in register.
--
H.J.
On Thu, Aug 27, 2020 at 7:07 AM H.J. Lu <[email protected]> wrote:
>
> On Thu, Aug 27, 2020 at 6:36 AM Florian Weimer <[email protected]> wrote:
> >
> > * H. J. Lu:
> >
> > > On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
> > >>
> > >> * Dave Martin:
> > >>
> > >> > You're right that this has implications: for i386, libc probably pulls
> > >> > more arguments off the stack than are really there in some situations.
> > >> > This isn't a new problem though. There are already generic prctls with
> > >> > fewer than 4 args that are used on x86.
> > >>
> > >> As originally posted, glibc prctl would have to know that it has to pull
> > >> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> > >> then the u64 argument is a problem for arch_prctl as well.
> > >>
> > >
> > > Argument of ARCH_X86_CET_DISABLE is int and passed in register.
> >
> > The commit message and the C source say otherwise, I think (not sure
> > about the C source, not a kernel hacker).
>
> It should read:
>
> arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
>
Or
arch_prctl(ARCH_X86_CET_DISABLE, unsigned int features)
--
H.J.
On Thu, Aug 27, 2020 at 6:36 AM Florian Weimer <[email protected]> wrote:
>
> * H. J. Lu:
>
> > On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
> >>
> >> * Dave Martin:
> >>
> >> > You're right that this has implications: for i386, libc probably pulls
> >> > more arguments off the stack than are really there in some situations.
> >> > This isn't a new problem though. There are already generic prctls with
> >> > fewer than 4 args that are used on x86.
> >>
> >> As originally posted, glibc prctl would have to know that it has to pull
> >> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> >> then the u64 argument is a problem for arch_prctl as well.
> >>
> >
> > Argument of ARCH_X86_CET_DISABLE is int and passed in register.
>
> The commit message and the C source say otherwise, I think (not sure
> about the C source, not a kernel hacker).
It should read:
arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
--
H.J.
On Wed, Aug 26, 2020 at 12:57 PM Dave Hansen <[email protected]> wrote:
>
> On 8/26/20 11:49 AM, Yu, Yu-cheng wrote:
> >> I would expect things like Go and various JITs to call it directly.
> >>
> >> If we wanted to be fancy and add a potentially more widely useful
> >> syscall, how about:
> >>
> >> mmap_special(void *addr, size_t length, int prot, int flags, int type);
> >>
> >> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
> >> this is really just mmap() except that we want to map something a bit
> >> magical, and we don't want to require opening a device node to do it.
> >
> > One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
> > Does ARM have similar needs for memory mapping, Dave?
>
> No idea.
>
> But, mmap_special() is *basically* mmap2() with extra-big flags space.
> I suspect it will grow some more uses on top of shadow stacks. It could
> have, for instance, been used to allocate MPX bounds tables.
There is no reason we can't use
long arch_prctl (int, unsigned long, unsigned long, unsigned long, ..);
for ARCH_X86_CET_MMAP_SHSTK. We just need to use
syscall (SYS_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, ...);
--
H.J.
On 8/27/2020 6:36 AM, Florian Weimer wrote:
> * H. J. Lu:
>
>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>>>
>>> * Dave Martin:
>>>
>>>> You're right that this has implications: for i386, libc probably pulls
>>>> more arguments off the stack than are really there in some situations.
>>>> This isn't a new problem though. There are already generic prctls with
>>>> fewer than 4 args that are used on x86.
>>>
>>> As originally posted, glibc prctl would have to know that it has to pull
>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
>>> then the u64 argument is a problem for arch_prctl as well.
>>>
>>
>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
>
> The commit message and the C source say otherwise, I think (not sure
> about the C source, not a kernel hacker).
>
H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments,
and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags
and size are all in registers, this also solves problems being pointed
out earlier. Without a wrapper, the shadow stack mmap call (from user
space) will be:
syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
I think this would be a nice alternative to another new syscall.
If this looks good to everyone, I can send out new patches as response
to my current version, and then after all issues fixed, send v12.
Thanks,
Yu-cheng
> On Aug 27, 2020, at 11:13 AM, Yu, Yu-cheng <[email protected]> wrote:
>
> On 8/27/2020 6:36 AM, Florian Weimer wrote:
>> * H. J. Lu:
>>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>>>>>
>>>>> * Dave Martin:
>>>>>
>>>>>> You're right that this has implications: for i386, libc probably pulls
>>>>>> more arguments off the stack than are really there in some situations.
>>>>>> This isn't a new problem though. There are already generic prctls with
>>>>>> fewer than 4 args that are used on x86.
>>>>>
>>>>> As originally posted, glibc prctl would have to know that it has to pull
>>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
>>>>> then the u64 argument is a problem for arch_prctl as well.
>>>>>
>>>
>>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
>> The commit message and the C source say otherwise, I think (not sure
>> about the C source, not a kernel hacker).
>
> H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments, and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags and size are all in registers, this also solves problems being pointed out earlier. Without a wrapper, the shadow stack mmap call (from user space) will be:
>
> syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
I admit I don’t see a show stopping technical reason we can’t add arguments to an existing syscall, but I’m pretty sure it’s unprecedented, and it doesn’t seem like a good idea.
On 8/27/2020 11:56 AM, Andy Lutomirski wrote:
>
>
>> On Aug 27, 2020, at 11:13 AM, Yu, Yu-cheng <[email protected]> wrote:
>>
>> On 8/27/2020 6:36 AM, Florian Weimer wrote:
>>> * H. J. Lu:
>>>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>>>>>>
>>>>>> * Dave Martin:
>>>>>>
>>>>>>> You're right that this has implications: for i386, libc probably pulls
>>>>>>> more arguments off the stack than are really there in some situations.
>>>>>>> This isn't a new problem though. There are already generic prctls with
>>>>>>> fewer than 4 args that are used on x86.
>>>>>>
>>>>>> As originally posted, glibc prctl would have to know that it has to pull
>>>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
>>>>>> then the u64 argument is a problem for arch_prctl as well.
>>>>>>
>>>>
>>>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
>>> The commit message and the C source say otherwise, I think (not sure
>>> about the C source, not a kernel hacker).
>>
>> H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments, and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags and size are all in registers, this also solves problems being pointed out earlier. Without a wrapper, the shadow stack mmap call (from user space) will be:
>>
>> syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
>
> I admit I don’t see a show stopping technical reason we can’t add arguments to an existing syscall, but I’m pretty sure it’s unprecedented, and it doesn’t seem like a good idea.
>
There are nine existing arch_prctl calls now. If the concern is the
extra new arguments getting misused, we can mask them out for the
existing calls. Otherwise, I have not seen anything that can break.
Yu-cheng
On Thu, Aug 27, 2020 at 11:56 AM Andy Lutomirski <[email protected]> wrote:
>
>
>
> > On Aug 27, 2020, at 11:13 AM, Yu, Yu-cheng <[email protected]> wrote:
> >
> > On 8/27/2020 6:36 AM, Florian Weimer wrote:
> >> * H. J. Lu:
> >>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
> >>>>>
> >>>>> * Dave Martin:
> >>>>>
> >>>>>> You're right that this has implications: for i386, libc probably pulls
> >>>>>> more arguments off the stack than are really there in some situations.
> >>>>>> This isn't a new problem though. There are already generic prctls with
> >>>>>> fewer than 4 args that are used on x86.
> >>>>>
> >>>>> As originally posted, glibc prctl would have to know that it has to pull
> >>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> >>>>> then the u64 argument is a problem for arch_prctl as well.
> >>>>>
> >>>
> >>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
> >> The commit message and the C source say otherwise, I think (not sure
> >> about the C source, not a kernel hacker).
> >
> > H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments, and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags and size are all in registers, this also solves problems being pointed out earlier. Without a wrapper, the shadow stack mmap call (from user space) will be:
> >
> > syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
>
> I admit I don’t see a show stopping technical reason we can’t add arguments to an existing syscall, but I’m pretty sure it’s unprecedented, and it doesn’t seem like a good idea.
prctl prototype is:
extern int prctl (int __option, ...)
and implemented in kernel as:
int prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
Not all prctl operations take all 5 arguments. It also applies
to arch_prctl. It is quite normal for different operations of
arch_prctl to take different numbers of arguments.
--
H.J.
On Thu, Aug 27, 2020 at 12:38 PM H.J. Lu <[email protected]> wrote:
>
> On Thu, Aug 27, 2020 at 11:56 AM Andy Lutomirski <[email protected]> wrote:
> >
> >
> >
> > > On Aug 27, 2020, at 11:13 AM, Yu, Yu-cheng <[email protected]> wrote:
> > >
> > > On 8/27/2020 6:36 AM, Florian Weimer wrote:
> > >> * H. J. Lu:
> > >>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
> > >>>>>
> > >>>>> * Dave Martin:
> > >>>>>
> > >>>>>> You're right that this has implications: for i386, libc probably pulls
> > >>>>>> more arguments off the stack than are really there in some situations.
> > >>>>>> This isn't a new problem though. There are already generic prctls with
> > >>>>>> fewer than 4 args that are used on x86.
> > >>>>>
> > >>>>> As originally posted, glibc prctl would have to know that it has to pull
> > >>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> > >>>>> then the u64 argument is a problem for arch_prctl as well.
> > >>>>>
> > >>>
> > >>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
> > >> The commit message and the C source say otherwise, I think (not sure
> > >> about the C source, not a kernel hacker).
> > >
> > > H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments, and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags and size are all in registers, this also solves problems being pointed out earlier. Without a wrapper, the shadow stack mmap call (from user space) will be:
> > >
> > > syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
> >
> > I admit I don’t see a show stopping technical reason we can’t add arguments to an existing syscall, but I’m pretty sure it’s unprecedented, and it doesn’t seem like a good idea.
>
> prctl prototype is:
>
> extern int prctl (int __option, ...)
>
> and implemented in kernel as:
>
> int prctl(int option, unsigned long arg2, unsigned long arg3,
> unsigned long arg4, unsigned long arg5);
>
> Not all prctl operations take all 5 arguments. It also applies
> to arch_prctl. It is quite normal for different operations of
> arch_prctl to take different numbers of arguments.
If by "quite normal" you mean "does not happen", then I agree.
In any event, I will not have anything to do with a patch that changes
an existing syscall signature unless Linus personally acks it. So if
you want to email him and linux-abi, be my guest.
On Thu, Aug 27, 2020 at 6:35 PM Andy Lutomirski <[email protected]> wrote:
>
> On Thu, Aug 27, 2020 at 12:38 PM H.J. Lu <[email protected]> wrote:
> >
> > On Thu, Aug 27, 2020 at 11:56 AM Andy Lutomirski <[email protected]> wrote:
> > >
> > >
> > >
> > > > On Aug 27, 2020, at 11:13 AM, Yu, Yu-cheng <[email protected]> wrote:
> > > >
> > > > On 8/27/2020 6:36 AM, Florian Weimer wrote:
> > > >> * H. J. Lu:
> > > >>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
> > > >>>>>
> > > >>>>> * Dave Martin:
> > > >>>>>
> > > >>>>>> You're right that this has implications: for i386, libc probably pulls
> > > >>>>>> more arguments off the stack than are really there in some situations.
> > > >>>>>> This isn't a new problem though. There are already generic prctls with
> > > >>>>>> fewer than 4 args that are used on x86.
> > > >>>>>
> > > >>>>> As originally posted, glibc prctl would have to know that it has to pull
> > > >>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
> > > >>>>> then the u64 argument is a problem for arch_prctl as well.
> > > >>>>>
> > > >>>
> > > >>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
> > > >> The commit message and the C source say otherwise, I think (not sure
> > > >> about the C source, not a kernel hacker).
> > > >
> > > > H.J. Lu suggested that we fix x86 arch_prctl() to take four arguments, and then keep MMAP_SHSTK as an arch_prctl(). Because now the map flags and size are all in registers, this also solves problems being pointed out earlier. Without a wrapper, the shadow stack mmap call (from user space) will be:
> > > >
> > > > syscall(_NR_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, size, MAP_32BIT).
> > >
> > > I admit I don’t see a show stopping technical reason we can’t add arguments to an existing syscall, but I’m pretty sure it’s unprecedented, and it doesn’t seem like a good idea.
> >
> > prctl prototype is:
> >
> > extern int prctl (int __option, ...)
> >
> > and implemented in kernel as:
> >
> > int prctl(int option, unsigned long arg2, unsigned long arg3,
> > unsigned long arg4, unsigned long arg5);
> >
> > Not all prctl operations take all 5 arguments. It also applies
> > to arch_prctl. It is quite normal for different operations of
> > arch_prctl to take different numbers of arguments.
>
> If by "quite normal" you mean "does not happen", then I agree.
>
> In any event, I will not have anything to do with a patch that changes
> an existing syscall signature unless Linus personally acks it. So if
> you want to email him and linux-abi, be my guest.
Can you think of ANY issues of passing more arguments to arch_prctl?
syscall () provided by glibc always passes 6 arguments to the kernel.
Arguments are already in the registers. What kind of problems do
you see?
--
H.J.
* H. J. Lu:
> Can you think of ANY issues of passing more arguments to arch_prctl?
On x32, the glibc arch_prctl system call wrapper only passes two
arguments to the kernel, and applications have no way of detecting that.
musl only passes two arguments on all architectures. It happens to work
anyway with default compiler flags, but that's an accident.
Thanks,
Florian
On Thu, Aug 27, 2020 at 11:24 PM Florian Weimer <[email protected]> wrote:
>
> * H. J. Lu:
>
> > Can you think of ANY issues of passing more arguments to arch_prctl?
>
> On x32, the glibc arch_prctl system call wrapper only passes two
> arguments to the kernel, and applications have no way of detecting that.
> musl only passes two arguments on all architectures. It happens to work
> anyway with default compiler flags, but that's an accident.
In the current glibc, there is no arch_prctl wrapper for i386. There are
arch_prctl wrappers with 2 arguments for x86-64 and x32. But this isn't an
issue for glibc since glibc is both the provider and the user of the new
arch_prctl extension. Besides,
long syscall(long number, ...);
is always available.
--
H.J.
On Fri, Aug 28, 2020 at 4:38 AM H.J. Lu <[email protected]> wrote:
>
> On Thu, Aug 27, 2020 at 11:24 PM Florian Weimer <[email protected]> wrote:
> >
> > * H. J. Lu:
> >
> > > Can you think of ANY issues of passing more arguments to arch_prctl?
> >
> > On x32, the glibc arch_prctl system call wrapper only passes two
> > arguments to the kernel, and applications have no way of detecting that.
> > musl only passes two arguments on all architectures. It happens to work
> > anyway with default compiler flags, but that's an accident.
>
> In the current glibc, there is no arch_prctl wrapper for i386. There are
> arch_prctl wrappers with 2 arguments for x86-64 and x32. But this isn't an
> issue for glibc since glibc is both the provider and the user of the new
> arch_prctl extension. Besides,
>
> long syscall(long number, ...);
>
> is always available.
Userspace is probably full of tools and libraries that contain tables
of system calls and their signatures. Think tracing, audit, container
management, etc. I don't know how they will react to the addition of
new arguments.
On Fri, Aug 28, 2020 at 10:39 AM Andy Lutomirski <[email protected]> wrote:
>
> On Fri, Aug 28, 2020 at 4:38 AM H.J. Lu <[email protected]> wrote:
> >
> > On Thu, Aug 27, 2020 at 11:24 PM Florian Weimer <[email protected]> wrote:
> > >
> > > * H. J. Lu:
> > >
> > > > Can you think of ANY issues of passing more arguments to arch_prctl?
> > >
> > > On x32, the glibc arch_prctl system call wrapper only passes two
> > > arguments to the kernel, and applications have no way of detecting that.
> > > musl only passes two arguments on all architectures. It happens to work
> > > anyway with default compiler flags, but that's an accident.
> >
> > In the current glibc, there is no arch_prctl wrapper for i386. There are
> > arch_prctl wrappers with 2 arguments for x86-64 and x32. But this isn't an
> > issue for glibc since glibc is both the provider and the user of the new
> > arch_prctl extension. Besides,
> >
> > long syscall(long number, ...);
> >
> > is always available.
>
> Userspace is probably full of tools and libraries that contain tables
> of system calls and their signatures. Think tracing, audit, container
> management, etc. I don't know how they will react to the addition of
> new arguments.
Yes, they need to be updated to understand other new operations
added for CET.
--
H.J.
On Thu, Aug 27, 2020 at 06:26:11AM -0700, H.J. Lu wrote:
> On Wed, Aug 26, 2020 at 12:57 PM Dave Hansen <[email protected]> wrote:
> >
> > On 8/26/20 11:49 AM, Yu, Yu-cheng wrote:
> > >> I would expect things like Go and various JITs to call it directly.
> > >>
> > >> If we wanted to be fancy and add a potentially more widely useful
> > >> syscall, how about:
> > >>
> > >> mmap_special(void *addr, size_t length, int prot, int flags, int type);
> > >>
> > >> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
> > >> this is really just mmap() except that we want to map something a bit
> > >> magical, and we don't want to require opening a device node to do it.
> > >
> > > One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
> > > Does ARM have similar needs for memory mapping, Dave?
> >
> > No idea.
> >
> > But, mmap_special() is *basically* mmap2() with extra-big flags space.
> > I suspect it will grow some more uses on top of shadow stacks. It could
> > have, for instance, been used to allocate MPX bounds tables.
>
> There is no reason we can't use
>
> long arch_prctl (int, unsigned long, unsigned long, unsigned long, ..);
>
> for ARCH_X86_CET_MMAP_SHSTK. We just need to use
>
> syscall (SYS_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, ...);
For arm64 (and sparc etc.) we continue to use the regular mmap/mprotect
family of calls. One or two additional arch-specific mmap flags are
sufficient for now.
Is x86 definitely not going to fit within those calls?
For now, I can't see what arg[2] is used for (and hence the type
argument of mmap_special()), but I haven't dug through the whole series.
Cheers
---Dave
On 9/1/2020 3:28 AM, Dave Martin wrote:
> On Thu, Aug 27, 2020 at 06:26:11AM -0700, H.J. Lu wrote:
>> On Wed, Aug 26, 2020 at 12:57 PM Dave Hansen <[email protected]> wrote:
>>>
>>> On 8/26/20 11:49 AM, Yu, Yu-cheng wrote:
>>>>> I would expect things like Go and various JITs to call it directly.
>>>>>
>>>>> If we wanted to be fancy and add a potentially more widely useful
>>>>> syscall, how about:
>>>>>
>>>>> mmap_special(void *addr, size_t length, int prot, int flags, int type);
>>>>>
>>>>> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
>>>>> this is really just mmap() except that we want to map something a bit
>>>>> magical, and we don't want to require opening a device node to do it.
>>>>
>>>> One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
>>>> Does ARM have similar needs for memory mapping, Dave?
>>>
>>> No idea.
>>>
>>> But, mmap_special() is *basically* mmap2() with extra-big flags space.
>>> I suspect it will grow some more uses on top of shadow stacks. It could
>>> have, for instance, been used to allocate MPX bounds tables.
>>
>> There is no reason we can't use
>>
>> long arch_prctl (int, unsigned long, unsigned long, unsigned long, ..);
>>
>> for ARCH_X86_CET_MMAP_SHSTK. We just need to use
>>
>> syscall (SYS_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, ...);
>
>
> For arm64 (and sparc etc.) we continue to use the regular mmap/mprotect
> family of calls. One or two additional arch-specific mmap flags are
> sufficient for now.
>
> Is x86 definitely not going to fit within those calls?
That can work for x86. Andy, what if we create PROT_SHSTK, which can
been seen only from the user. Once in kernel, it is translated to
VM_SHSTK. One question for mremap/mprotect is, do we allow a normal
data area to become shadow stack?
>
> For now, I can't see what arg[2] is used for (and hence the type
> argument of mmap_special()), but I haven't dug through the whole series.
If we use the approach above, then we don't need arch_prctl changes.
Thanks,
Yu-cheng
On Tue, Sep 1, 2020 at 10:23 AM Yu, Yu-cheng <[email protected]> wrote:
>
> On 9/1/2020 3:28 AM, Dave Martin wrote:
> > On Thu, Aug 27, 2020 at 06:26:11AM -0700, H.J. Lu wrote:
> >> On Wed, Aug 26, 2020 at 12:57 PM Dave Hansen <[email protected]> wrote:
> >>>
> >>> On 8/26/20 11:49 AM, Yu, Yu-cheng wrote:
> >>>>> I would expect things like Go and various JITs to call it directly.
> >>>>>
> >>>>> If we wanted to be fancy and add a potentially more widely useful
> >>>>> syscall, how about:
> >>>>>
> >>>>> mmap_special(void *addr, size_t length, int prot, int flags, int type);
> >>>>>
> >>>>> Where type is something like MMAP_SPECIAL_X86_SHSTK. Fundamentally,
> >>>>> this is really just mmap() except that we want to map something a bit
> >>>>> magical, and we don't want to require opening a device node to do it.
> >>>>
> >>>> One benefit of MMAP_SPECIAL_* is there are more free bits than MAP_*.
> >>>> Does ARM have similar needs for memory mapping, Dave?
> >>>
> >>> No idea.
> >>>
> >>> But, mmap_special() is *basically* mmap2() with extra-big flags space.
> >>> I suspect it will grow some more uses on top of shadow stacks. It could
> >>> have, for instance, been used to allocate MPX bounds tables.
> >>
> >> There is no reason we can't use
> >>
> >> long arch_prctl (int, unsigned long, unsigned long, unsigned long, ..);
> >>
> >> for ARCH_X86_CET_MMAP_SHSTK. We just need to use
> >>
> >> syscall (SYS_arch_prctl, ARCH_X86_CET_MMAP_SHSTK, ...);
> >
> >
> > For arm64 (and sparc etc.) we continue to use the regular mmap/mprotect
> > family of calls. One or two additional arch-specific mmap flags are
> > sufficient for now.
> >
> > Is x86 definitely not going to fit within those calls?
>
> That can work for x86. Andy, what if we create PROT_SHSTK, which can
> been seen only from the user. Once in kernel, it is translated to
> VM_SHSTK. One question for mremap/mprotect is, do we allow a normal
> data area to become shadow stack?
I'm unconvinced that we want to use a somewhat precious PROT_ or VM_
bit for this. Using a flag bit makes sense if we expect anyone to
ever map an fd or similar as a shadow stack, but that seems a bit odd
in the first place. To me, it seems more logical for a shadow stack
to be a special sort of mapping with a special vm_ops, not a normal
mapping with a special flag set. Although I realize that we want
shadow stacks to work like anonymous memory with respect to fork().
Dave?
--Andy
* Yu-cheng Yu:
> Like other arch_prctl()'s, this parameter was 'unsigned long'
> earlier. The idea was, since this arch_prctl is only implemented for
> the 64-bit kernel, we wanted it to look as 64-bit only. I will change
> it back to 'unsigned long'.
What about x32? In general, long is rather problematic for x32.
Thanks,
Florian
On 8/27/2020 7:08 AM, H.J. Lu wrote:
> On Thu, Aug 27, 2020 at 7:07 AM H.J. Lu <[email protected]> wrote:
>>
>> On Thu, Aug 27, 2020 at 6:36 AM Florian Weimer <[email protected]> wrote:
>>>
>>> * H. J. Lu:
>>>
>>>> On Thu, Aug 27, 2020 at 6:19 AM Florian Weimer <[email protected]> wrote:
>>>>>
>>>>> * Dave Martin:
>>>>>
>>>>>> You're right that this has implications: for i386, libc probably pulls
>>>>>> more arguments off the stack than are really there in some situations.
>>>>>> This isn't a new problem though. There are already generic prctls with
>>>>>> fewer than 4 args that are used on x86.
>>>>>
>>>>> As originally posted, glibc prctl would have to know that it has to pull
>>>>> an u64 argument off the argument list for ARCH_X86_CET_DISABLE. But
>>>>> then the u64 argument is a problem for arch_prctl as well.
>>>>>
>>>>
>>>> Argument of ARCH_X86_CET_DISABLE is int and passed in register.
>>>
>>> The commit message and the C source say otherwise, I think (not sure
>>> about the C source, not a kernel hacker).
>>
>> It should read:
>>
>> arch_prctl(ARCH_X86_CET_DISABLE, unsigned long features)
>>
>
> Or
>
> arch_prctl(ARCH_X86_CET_DISABLE, unsigned int features)
>
Like other arch_prctl()'s, this parameter was 'unsigned long' earlier.
The idea was, since this arch_prctl is only implemented for the 64-bit
kernel, we wanted it to look as 64-bit only. I will change it back to
'unsigned long'.
Yu-cheng
On 9/1/2020 10:50 AM, Florian Weimer wrote:
> * Yu-cheng Yu:
>
>> Like other arch_prctl()'s, this parameter was 'unsigned long'
>> earlier. The idea was, since this arch_prctl is only implemented for
>> the 64-bit kernel, we wanted it to look as 64-bit only. I will change
>> it back to 'unsigned long'.
>
> What about x32? In general, long is rather problematic for x32.
The problem is the size of 'long', right?
Because this parameter is passed in a register, and only the lower bits
are used, x32 works as well.
On 9/1/20 10:45 AM, Andy Lutomirski wrote:
>>> For arm64 (and sparc etc.) we continue to use the regular mmap/mprotect
>>> family of calls. One or two additional arch-specific mmap flags are
>>> sufficient for now.
>>>
>>> Is x86 definitely not going to fit within those calls?
>> That can work for x86. Andy, what if we create PROT_SHSTK, which can
>> been seen only from the user. Once in kernel, it is translated to
>> VM_SHSTK. One question for mremap/mprotect is, do we allow a normal
>> data area to become shadow stack?
> I'm unconvinced that we want to use a somewhat precious PROT_ or VM_
> bit for this. Using a flag bit makes sense if we expect anyone to
> ever map an fd or similar as a shadow stack, but that seems a bit odd
> in the first place. To me, it seems more logical for a shadow stack
> to be a special sort of mapping with a special vm_ops, not a normal
> mapping with a special flag set. Although I realize that we want
> shadow stacks to work like anonymous memory with respect to fork().
> Dave?
I actually don't like the idea of *creating* mappings much.
I think the pkey model has worked out pretty well where we separate
creating the mapping from doing something *to* it, like changing
protections. For instance, it would be nice if we could preserve things
like using hugetlbfs or heck even doing KSM for shadow stacks.
If we're *creating* mappings, we've pretty much ruled out things like
hugetlbfs.
Something like mprotect_shstk() would allow an implementation today that
only works on anonymous memory *and* sets up a special vm_ops. But, the
same exact ABI could do wonky stuff in the future if we decided we
wanted to do shadow stacks on DAX or hugetlbfs or whatever.
I don't really like the idea of PROT_SHSTK those are plumbed into a
bunch of interfaces. But, I also can't deny that it seems to be working
fine for the arm64 folks.
* Yu-cheng Yu:
> On 9/1/2020 10:50 AM, Florian Weimer wrote:
>> * Yu-cheng Yu:
>>
>>> Like other arch_prctl()'s, this parameter was 'unsigned long'
>>> earlier. The idea was, since this arch_prctl is only implemented for
>>> the 64-bit kernel, we wanted it to look as 64-bit only. I will change
>>> it back to 'unsigned long'.
>> What about x32? In general, long is rather problematic for x32.
>
> The problem is the size of 'long', right?
> Because this parameter is passed in a register, and only the lower
> bits are used, x32 works as well.
The userspace calling convention leaves the upper 32-bit undefined.
Therefore, this only works by accident if the kernel does not check that
the upper 32-bit are zero, which is probably a kernel bug.
It's unclear to me what you are trying to accomplish. Why do you want
to use unsigned long here? The correct type appears to be unsigned int.
This correctly expresses that the upper 32 bits of the register do not
matter.
Thanks,
Florian
On Tue, Sep 1, 2020 at 11:17 AM Florian Weimer <[email protected]> wrote:
>
> * Yu-cheng Yu:
>
> > On 9/1/2020 10:50 AM, Florian Weimer wrote:
> >> * Yu-cheng Yu:
> >>
> >>> Like other arch_prctl()'s, this parameter was 'unsigned long'
> >>> earlier. The idea was, since this arch_prctl is only implemented for
> >>> the 64-bit kernel, we wanted it to look as 64-bit only. I will change
> >>> it back to 'unsigned long'.
> >> What about x32? In general, long is rather problematic for x32.
> >
> > The problem is the size of 'long', right?
> > Because this parameter is passed in a register, and only the lower
> > bits are used, x32 works as well.
>
> The userspace calling convention leaves the upper 32-bit undefined.
> Therefore, this only works by accident if the kernel does not check that
> the upper 32-bit are zero, which is probably a kernel bug.
>
> It's unclear to me what you are trying to accomplish. Why do you want
> to use unsigned long here? The correct type appears to be unsigned int.
> This correctly expresses that the upper 32 bits of the register do not
> matter.
>
unsigned int is the correct type since only the lower 32 bits are used.
--
H.J.
On 9/1/2020 11:17 AM, Florian Weimer wrote:
> * Yu-cheng Yu:
>
>> On 9/1/2020 10:50 AM, Florian Weimer wrote:
>>> * Yu-cheng Yu:
>>>
>>>> Like other arch_prctl()'s, this parameter was 'unsigned long'
>>>> earlier. The idea was, since this arch_prctl is only implemented for
>>>> the 64-bit kernel, we wanted it to look as 64-bit only. I will change
>>>> it back to 'unsigned long'.
>>> What about x32? In general, long is rather problematic for x32.
>>
>> The problem is the size of 'long', right?
>> Because this parameter is passed in a register, and only the lower
>> bits are used, x32 works as well.
>
> The userspace calling convention leaves the upper 32-bit undefined.
> Therefore, this only works by accident if the kernel does not check that
> the upper 32-bit are zero, which is probably a kernel bug.
>
> It's unclear to me what you are trying to accomplish. Why do you want
> to use unsigned long here? The correct type appears to be unsigned int.
> This correctly expresses that the upper 32 bits of the register do not
> matter.
Yes, you are right. I will make it 'unsigned int'.
Thanks,
Yu-cheng
On Tue, Sep 01, 2020 at 11:11:37AM -0700, Dave Hansen wrote:
> On 9/1/20 10:45 AM, Andy Lutomirski wrote:
> >>> For arm64 (and sparc etc.) we continue to use the regular mmap/mprotect
> >>> family of calls. One or two additional arch-specific mmap flags are
> >>> sufficient for now.
> >>>
> >>> Is x86 definitely not going to fit within those calls?
> >> That can work for x86. Andy, what if we create PROT_SHSTK, which can
> >> been seen only from the user. Once in kernel, it is translated to
> >> VM_SHSTK. One question for mremap/mprotect is, do we allow a normal
> >> data area to become shadow stack?
> > I'm unconvinced that we want to use a somewhat precious PROT_ or VM_
> > bit for this. Using a flag bit makes sense if we expect anyone to
> > ever map an fd or similar as a shadow stack, but that seems a bit odd
> > in the first place. To me, it seems more logical for a shadow stack
> > to be a special sort of mapping with a special vm_ops, not a normal
> > mapping with a special flag set. Although I realize that we want
> > shadow stacks to work like anonymous memory with respect to fork().
> > Dave?
>
> I actually don't like the idea of *creating* mappings much.
>
> I think the pkey model has worked out pretty well where we separate
> creating the mapping from doing something *to* it, like changing
> protections. For instance, it would be nice if we could preserve things
> like using hugetlbfs or heck even doing KSM for shadow stacks.
>
> If we're *creating* mappings, we've pretty much ruled out things like
> hugetlbfs.
>
> Something like mprotect_shstk() would allow an implementation today that
> only works on anonymous memory *and* sets up a special vm_ops. But, the
> same exact ABI could do wonky stuff in the future if we decided we
> wanted to do shadow stacks on DAX or hugetlbfs or whatever.
>
> I don't really like the idea of PROT_SHSTK those are plumbed into a
> bunch of interfaces. But, I also can't deny that it seems to be working
> fine for the arm64 folks.
Note, there are some rough edges, such as what happens when someone
calls mprotect() on memory marked with PROT_BTI. Unless the caller
knows whether PROT_BTI should be set for that page, the flag may get
unintentionally cleared. Since the flag only applies to text pages
though, it's not _that_ much of a concern. Software that deals with
writable text pages is also usually involved in generating the code and
so will know about PROT_BTI. That's was the theory anyway.
In the longer term, it might be preferable to have a mprotect2() that
can leave some flags unmodified, and that doesn't silently ignore
unknown flags (at least one of mmap or mprotect does; I don't recall
which). We attempt didn't go this far, for now.
For arm64 it seemed fairly natural for the BTI flag to be a PROT_ flag,
but I don't know enough detail about x86 shstk to know whether it's a
natural fit there.
Cheers
---Dave
On 9/8/20 10:50 AM, Yu, Yu-cheng wrote:
> What about this:
>
> - Do not add any new syscall or arch_prctl for creating a new shadow stack.
>
> - Add a new arch_prctl that can turn an anonymous mapping to a shadow
> stack mapping.
>
> This allows the application to do whatever is necessary. It can even
> allow GDB or JIT code to create or fix a call stack.
Fine with me. But, it's going to effectively be
arch_prctl(PR_CONVERT_TO_SHS..., addr, len);
when it could just as easily be:
madvise(addr, len, MADV_SHSTK...);
Or a new syscall. The only question in my mind is whether we want to do
something generic that we can use for other similar things in the
future, like:
madvise2(addr, len, flags, MADV2_SHSTK...);
I don't really feel strongly about it, though. Could you please share
your logic on why you want a prctl() as opposed to a whole new syscall?
On 9/8/2020 10:57 AM, Dave Hansen wrote:
> On 9/8/20 10:50 AM, Yu, Yu-cheng wrote:
>> What about this:
>>
>> - Do not add any new syscall or arch_prctl for creating a new shadow stack.
>>
>> - Add a new arch_prctl that can turn an anonymous mapping to a shadow
>> stack mapping.
>>
>> This allows the application to do whatever is necessary. It can even
>> allow GDB or JIT code to create or fix a call stack.
>
> Fine with me. But, it's going to effectively be
>
> arch_prctl(PR_CONVERT_TO_SHS..., addr, len);
>
> when it could just as easily be:
>
> madvise(addr, len, MADV_SHSTK...);
>
> Or a new syscall. The only question in my mind is whether we want to do
> something generic that we can use for other similar things in the
> future, like:
>
> madvise2(addr, len, flags, MADV2_SHSTK...);
>
> I don't really feel strongly about it, though. Could you please share
> your logic on why you want a prctl() as opposed to a whole new syscall?
>
A new syscall is more intrusive, I think. When creating a new shadow
stack, the kernel also installs a restore token on the top of the new
shadow stack, and it is somewhat x86-specific. So far no other arch's
need this.
Yes, madvise is better if the kernel only needs to change the mapping.
The application itself can create the restore token before calling
madvise().
On 9/8/2020 11:25 AM, Yu, Yu-cheng wrote:
> On 9/8/2020 10:57 AM, Dave Hansen wrote:
>> On 9/8/20 10:50 AM, Yu, Yu-cheng wrote:
>>> What about this:
>>>
>>> - Do not add any new syscall or arch_prctl for creating a new shadow
>>> stack.
>>>
>>> - Add a new arch_prctl that can turn an anonymous mapping to a shadow
>>> stack mapping.
>>>
>>> This allows the application to do whatever is necessary. It can even
>>> allow GDB or JIT code to create or fix a call stack.
>>
>> Fine with me. But, it's going to effectively be
>>
>> arch_prctl(PR_CONVERT_TO_SHS..., addr, len);
>>
>> when it could just as easily be:
>>
>> madvise(addr, len, MADV_SHSTK...);
>>
>> Or a new syscall. The only question in my mind is whether we want to do
>> something generic that we can use for other similar things in the
>> future, like:
>>
>> madvise2(addr, len, flags, MADV2_SHSTK...);
>>
>> I don't really feel strongly about it, though. Could you please share
>> your logic on why you want a prctl() as opposed to a whole new syscall?
>>
>
> A new syscall is more intrusive, I think. When creating a new shadow
> stack, the kernel also installs a restore token on the top of the new
> shadow stack, and it is somewhat x86-specific. So far no other arch's
> need this.
>
> Yes, madvise is better if the kernel only needs to change the mapping.
> The application itself can create the restore token before calling
> madvise().
After looking at this more, I found the changes are more similar to
mprotect() than madvise(). We are going to change an anonymous mapping
to a read-only mapping, and add the VM_SHSTK flag to it. Would an
x86-specific mprotect(PROT_SHSTK) make more sense?
One alternative would be requiring a read-only mapping for
madvise(MADV_SHSTK). But that is inconvenient for the application.
Yu-cheng
On 9/9/2020 4:11 PM, Dave Hansen wrote:
> On 9/9/20 4:07 PM, Yu, Yu-cheng wrote:
>> What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
>> that be rejected?
>
> It doesn't matter to me. Even if it's readable, it _stops_ being even
> directly readable after it's a shadow stack, right? I don't think
> writes are special in any way. If anything, we *want* it to be writable
> because that indicates that it can be written to, and we will want to
> write to it soon.
>
But in a PROT_WRITE mapping, all the pte's have _PAGE_BIT_RW set. To
change them to shadow stack, we need to clear that bit from the pte's.
That will be like mprotect_fixup()/change_protection_range().
On 9/9/20 3:08 PM, Yu, Yu-cheng wrote:
> After looking at this more, I found the changes are more similar to
> mprotect() than madvise(). We are going to change an anonymous mapping
> to a read-only mapping, and add the VM_SHSTK flag to it. Would an
> x86-specific mprotect(PROT_SHSTK) make more sense?
>
> One alternative would be requiring a read-only mapping for
> madvise(MADV_SHSTK). But that is inconvenient for the application.
Why? It's just:
mmap()/malloc();
mprotect(PROT_READ);
madvise(MADV_SHSTK);
vs.
mmap()/malloc();
mprotect(PROT_SHSTK);
I'm not sure a single syscall counts as inconvenient.
I don't quite think we should use a PROT_ bit for this. It seems like
the kind of thing that could be fragile and break existing expectations.
I don't care _that_ strongly though.
On 9/9/2020 3:59 PM, Dave Hansen wrote:
> On 9/9/20 3:08 PM, Yu, Yu-cheng wrote:
>> After looking at this more, I found the changes are more similar to
>> mprotect() than madvise(). We are going to change an anonymous mapping
>> to a read-only mapping, and add the VM_SHSTK flag to it. Would an
>> x86-specific mprotect(PROT_SHSTK) make more sense?
>>
>> One alternative would be requiring a read-only mapping for
>> madvise(MADV_SHSTK). But that is inconvenient for the application.
>
> Why? It's just:
>
> mmap()/malloc();
> mprotect(PROT_READ);
> madvise(MADV_SHSTK);
>
> vs.
>
> mmap()/malloc();
> mprotect(PROT_SHSTK);
>
> I'm not sure a single syscall counts as inconvenient.
>
> I don't quite think we should use a PROT_ bit for this. It seems like
> the kind of thing that could be fragile and break existing expectations.
> I don't care _that_ strongly though.
>
What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
that be rejected?
On 9/9/20 4:07 PM, Yu, Yu-cheng wrote:
> What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
> that be rejected?
It doesn't matter to me. Even if it's readable, it _stops_ being even
directly readable after it's a shadow stack, right? I don't think
writes are special in any way. If anything, we *want* it to be writable
because that indicates that it can be written to, and we will want to
write to it soon.
On 9/9/20 4:25 PM, Yu, Yu-cheng wrote:
> On 9/9/2020 4:11 PM, Dave Hansen wrote:
>> On 9/9/20 4:07 PM, Yu, Yu-cheng wrote:
>>> What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
>>> that be rejected?
>>
>> It doesn't matter to me. Even if it's readable, it _stops_ being even
>> directly readable after it's a shadow stack, right? I don't think
>> writes are special in any way. If anything, we *want* it to be writable
>> because that indicates that it can be written to, and we will want to
>> write to it soon.
>>
> But in a PROT_WRITE mapping, all the pte's have _PAGE_BIT_RW set. To
> change them to shadow stack, we need to clear that bit from the pte's.
> That will be like mprotect_fixup()/change_protection_range().
The page table hardware bits don't matter. The user-visible protection
effects matter.
For instance, we have PROT_EXEC, which *CLEARS* a hardware NX PTE bit.
The PROT_ permissions are independent of the hardware.
I don't think the interface should be influenced at *all* by what whacko
PTE bit combinations we have to set to get the behavior.
On 9/9/2020 4:29 PM, Dave Hansen wrote:
> On 9/9/20 4:25 PM, Yu, Yu-cheng wrote:
>> On 9/9/2020 4:11 PM, Dave Hansen wrote:
>>> On 9/9/20 4:07 PM, Yu, Yu-cheng wrote:
>>>> What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
>>>> that be rejected?
>>>
>>> It doesn't matter to me. Even if it's readable, it _stops_ being even
>>> directly readable after it's a shadow stack, right? I don't think
>>> writes are special in any way. If anything, we *want* it to be writable
>>> because that indicates that it can be written to, and we will want to
>>> write to it soon.
>>>
>> But in a PROT_WRITE mapping, all the pte's have _PAGE_BIT_RW set. To
>> change them to shadow stack, we need to clear that bit from the pte's.
>> That will be like mprotect_fixup()/change_protection_range().
>
> The page table hardware bits don't matter. The user-visible protection
> effects matter.
>
> For instance, we have PROT_EXEC, which *CLEARS* a hardware NX PTE bit.
> The PROT_ permissions are independent of the hardware.
Same for shadow stack here. We consider shadow stack "writable", but we
want to clear _PAGE_BIT_RW, which is set for the PTEs of the other type
of "writable" mapping. To change a writable data mapping to a writable
shadow stack mapping, we need to call change_protection_range().
>
> I don't think the interface should be influenced at *all* by what whacko
> PTE bit combinations we have to set to get the behavior.
>
On Wed, 2020-09-09 at 16:29 -0700, Dave Hansen wrote:
> On 9/9/20 4:25 PM, Yu, Yu-cheng wrote:
> > On 9/9/2020 4:11 PM, Dave Hansen wrote:
> > > On 9/9/20 4:07 PM, Yu, Yu-cheng wrote:
> > > > What if a writable mapping is passed to madvise(MADV_SHSTK)? Should
> > > > that be rejected?
> > >
> > > It doesn't matter to me. Even if it's readable, it _stops_ being even
> > > directly readable after it's a shadow stack, right? I don't think
> > > writes are special in any way. If anything, we *want* it to be writable
> > > because that indicates that it can be written to, and we will want to
> > > write to it soon.
> > >
> > But in a PROT_WRITE mapping, all the pte's have _PAGE_BIT_RW set. To
> > change them to shadow stack, we need to clear that bit from the pte's.
> > That will be like mprotect_fixup()/change_protection_range().
>
> The page table hardware bits don't matter. The user-visible protection
> effects matter.
>
> For instance, we have PROT_EXEC, which *CLEARS* a hardware NX PTE bit.
> The PROT_ permissions are independent of the hardware.
>
> I don't think the interface should be influenced at *all* by what whacko
> PTE bit combinations we have to set to get the behavior.
Here are the changes if we take the mprotect(PROT_SHSTK) approach.
Any comments/suggestions?
---
arch/x86/include/uapi/asm/mman.h | 26 +++++++++++++++++++++++++-
mm/mprotect.c | 11 +++++++++++
2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index d4a8d0424bfb..024f006fcfe8 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -4,6 +4,8 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+#define PROT_SHSTK 0x10 /* shadow stack pages */
+
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/*
* Take the 4 protection key bits out of the vma->vm_flags
@@ -19,13 +21,35 @@
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
-#define arch_calc_vm_prot_bits(prot, key) ( \
+#define pkey_vm_prot_bits(prot, key) ( \
((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#else
+#define pkey_vm_prot_bits(prot, key)
#endif
+#define shstk_vm_prot_bits(prot) ( \
+ (static_cpu_has(X86_FEATURE_SHSTK) && (prot & PROT_SHSTK)) ? \
+ VM_SHSTK : 0)
+
+#define arch_calc_vm_prot_bits(prot, key) \
+ (pkey_vm_prot_bits(prot, key) | shstk_vm_prot_bits(prot))
+
#include <asm-generic/mman.h>
+static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
+{
+ unsigned long supported = PROT_READ | PROT_EXEC | PROT_SEM;
+
+ if (static_cpu_has(X86_FEATURE_SHSTK) && (prot & PROT_SHSTK))
+ supported |= PROT_SHSTK;
+ else
+ supported |= PROT_WRITE;
+
+ return (prot & ~supported) == 0;
+}
+#define arch_validate_prot arch_validate_prot
+
#endif /* _ASM_X86_MMAN_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a8edbcb3af99..520bd8caa005 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -571,6 +571,17 @@ static int do_mprotect_pkey(unsigned long start, size_t
len,
goto out;
}
}
+
+ /*
+ * Only anonymous mapping is suitable for shadow stack.
+ */
+ if (prot & PROT_SHSTK) {
+ if (vma->vm_file) {
+ error = -EINVAL;
+ goto out;
+ }
+ }
+
if (start > vma->vm_start)
prev = vma;
--
On 9/11/20 3:59 PM, Yu-cheng Yu wrote:
...
> Here are the changes if we take the mprotect(PROT_SHSTK) approach.
> Any comments/suggestions?
I still don't like it. :)
I'll also be much happier when there's a proper changelog to accompany
this which also spells out the alternatives any why they suck so much.
> diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
> index d4a8d0424bfb..024f006fcfe8 100644
> --- a/arch/x86/include/uapi/asm/mman.h
> +++ b/arch/x86/include/uapi/asm/mman.h
> @@ -4,6 +4,8 @@
>
> #define MAP_32BIT 0x40 /* only give out 32bit addresses */
>
> +#define PROT_SHSTK 0x10 /* shadow stack pages */
> +
> #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
> /*
> * Take the 4 protection key bits out of the vma->vm_flags
> @@ -19,13 +21,35 @@
> ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
> ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
>
> -#define arch_calc_vm_prot_bits(prot, key) ( \
> +#define pkey_vm_prot_bits(prot, key) ( \
> ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
> ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
> ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
> ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
> +#else
> +#define pkey_vm_prot_bits(prot, key)
> #endif
My inner compiler doesn't think this will compile:
( | shstk_vm_prot_bits(prot))
> +#define shstk_vm_prot_bits(prot) ( \
> + (static_cpu_has(X86_FEATURE_SHSTK) && (prot & PROT_SHSTK)) ? \
> + VM_SHSTK : 0)
Why do you need to filter PROT_SHSTK twice. Won't the prot passed in
here be filtered by arch_validate_prot()?
> +#define arch_calc_vm_prot_bits(prot, key) \
> + (pkey_vm_prot_bits(prot, key) | shstk_vm_prot_bits(prot))
> +
IMNHO, this is eminently more readable if you do:
#define arch_calc_vm_prot_bits(prot, key) \
(shstk_vm_prot_bits(prot)) \
pkey_vm_prot_bits(prot, key))
BTW, can these be static inlines? I forget if I had a good reason for
making them #defines.
> +static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
> +{
> + unsigned long supported = PROT_READ | PROT_EXEC | PROT_SEM;
> +
> + if (static_cpu_has(X86_FEATURE_SHSTK) && (prot & PROT_SHSTK))
> + supported |= PROT_SHSTK;
> + else
> + supported |= PROT_WRITE;
I generally like to make the common case dirt simple to understand.
That would probably be:
unsigned long supported = PROT_READ | PROT_WRITE |
PROT_EXEC | PROT_SEM;
if (static_cpu_has(X86_FEATURE_SHSTK) && (prot & PROT_SHSTK)) {
supported |= PROT_SHSTK;
// Comment about why SHSTK and WRITE
// are mutually exclusive.
supported &= ~PROT_WRITE;
}
> #endif /* _ASM_X86_MMAN_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index a8edbcb3af99..520bd8caa005 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -571,6 +571,17 @@ static int do_mprotect_pkey(unsigned long start, size_t
> len,
> goto out;
> }
> }
> +
> + /*
> + * Only anonymous mapping is suitable for shadow stack.
> + */
Why?
> + if (prot & PROT_SHSTK) {
> + if (vma->vm_file) {
> + error = -EINVAL;
> + goto out;
> + }
> + }
You can also save a couple of lines there. The two conditions are
pretty small.
> On Sep 14, 2020, at 7:50 AM, Dave Hansen <[email protected]> wrote:
>
> On 9/11/20 3:59 PM, Yu-cheng Yu wrote:
> ...
>> Here are the changes if we take the mprotect(PROT_SHSTK) approach.
>> Any comments/suggestions?
>
> I still don't like it. :)
>
> I'll also be much happier when there's a proper changelog to accompany
> this which also spells out the alternatives any why they suck so much.
>
Let’s take a step back here. Ignoring the precise API, what exactly is
a shadow stack from the perspective of a Linux user program?
The simplest answer is that it’s just memory that happens to have
certain protections. This enables all kinds of shenanigans. A
program could map a memfd twice, once as shadow stack and once as
non-shadow-stack, and change its control flow. Similarly, a program
could mprotect its shadow stack, modify it, and mprotect it back. In
some threat models, though could be seen as a WRSS bypass. (Although
if an attacker can coerce a process to call mprotect(), the game is
likely mostly over anyway.)
But we could be more restrictive, or perhaps we could allow user code
to opt into more restrictions. For example, we could have shadow
stacks be special memory that cannot be written from usermode by any
means other than ptrace() and friends, WRSS, and actual shadow stack
usage.
What is the goal?
No matter what we do, the effects of calling vfork() are going to be a
bit odd with SHSTK enabled. I suppose we could disallow this, but
that seems likely to cause its own issues.
On 9/14/2020 11:31 AM, Andy Lutomirski wrote:
>> On Sep 14, 2020, at 7:50 AM, Dave Hansen <[email protected]> wrote:
>>
>> On 9/11/20 3:59 PM, Yu-cheng Yu wrote:
>> ...
>>> Here are the changes if we take the mprotect(PROT_SHSTK) approach.
>>> Any comments/suggestions?
>>
>> I still don't like it. :)
>>
>> I'll also be much happier when there's a proper changelog to accompany
>> this which also spells out the alternatives any why they suck so much.
>>
>
> Let’s take a step back here. Ignoring the precise API, what exactly is
> a shadow stack from the perspective of a Linux user program?
>
> The simplest answer is that it’s just memory that happens to have
> certain protections. This enables all kinds of shenanigans. A
> program could map a memfd twice, once as shadow stack and once as
> non-shadow-stack, and change its control flow. Similarly, a program
> could mprotect its shadow stack, modify it, and mprotect it back. In
What if we do the following:
- If the mapping has VM_SHARED, it cannot be turned to shadow stack.
Shadow stack cannot be shared anyway.
- Only allow an anonymous mapping to be converted to shadow stack, but
not the other way.
> some threat models, though could be seen as a WRSS bypass. (Although
> if an attacker can coerce a process to call mprotect(), the game is
> likely mostly over anyway.)
>
> But we could be more restrictive, or perhaps we could allow user code
> to opt into more restrictions. For example, we could have shadow
> stacks be special memory that cannot be written from usermode by any
> means other than ptrace() and friends, WRSS, and actual shadow stack
> usage.
>
> What is the goal?
There primary goal is to allocate/mmap a shadow stack from user space.
>
> No matter what we do, the effects of calling vfork() are going to be a
> bit odd with SHSTK enabled. I suppose we could disallow this, but
> that seems likely to cause its own issues.
>
Do you mean vfork() has issues with call/return? That is taken care of
in GLIBC. Or do you mean it has issues with mprotect(PROT_SHSTK)?
On 9/14/20 11:31 AM, Andy Lutomirski wrote:
> No matter what we do, the effects of calling vfork() are going to be a
> bit odd with SHSTK enabled. I suppose we could disallow this, but
> that seems likely to cause its own issues.
What's odd about it? If you're a vfork()'d child, you can't touch the
stack at all, right? If you do, you or your parent will probably die a
horrible death.
The extra shadow stacks sanity checks means we'll probably see shadow
stack exceptions instead of the slightly more chaotic death without them.
On 9/16/2020 6:52 AM, Andy Lutomirski wrote:
> On Mon, Sep 14, 2020 at 2:14 PM Dave Hansen <[email protected]> wrote:
>>
>> On 9/14/20 11:31 AM, Andy Lutomirski wrote:
>>> No matter what we do, the effects of calling vfork() are going to be a
>>> bit odd with SHSTK enabled. I suppose we could disallow this, but
>>> that seems likely to cause its own issues.
>>
>> What's odd about it? If you're a vfork()'d child, you can't touch the
>> stack at all, right? If you do, you or your parent will probably die a
>> horrible death.
>>
>
> An evil program could vfork(), have the child do a bunch of returns
> and a bunch of calls, and exit. The net effect would be to change the
> parent's shadow stack contents. In a sufficiently strict model, this
> is potentially problematic.
When a vfork child returns, its parent's shadow stack pointer is where
it was before the child starts. To move the shadow stack pointer and
re-use the content left by the child, the parent needs to use CALL, RET,
INCSSP, or RSTORSSP. This seems to be difficult.
>
> The question is: how much do we want to protect userspace from itself?
>
If any issue comes up, people can always find ways to counter it.
> --Andy
>
On Mon, Sep 14, 2020 at 2:14 PM Dave Hansen <[email protected]> wrote:
>
> On 9/14/20 11:31 AM, Andy Lutomirski wrote:
> > No matter what we do, the effects of calling vfork() are going to be a
> > bit odd with SHSTK enabled. I suppose we could disallow this, but
> > that seems likely to cause its own issues.
>
> What's odd about it? If you're a vfork()'d child, you can't touch the
> stack at all, right? If you do, you or your parent will probably die a
> horrible death.
>
An evil program could vfork(), have the child do a bunch of returns
and a bunch of calls, and exit. The net effect would be to change the
parent's shadow stack contents. In a sufficiently strict model, this
is potentially problematic.
The question is: how much do we want to protect userspace from itself?
--Andy
On Mon, 2020-09-14 at 11:31 -0700, Andy Lutomirski wrote:
> > On Sep 14, 2020, at 7:50 AM, Dave Hansen <[email protected]>
> > wrote:
> >
> > On 9/11/20 3:59 PM, Yu-cheng Yu wrote:
> > ...
> > > Here are the changes if we take the mprotect(PROT_SHSTK)
> > > approach.
> > > Any comments/suggestions?
> >
> > I still don't like it. :)
> >
> > I'll also be much happier when there's a proper changelog to
> > accompany
> > this which also spells out the alternatives any why they suck so
> > much.
> >
>
> Let’s take a step back here. Ignoring the precise API, what exactly
> is
> a shadow stack from the perspective of a Linux user program?
>
> The simplest answer is that it’s just memory that happens to have
> certain protections. This enables all kinds of shenanigans. A
> program could map a memfd twice, once as shadow stack and once as
> non-shadow-stack, and change its control flow. Similarly, a program
> could mprotect its shadow stack, modify it, and mprotect it back. In
> some threat models, though could be seen as a WRSS bypass. (Although
> if an attacker can coerce a process to call mprotect(), the game is
> likely mostly over anyway.)
>
> But we could be more restrictive, or perhaps we could allow user code
> to opt into more restrictions. For example, we could have shadow
> stacks be special memory that cannot be written from usermode by any
> means other than ptrace() and friends, WRSS, and actual shadow stack
> usage.
>
> What is the goal?
>
> No matter what we do, the effects of calling vfork() are going to be
> a
> bit odd with SHSTK enabled. I suppose we could disallow this, but
> that seems likely to cause its own issues.
Hi,
Resurrecting this old thread to highlight a consequence of the design
change that came out of it. I am going to be taking over this series
from Yu-cheng, and wanted to check if people would be interested in re-
visiting this interface.
The consequence I wanted to highlight, is that making userspace be
responsible for mapping memory as shadow stack, also requires moving
the writing of the restore token to userspace for glibc ucontext
operations. Since these operations involve creating/pivoting to new
stacks in userspace, ucontext cet support involves also creating a new
shadow stack. For normal thread stacks, the kernel has always done the
shadow stack allocation and so it is never writable (in the normal
sense) from userspace. But after this change makecontext() now first
has to mmap() writable memory, then write the restore token, then
mprotect() it as shadow stack. See the glibc changes to support
PROT_SHADOW_STACK here[0].
The writable window leaves an opening for an attacker to create an
arbitrary shadow stack that could be pivoted to later by tweaking the
ucontext_t structure. To try to see how much this matters, we have done
a small test that uses this window to ROP from writes in another
thread during the makecontext()/setcontext() window. (offensive work
credit to Joao on CC). This would require a real app to already to be
using ucontext in the course of normal runtime.
The original prctl solution prevents this case since the kernel did the
allocation and restore token setup, but of course it had other issues.
The other ideas discussed previously were a new syscall, or some sort
of new madvise() operation that could be involved in setting up shadow
stack, such that it is never writable in userspace. Or, simpler but
uglier, tweaking the existing PROT_SHADOW_STACK based solution by
making core mmap code write a restore token.
Those are still probably not enough to stop all ROP given extreme
threat models, as Andy alluded. But to me, this one seemed maybe worth
trying to improve. Especially thinking to avoid future ABI changes if
it becomes a problem. Would anyone like to see attempts to tighten this
up by revisiting this shadow stack allocation interface?
Thanks,
Rick
[0]
https://gitlab.com/x86-glibc/glibc/-/commit/98eba0b829a1884c7d7bf76b0b39725919a9b6af#92c86763df6845c5b29664ac2ff40277678fd2c5_0_1
On Tue, Sep 14, 2021 at 01:33:02AM +0000, Edgecombe, Rick P wrote:
> The original prctl solution prevents this case since the kernel did the
> allocation and restore token setup, but of course it had other issues.
> The other ideas discussed previously were a new syscall, or some sort
> of new madvise() operation that could be involved in setting up shadow
> stack, such that it is never writable in userspace.
If I had to choose - and this is only my 2¢ anyway - I'd opt for this
until there's a really good reason for allowing shstk programs to fiddle
with their own shstk. Maybe there is but allowing them to do that sounds
to me like: "ew, why do we go to all this trouble to have shadow stacks
if programs would be allowed to fumble with it themselves? Might as well
not do shadow stacks at all."
And if/when there is a good reason, the API should be defined and
discussed properly at first, before we expose it to luserspace, ofc.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, Sep 13, 2021, at 6:33 PM, Edgecombe, Rick P wrote:
> On Mon, 2020-09-14 at 11:31 -0700, Andy Lutomirski wrote:
> > > On Sep 14, 2020, at 7:50 AM, Dave Hansen <[email protected]>
> > > wrote:
> > >
> > > On 9/11/20 3:59 PM, Yu-cheng Yu wrote:
> > > ...
> > > > Here are the changes if we take the mprotect(PROT_SHSTK)
> > > > approach.
> > > > Any comments/suggestions?
> > >
> > > I still don't like it. :)
> > >
> > > I'll also be much happier when there's a proper changelog to
> > > accompany
> > > this which also spells out the alternatives any why they suck so
> > > much.
> > >
> >
> > Let’s take a step back here. Ignoring the precise API, what exactly
> > is
> > a shadow stack from the perspective of a Linux user program?
> >
> > The simplest answer is that it’s just memory that happens to have
> > certain protections. This enables all kinds of shenanigans. A
> > program could map a memfd twice, once as shadow stack and once as
> > non-shadow-stack, and change its control flow. Similarly, a program
> > could mprotect its shadow stack, modify it, and mprotect it back. In
> > some threat models, though could be seen as a WRSS bypass. (Although
> > if an attacker can coerce a process to call mprotect(), the game is
> > likely mostly over anyway.)
> >
> > But we could be more restrictive, or perhaps we could allow user code
> > to opt into more restrictions. For example, we could have shadow
> > stacks be special memory that cannot be written from usermode by any
> > means other than ptrace() and friends, WRSS, and actual shadow stack
> > usage.
> >
> > What is the goal?
> >
> > No matter what we do, the effects of calling vfork() are going to be
> > a
> > bit odd with SHSTK enabled. I suppose we could disallow this, but
> > that seems likely to cause its own issues.
>
> Hi,
>
> Resurrecting this old thread to highlight a consequence of the design
> change that came out of it. I am going to be taking over this series
> from Yu-cheng, and wanted to check if people would be interested in re-
> visiting this interface.
>
> The consequence I wanted to highlight, is that making userspace be
> responsible for mapping memory as shadow stack, also requires moving
> the writing of the restore token to userspace for glibc ucontext
> operations. Since these operations involve creating/pivoting to new
> stacks in userspace, ucontext cet support involves also creating a new
> shadow stack. For normal thread stacks, the kernel has always done the
> shadow stack allocation and so it is never writable (in the normal
> sense) from userspace. But after this change makecontext() now first
> has to mmap() writable memory, then write the restore token, then
> mprotect() it as shadow stack. See the glibc changes to support
> PROT_SHADOW_STACK here[0].
>
> The writable window leaves an opening for an attacker to create an
> arbitrary shadow stack that could be pivoted to later by tweaking the
> ucontext_t structure. To try to see how much this matters, we have done
> a small test that uses this window to ROP from writes in another
> thread during the makecontext()/setcontext() window. (offensive work
> credit to Joao on CC). This would require a real app to already to be
> using ucontext in the course of normal runtime.
My general opinion here (take this with a grain of salt -- I haven't paged back in every single detail) is that the kernel should make it straightforward for a libc to do the right thing without nasty races, cross-thread coordination, or unnecessary permission to write to the stack. I *also* think that it should be possible for userspace to manage its own shadow stack allocation if it wants to, since I'm sure there will be JIT or green thread or other use cases that want to do crazy things that we fail to anticipate with in-kernel magic.
So perhaps we should keep the explicit allocation and free operations, have a way to opt-in to WRSS being flipped on, but also do our best to have API that handle the known cases well.
Does that make sense? Can we have both approaches work in the same kernel?
On Mon, 2021-09-20 at 09:48 -0700, Andy Lutomirski wrote:
> My general opinion here (take this with a grain of salt -- I haven't
> paged back in every single detail) is that the kernel should make it
> straightforward for a libc to do the right thing without nasty races,
> cross-thread coordination, or unnecessary permission to write to the
> stack. I *also* think that it should be possible for userspace to
> manage its own shadow stack allocation if it wants to, since I'm sure
> there will be JIT or green thread or other use cases that want to do
> crazy things that we fail to anticipate with in-kernel magic.
>
> So perhaps we should keep the explicit allocation and free
> operations, have a way to opt-in to WRSS being flipped on, but also
> do our best to have API that handle the known cases well.
>
> Does that make sense? Can we have both approaches work in the same
> kernel?
I think so. I'll take a look at adding a prctl to enable WRSS. Since
there already is ARCH_X86_CET_DISABLE to disable CET, it doesn't seem
like it should escalate anything. And ARCH_X86_CET_LOCK can prevent
turning it on if desired.
Thanks,
Rick