From: Ira Weiny <[email protected]>
When kernel code needs access to a PKS protected page it will need to
change the protections for the pkey to Read/Write.
Define pks_set_readwrite() to update the specified pkey. Define
pks_update_protection() as a helper to do the heavy lifting and allow
for subsequent pks_set_*() calls.
Define PKEY_READ_WRITE rather than use a magic value of '0' in
pks_update_protection().
Finally, ensure preemption is disabled for pks_write_pkrs() because the
context of this call cannot generally be predicted.
pks.h is created to avoid conflicts and header dependencies with the
user space pkey code.
Add documentation.
Signed-off-by: Ira Weiny <[email protected]>
---
changes for v9
Move MSR documentation note to this patch
move declarations to include/linux/pks.h
from rick edgecombe
change pkey type to u8
validate pkey range in pks_update_protection
from 0day
fix documentation link
from dave hansen
s/pks_mk_*/pks_set_*/
use pkey
s/pks_saved_pkrs/pkrs/
changes for v8
define pkey_read_write
make the call inline
clean up the names
use pks_write_pkrs() with preemption disabled
split this out from 'add pks kernel api'
include documentation in this patch
---
Documentation/core-api/protection-keys.rst | 15 +++++++++++
arch/x86/mm/pkeys.c | 31 ++++++++++++++++++++++
include/linux/pks.h | 31 ++++++++++++++++++++++
include/uapi/asm-generic/mman-common.h | 1 +
4 files changed, 78 insertions(+)
create mode 100644 include/linux/pks.h
diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index fe63acf5abbe..3af92e1cbffd 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -142,3 +142,18 @@ Adding pages to a pkey protected domain
.. kernel-doc:: arch/x86/include/asm/pgtable_types.h
:doc: PKS_KEY_ASSIGNMENT
+
+Changing permissions of individual keys
+---------------------------------------
+
+.. kernel-doc:: include/linux/pks.h
+ :identifiers: pks_set_readwrite
+
+MSR details
+~~~~~~~~~~~
+
+WRMSR is typically an architecturally serializing instruction. However,
+WRMSR(MSR_IA32_PKRS) is an exception. It is not a serializing instruction and
+instead maintains ordering properties similar to WRPKRU. Thus it is safe to
+immediately use a mapping when the pks_set*() functions return. Check the
+latest SDM for details.
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 39e4c2cbc279..e4cbc79686ea 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -6,6 +6,7 @@
#include <linux/debugfs.h> /* debugfs_create_u32() */
#include <linux/mm_types.h> /* mm_struct, vma, etc... */
#include <linux/pkeys.h> /* PKEY_* */
+#include <linux/pks.h>
#include <linux/pks-keys.h>
#include <uapi/asm-generic/mman-common.h>
@@ -275,4 +276,34 @@ void pks_setup(void)
cr4_set_bits(X86_CR4_PKS);
}
+/*
+ * Do not call this directly, see pks_set*().
+ *
+ * @pkey: Key for the domain to change
+ * @protection: protection bits to be used
+ *
+ * Protection utilizes the same protection bits specified for User pkeys
+ * PKEY_DISABLE_ACCESS
+ * PKEY_DISABLE_WRITE
+ *
+ */
+void pks_update_protection(u8 pkey, u8 protection)
+{
+ u32 pkrs;
+
+ if (!cpu_feature_enabled(X86_FEATURE_PKS))
+ return;
+
+ if (WARN_ON_ONCE(pkey >= PKS_KEY_MAX))
+ return;
+
+ pkrs = current->thread.pkrs;
+ current->thread.pkrs = pkey_update_pkval(pkrs, pkey,
+ protection);
+ preempt_disable();
+ pks_write_pkrs(current->thread.pkrs);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(pks_update_protection);
+
#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
diff --git a/include/linux/pks.h b/include/linux/pks.h
new file mode 100644
index 000000000000..8b705a937b19
--- /dev/null
+++ b/include/linux/pks.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PKS_H
+#define _LINUX_PKS_H
+
+#ifdef CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS
+
+#include <linux/types.h>
+
+#include <uapi/asm-generic/mman-common.h>
+
+void pks_update_protection(u8 pkey, u8 protection);
+
+/**
+ * pks_set_readwrite() - Make the domain Read/Write
+ * @pkey: the pkey for which the access should change.
+ *
+ * Allow all access, read and write, to the domain specified by pkey. This is
+ * not a global update and only affects the current running thread.
+ */
+static inline void pks_set_readwrite(u8 pkey)
+{
+ pks_update_protection(pkey, PKEY_READ_WRITE);
+}
+
+#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
+
+static inline void pks_set_readwrite(u8 pkey) {}
+
+#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
+
+#endif /* _LINUX_PKS_H */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6c1aa92a92e4..f179544bd33a 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -80,6 +80,7 @@
/* compatibility flags */
#define MAP_FILE 0
+#define PKEY_READ_WRITE 0x0
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
--
2.35.1
On Tue, Apr 19, 2022 at 10:06:19AM -0700, [email protected] wrote:
> From: Ira Weiny <[email protected]>
>
> When kernel code needs access to a PKS protected page they will need to
> change the protections for the pkey to Read/Write.
I'm excited to have this infrastructure available! It'll finally give us
the "write rarely" infrastructure we've needed:
https://github.com/KSPP/linux/issues/130
>
> Define pks_set_readwrite() to update the specified pkey. Define
> pks_update_protection() as a helper to do the heavy lifting and allow
> for subsequent pks_set_*() calls.
>
> Define PKEY_READ_WRITE rather than use a magic value of '0' in
> pks_update_protection().
>
> Finally, ensure preemption is disabled for pks_write_pkrs() because the
> context of this call can not generally be predicted.
>
> pks.h is created to avoid conflicts and header dependencies with the
> user space pkey code.
>
> Add documentation.
>
> Signed-off-by: Ira Weiny <[email protected]>
>
> ---
> changes for v9
> Move MSR documentation note to this patch
> move declarations to incline/linux/pks.h
> from rick edgecombe
> change pkey type to u8
> validate pkey range in pks_update_protection
> from 0day
> fix documentation link
> from dave hansen
> s/pks_mk_*/pks_set_*/
> use pkey
> s/pks_saved_pkrs/pkrs/
>
> changes for v8
> define pkey_read_write
> make the call inline
> clean up the names
> use pks_write_pkrs() with preemption disabled
> split this out from 'add pks kernel api'
> include documentation in this patch
> ---
> Documentation/core-api/protection-keys.rst | 15 +++++++++++
> arch/x86/mm/pkeys.c | 31 ++++++++++++++++++++++
> include/linux/pks.h | 31 ++++++++++++++++++++++
> include/uapi/asm-generic/mman-common.h | 1 +
> 4 files changed, 78 insertions(+)
> create mode 100644 include/linux/pks.h
>
> diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
> index fe63acf5abbe..3af92e1cbffd 100644
> --- a/Documentation/core-api/protection-keys.rst
> +++ b/Documentation/core-api/protection-keys.rst
> @@ -142,3 +142,18 @@ Adding pages to a pkey protected domain
>
> .. kernel-doc:: arch/x86/include/asm/pgtable_types.h
> :doc: PKS_KEY_ASSIGNMENT
> +
> +Changing permissions of individual keys
> +---------------------------------------
> +
> +.. kernel-doc:: include/linux/pks.h
> + :identifiers: pks_set_readwrite
> +
> +MSR details
> +~~~~~~~~~~~
> +
> +WRMSR is typically an architecturally serializing instruction. However,
> +WRMSR(MSR_IA32_PKRS) is an exception. It is not a serializing instruction and
> +instead maintains ordering properties similar to WRPKRU. Thus it is safe to
> +immediately use a mapping when the pks_set*() functions returns. Check the
> +latest SDM for details.
> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> index 39e4c2cbc279..e4cbc79686ea 100644
> --- a/arch/x86/mm/pkeys.c
> +++ b/arch/x86/mm/pkeys.c
> @@ -6,6 +6,7 @@
> #include <linux/debugfs.h> /* debugfs_create_u32() */
> #include <linux/mm_types.h> /* mm_struct, vma, etc... */
> #include <linux/pkeys.h> /* PKEY_* */
> +#include <linux/pks.h>
> #include <linux/pks-keys.h>
> #include <uapi/asm-generic/mman-common.h>
>
> @@ -275,4 +276,34 @@ void pks_setup(void)
> cr4_set_bits(X86_CR4_PKS);
> }
>
> +/*
> + * Do not call this directly, see pks_set*().
> + *
> + * @pkey: Key for the domain to change
> + * @protection: protection bits to be used
> + *
> + * Protection utilizes the same protection bits specified for User pkeys
> + * PKEY_DISABLE_ACCESS
> + * PKEY_DISABLE_WRITE
> + *
> + */
> +void pks_update_protection(u8 pkey, u8 protection)
For better security, I think this should be a static inline, not a
callable (i.e. as a non-inline it could be the target of a control
flow attack).
> +{
> + u32 pkrs;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_PKS))
> + return;
> +
> + if (WARN_ON_ONCE(pkey >= PKS_KEY_MAX))
> + return;
I think this should enforce arguments being __builtin_constant_p(). i.e.
making sure that all callers of pks_update_protection() are using a
compile-time constant value. That makes it so that the caller location
and key become hard-coded (i.e. further reduction in risk to becoming a
control-flow gadget: the inlining of a const value means no arguments
any more). For example:
BUILD_BUG_ON(!__builtin_constant_p(pkey));
BUILD_BUG_ON(!__builtin_constant_p(protection));
(I think the test code will need some tweaking, but it should be
possible to adjust it.)
> +
> + pkrs = current->thread.pkrs;
> + current->thread.pkrs = pkey_update_pkval(pkrs, pkey,
> + protection);
> + preempt_disable();
> + pks_write_pkrs(current->thread.pkrs);
> + preempt_enable();
To resist cross-thread attacks, please:
- make pkey_update_pkval() also an inline
- use the pkrs variable directly and store it back only after the write
For example:
preempt_disable();
pkrs = pkey_update_pkval(current->thread.pkrs,
pkey, protection);
pks_write_pkrs(pkrs);
current->thread.pkrs = pkrs;
preempt_enable();
This means that the pkey/protection relationship always lives in a
CPU-local register and cannot be manipulated by another CPU before the
msr write completes. Better yet would be:
preempt_disable();
rdmsrl(MSR_IA32_PKRS, pkrs);
pkrs = pkey_update_pkval(pkrs, pkey, protection);
pks_write_pkrs(pkrs);
current->thread.pkrs = pkrs;
preempt_enable();
Then cross-thread attacks cannot corrupt the _other_ PKS keys (i.e.
write the desired changes to target's current->thread.pkrs and trigger
an update to a different pkey, resulting in flushing the attacker's
changes to that CPU's pkey state.)
> +/**
> + * pks_set_readwrite() - Make the domain Read/Write
> + * @pkey: the pkey for which the access should change.
> + *
> + * Allow all access, read and write, to the domain specified by pkey. This is
> + * not a global update and only affects the current running thread.
> + */
> +static inline void pks_set_readwrite(u8 pkey)
> +{
> + pks_update_protection(pkey, PKEY_READ_WRITE);
> +}
While adding these, can you please also add pks_set_nowrite()? This
will be needed for protecting writes to memory that should be otherwise
readable.
With these changes it should be possible to protect the kernel's page
table entries from "stray" writes. :)
-Kees
> +
> +#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
> +static inline void pks_set_readwrite(u8 pkey) {}
> +
> +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> +
> +#endif /* _LINUX_PKS_H */
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index 6c1aa92a92e4..f179544bd33a 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -80,6 +80,7 @@
> /* compatibility flags */
> #define MAP_FILE 0
>
> +#define PKEY_READ_WRITE 0x0
> #define PKEY_DISABLE_ACCESS 0x1
> #define PKEY_DISABLE_WRITE 0x2
> #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
> --
> 2.35.1
>
--
Kees Cook
On Tue, May 10, 2022 at 02:33:03PM -0700, Ira Weiny wrote:
> On Mon, May 09, 2022 at 02:38:38PM -0700, Kees Cook wrote:
> > [...]
> > Better yet would be:
> >
> > preempt_disable();
> > rdmsrl(MSR_IA32_PKRS, pkrs);
> > pkrs = pkey_update_pkval(pkrs, pkey, protection);
> > pks_write_pkrs(pkrs);
> > current->thread.pkrs = pkrs;
> > preempt_enable();
> >
> > Then cross-thread attacks cannot corrupt the _other_ PKS keys (i.e.
> > write the desired changes to target's current->thread.kprs and trigger
> > an update to a different pkey, resulting in flushing the attacker's
> > changes to that CPU's pkey state.
>
> Unfortunately I don't think this entirely prevents an attack through the
> thread.pkrs value. thread.pkrs has to be used to set the MSR when a thread is
> scheduled. Therefore the rdmsrl above will by definition pick up the
> thread.pkrs but from an earlier time.
Ooh, good point, yeah.
> I'm not opposed to doing this as I think it does reduce the time window of such
> an attack but I wanted to mention it. Especially since I specifically avoided
> ever reading the MSR to improve performance.
>
> I'm going to run some tests. Perhaps the MSR read is not that big of a deal
> and I can convince myself that the performance diff is negligible.
Yeah, given "loaded at scheduling" point, there's not much benefit in
read/write pair. I think my first suggestion about only writing to
thread.pkrs after the write, etc, still stands. I'll ponder this a bit
more.
> > While adding these, can you please also add pks_set_nowrite()? This
> > will be needed for protecting writes to memory that should be otherwise
> > readable.
>
> I have a patch to add pks_set_readonly() but I was advised to not send it
> because this series does not include a use case for it. (PMEM does not need
> it.)
>
> Dave, Dan? Are you ok adding that back?
>
> Kees would you prefer pks_set_nowrite() as a name?
I think nowrite is the better name (in the sense that "read-only" can
sometimes imply non-executable).
> >
> > With these changes it should be possible to protect the kernel's page
> > table entries from "stray" writes. :)
>
> Yes, Rick has done some great work in that area.
Oh! I would _love_ to see this series. I was trying to scope the work
yesterday but gave up after I couldn't figure out the qemu PKS trick. :)
--
Kees Cook
On Tue, 2022-05-10 at 15:08 -0700, Kees Cook wrote:
> > Kees would you prefer pks_set_nowrite() as a name?
>
> I think nowrite is the better name (in the sense that "read-only" can
> sometimes imply non-executable).
I agree with this here. Read-only is a bad name for not writable.
Especially if you try talking about "execute-only" memory which is
"read-only" (not writable) and "not readable". Very confusing.
>
> > >
> > > With these changes it should be possible to protect the kernel's
> > > page
> > > table entries from "stray" writes. :)
> >
> > Yes, Rick has done some great work in that area.
>
> Oh! I would _love_ to see this series. I was trying to scope the work
> yesterday but gave up after I couldn't figure out the qemu PKS trick.
> :)
I would still like to get back to it, but other work has bumped it for
now.
v1:
https://lore.kernel.org/lkml/[email protected]/#r
v2:
https://lore.kernel.org/lkml/[email protected]/#r
Mostly it fit together pretty easily, but there was memory overhead
required to protect the page tables that map the direct map fully
(unless a better solution can be found).
On Mon, May 09, 2022 at 02:38:38PM -0700, Kees Cook wrote:
> On Tue, Apr 19, 2022 at 10:06:19AM -0700, [email protected] wrote:
> > From: Ira Weiny <[email protected]>
> >
> > When kernel code needs access to a PKS protected page they will need to
> > change the protections for the pkey to Read/Write.
>
> I'm excited to have this infrastructure available! It'll finally give us
> the "write rarely" infrastructure we've needed:
> https://github.com/KSPP/linux/issues/130
Thanks!
[snip]
> >
> > @@ -275,4 +276,34 @@ void pks_setup(void)
> > cr4_set_bits(X86_CR4_PKS);
> > }
> >
> > +/*
> > + * Do not call this directly, see pks_set*().
> > + *
> > + * @pkey: Key for the domain to change
> > + * @protection: protection bits to be used
> > + *
> > + * Protection utilizes the same protection bits specified for User pkeys
> > + * PKEY_DISABLE_ACCESS
> > + * PKEY_DISABLE_WRITE
> > + *
> > + */
> > +void pks_update_protection(u8 pkey, u8 protection)
>
> For better security, I think this should be a static inline, not a
> callable (i.e. as a non-inline it could be the target of a control
> flow attack).
Good point! I'll move this to asm/pks.h.
>
> > +{
> > + u32 pkrs;
> > +
> > + if (!cpu_feature_enabled(X86_FEATURE_PKS))
> > + return;
> > +
> > + if (WARN_ON_ONCE(pkey >= PKS_KEY_MAX))
> > + return;
>
> I think this should enforce arguments being __builtin_constant_p(). i.e.
> making sure that all callers of pks_update_protection() are using a
> compile-time constant value. That makes it so that the caller location
> and key become hard-coded (i.e. further reduction in risk to becoming a
> control-flow gadget: the inlining of a const value means no arguments
> any more). For example:
>
> BUILD_BUG_ON(!__builtin_constant_p(pkey));
> BUILD_BUG_ON(!__builtin_constant_p(protection));
Sounds reasonable.
>
> (I think the test code will need some tweaking, but it should be
> possible to adjust it.)
I'll figure it out.
>
> > +
> > + pkrs = current->thread.pkrs;
> > + current->thread.pkrs = pkey_update_pkval(pkrs, pkey,
> > + protection);
> > + preempt_disable();
> > + pks_write_pkrs(current->thread.pkrs);
> > + preempt_enable();
>
> To resist cross-thread attacks, please:
>
> - make pkey_update_pkval() also an inline
> - use the pkrs variable directly and store it back only after the write
>
> For example:
>
> preempt_disable();
> pkrs = pkey_update_pkval(current->thread.pkrs,
> pkey, protection);
> pks_write_pkrs(pkrs);
> current->thread.pkrs = pkrs;
> preempt_enable();
>
> This means that the pkey/protection relationship always lives in a
> CPU-local register and cannot be manipulated by another CPU before the
> msr write completes.
Yes this sounds good. Thanks for the tip.
> Better yet would be:
>
> preempt_disable();
> rdmsrl(MSR_IA32_PKRS, pkrs);
> pkrs = pkey_update_pkval(pkrs, pkey, protection);
> pks_write_pkrs(pkrs);
> current->thread.pkrs = pkrs;
> preempt_enable();
>
> Then cross-thread attacks cannot corrupt the _other_ PKS keys (i.e.
> write the desired changes to target's current->thread.kprs and trigger
> an update to a different pkey, resulting in flushing the attacker's
> changes to that CPU's pkey state.
Unfortunately I don't think this entirely prevents an attack through the
thread.pkrs value. thread.pkrs has to be used to set the MSR when a thread is
scheduled. Therefore the rdmsrl above will by definition pick up the
thread.pkrs but from an earlier time.
I'm not opposed to doing this as I think it does reduce the time window of such
an attack but I wanted to mention it. Especially since I specifically avoided
ever reading the MSR to improve performance.
I'm going to run some tests. Perhaps the MSR read is not that big of a deal
and I can convince myself that the performance diff is negligible.
>
> > +/**
> > + * pks_set_readwrite() - Make the domain Read/Write
> > + * @pkey: the pkey for which the access should change.
> > + *
> > + * Allow all access, read and write, to the domain specified by pkey. This is
> > + * not a global update and only affects the current running thread.
> > + */
> > +static inline void pks_set_readwrite(u8 pkey)
> > +{
> > + pks_update_protection(pkey, PKEY_READ_WRITE);
> > +}
>
> While adding these, can you please also add pks_set_nowrite()? This
> will be needed for protecting writes to memory that should be otherwise
> readable.
I have a patch to add pks_set_readonly() but I was advised to not send it
because this series does not include a use case for it. (PMEM does not need
it.)
Dave, Dan? Are you ok adding that back?
Kees would you prefer pks_set_nowrite() as a name?
>
> With these changes it should be possible to protect the kernel's page
> table entries from "stray" writes. :)
Yes, Rick has done some great work in that area.
Ira
>
> -Kees
>
> > +
> > +#else /* !CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> > +
> > +static inline void pks_set_readwrite(u8 pkey) {}
> > +
> > +#endif /* CONFIG_ARCH_ENABLE_SUPERVISOR_PKEYS */
> > +
> > +#endif /* _LINUX_PKS_H */
> > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> > index 6c1aa92a92e4..f179544bd33a 100644
> > --- a/include/uapi/asm-generic/mman-common.h
> > +++ b/include/uapi/asm-generic/mman-common.h
> > @@ -80,6 +80,7 @@
> > /* compatibility flags */
> > #define MAP_FILE 0
> >
> > +#define PKEY_READ_WRITE 0x0
> > #define PKEY_DISABLE_ACCESS 0x1
> > #define PKEY_DISABLE_WRITE 0x2
> > #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
> > --
> > 2.35.1
> >
>
> --
> Kees Cook
On Tue, May 10, 2022 at 10:26:43PM +0000, Edgecombe, Rick P wrote:
> v2:
> https://lore.kernel.org/lkml/[email protected]/#r
>
> Mostly it fit together pretty easily, but there was memory overhead
> required to protect the page tables that map the direct map fully
> (unless a better solution can be found).
Oh my, I totally missed this. Yay for Too Much Email Every Day. ;)
I'll give this a read; thanks for working on it!
Also, can you speak to the schedule of availability for PKS?
--
Kees Cook
On Tue, May 10, 2022 at 08:15:45PM -0700, Kees Cook wrote:
> On Tue, May 10, 2022 at 10:26:43PM +0000, Edgecombe, Rick P wrote:
> > v2:
> > https://lore.kernel.org/lkml/[email protected]/#r
> >
> > Mostly it fit together pretty easily, but there was memory overhead
> > required to protect the page tables that map the direct map fully
> > (unless a better solution can be found).
>
> Oh my, I totally missed this. Yay for Too Much Email Every Day. ;)
>
> I'll give this a read; thanks for working on it!
>
> Also, can you speak to the schedule of availability for PKS?
Sapphire Rapids is the first CPU. But I can't speak to any dates.
I'm reworking the series now with your changes.
Thanks again for the review!
Ira