2020-07-14 07:08:26

by Ira Weiny

[permalink] [raw]
Subject: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

From: Ira Weiny <[email protected]>

The PKRS MSR is defined as a per-core register. This isolates memory
access by CPU. Unfortunately, the MSR is not preserved by XSAVE.
Therefore, we must preserve the protections for individual tasks even if
they are context switched out and placed on another cpu later.

Define a saved PKRS value in the task struct, as well as a cached
per-cpu MSR value which mirrors the MSR value of the current CPU.
Initialize all tasks with the default MSR value. Then, on schedule in,
check the saved task MSR value against the per-cpu value. If they differ,
write the MSR. If not, avoid the overhead of the MSR write and
continue.

Follow-on patches will update the saved PKRS as well as the MSR if
needed.

Co-developed-by: Fenghua Yu <[email protected]>
Signed-off-by: Fenghua Yu <[email protected]>
Signed-off-by: Ira Weiny <[email protected]>
---
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/include/asm/pkeys_internal.h | 20 +++++++++++++++
arch/x86/include/asm/processor.h | 12 +++++++++
arch/x86/kernel/cpu/common.c | 2 ++
arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++++
arch/x86/mm/pkeys.c | 13 ++++++++++
6 files changed, 83 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e8370e64a155..b6ffdfc3f388 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -727,6 +727,7 @@

#define MSR_IA32_TSC_DEADLINE 0x000006E0

+#define MSR_IA32_PKRS 0x000006E1

#define MSR_TSX_FORCE_ABORT 0x0000010F

diff --git a/arch/x86/include/asm/pkeys_internal.h b/arch/x86/include/asm/pkeys_internal.h
index a9f086f1e4b4..05257cdc7200 100644
--- a/arch/x86/include/asm/pkeys_internal.h
+++ b/arch/x86/include/asm/pkeys_internal.h
@@ -8,4 +8,24 @@

#define PKR_AD_KEY(pkey) (PKR_AD_BIT << ((pkey) * PKR_BITS_PER_PKEY))

+/*
+ * Define a default PKRS value for each task.
+ *
+ * Key 0 has no restriction. Keys 1-15 are each set to the most restrictive
+ * value, which is access disabled (AD=1); the result is 0x55555554.
+ *
+ * NOTE: This needs to be a macro to be used as part of the INIT_THREAD macro.
+ */
+#define INIT_PKRS_VALUE (PKR_AD_KEY(1) | PKR_AD_KEY(2) | PKR_AD_KEY(3) | \
+ PKR_AD_KEY(4) | PKR_AD_KEY(5) | PKR_AD_KEY(6) | \
+ PKR_AD_KEY(7) | PKR_AD_KEY(8) | PKR_AD_KEY(9) | \
+ PKR_AD_KEY(10) | PKR_AD_KEY(11) | PKR_AD_KEY(12) | \
+ PKR_AD_KEY(13) | PKR_AD_KEY(14) | PKR_AD_KEY(15))
+
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+void write_pkrs(u32 pkrs_val);
+#else
+static inline void write_pkrs(u32 pkrs_val) { }
+#endif
+
#endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7da9855b5068..704d9f28fd4e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -545,6 +545,11 @@ struct thread_struct {

unsigned int sig_on_uaccess_err:1;

+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+ /* Saved Protection key register for supervisor mappings */
+ u32 saved_pkrs;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
@@ -907,8 +912,15 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP TASK_SIZE_LOW
#define STACK_TOP_MAX TASK_SIZE_MAX

+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+#define INIT_THREAD_PKRS .saved_pkrs = INIT_PKRS_VALUE,
+#else
+#define INIT_THREAD_PKRS
+#endif
+
#define INIT_THREAD { \
.addr_limit = KERNEL_DS, \
+ INIT_THREAD_PKRS \
}

extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f34bcefeda42..b8241936cbbf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -56,6 +56,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
+#include <asm/pkeys_internal.h>

#include "cpu.h"

@@ -1442,6 +1443,7 @@ static void setup_pks(void)
return;

cr4_set_bits(X86_CR4_PKS);
+ write_pkrs(INIT_PKRS_VALUE);
}

/*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f362ce0d5ac0..d69250a7c1bf 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -42,6 +42,7 @@
#include <asm/spec-ctrl.h>
#include <asm/io_bitmap.h>
#include <asm/proto.h>
+#include <asm/pkeys_internal.h>

#include "process.h"

@@ -184,6 +185,36 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
return ret;
}

+/*
+ * NOTE: pks_init_task() and pks_sched_in() are wrapped in
+ * CONFIG_ARCH_HAS_SUPERVISOR_PKEYS because IS_ENABLED() cannot be used:
+ * thread_struct->saved_pkrs does not exist when the option is off.
+ * They live here rather than in the pkeys headers because pulling the
+ * task_struct definition into those headers causes header conflicts.
+ * The PKRS value is a u32 everywhere: saved_pkrs, pkrs_cache, write_pkrs().
+ */
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+DECLARE_PER_CPU(u32, pkrs_cache);
+static inline void pks_init_task(struct task_struct *tsk)
+{
+ /* New tasks get the most restrictive PKRS value */
+ tsk->thread.saved_pkrs = INIT_PKRS_VALUE;
+}
+static inline void pks_sched_in(void)
+{
+ u32 current_pkrs = current->thread.saved_pkrs;
+
+ /* Only write the MSR when current's pkrs differs from this CPU's cache. */
+ if (this_cpu_read(pkrs_cache) == current_pkrs)
+ return;
+
+ write_pkrs(current_pkrs);
+}
+#else
+static inline void pks_init_task(struct task_struct *tsk) { }
+static inline void pks_sched_in(void) { }
+#endif
+
void flush_thread(void)
{
struct task_struct *tsk = current;
@@ -192,6 +223,8 @@ void flush_thread(void)
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

fpu__clear_all(&tsk->thread.fpu);
+
+ pks_init_task(tsk);
}

void disable_TSC(void)
@@ -655,6 +688,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)

if ((tifp ^ tifn) & _TIF_SLD)
switch_to_sld(tifn);
+
+ pks_sched_in();
}

/*
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index a5c680d32930..0f86f2374bd7 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -236,3 +236,16 @@ u32 get_new_pkr(u32 old_pkr, int pkey, unsigned long init_val)
/* Return the old part along with the new part: */
return old_pkr | new_pkr_bits;
}
+
+DEFINE_PER_CPU(u32, pkrs_cache);
+
+/*
+ * Write the PKey Register Supervisor (PKRS) and mirror the value in the
+ * per-cpu pkrs_cache.
+ *
+ * @pkrs_val: new value for MSR_IA32_PKRS on the current CPU.
+ *
+ * This is a no-op on CPUs without PKS. Writing the MSR there faults
+ * ("unchecked MSR access error: WRMSR to 0x6e1"), and updating pkrs_cache
+ * would record a value the hardware never took.
+ *
+ * This must be run with preemption disabled as it does not guarantee the
+ * atomicity of updating the pkrs_cache and MSR on its own.
+ */
+void write_pkrs(u32 pkrs_val)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_PKS))
+ return;
+
+ this_cpu_write(pkrs_cache, pkrs_val);
+ wrmsrl(MSR_IA32_PKRS, pkrs_val);
+}
--
2.25.1


2020-07-14 08:28:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

On Tue, Jul 14, 2020 at 12:02:09AM -0700, [email protected] wrote:
> From: Ira Weiny <[email protected]>
>
> The PKRS MSR is defined as a per-core register. This isolates memory
> access by CPU. Unfortunately, the MSR is not preserved by XSAVE.
> Therefore, We must preserve the protections for individual tasks even if
> they are context switched out and placed on another cpu later.

This is a contradiction and utter trainwreck. We're not going to do more
per-core MSRs and pretend they make sense per-task.

2020-07-14 18:55:15

by Ira Weiny

[permalink] [raw]
Subject: Re: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

On Tue, Jul 14, 2020 at 10:27:01AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 14, 2020 at 12:02:09AM -0700, [email protected] wrote:
> > From: Ira Weiny <[email protected]>
> >
> > The PKRS MSR is defined as a per-core register. This isolates memory
> > access by CPU. Unfortunately, the MSR is not preserved by XSAVE.
> > Therefore, We must preserve the protections for individual tasks even if
> > they are context switched out and placed on another cpu later.
>
> This is a contradiction and utter trainwreck.

I don't understand where there is a contradiction? Perhaps I should have said
the MSR is not XSAVE managed vs 'preserved'?

> We're not going to do more
> per-core MSRs and pretend they make sense per-task.

I don't understand how this does not make sense. The PKRS register is
controlling the task's access to kernel memory and is designed to be restricted
to that task. Put another way, this is similar to CR3 which ultimately
controls tasks memory access. Per-process mm is inherent to memory access
control and is per-task. So how is this any different? Many MSRs are like
this.

I suppose an alternative might be to disallow a context switch while the PKRS
value is not the default but I don't see this being very desirable at all.

Ira

2020-07-14 18:58:07

by Dave Hansen

[permalink] [raw]
Subject: Re: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

On 7/14/20 11:53 AM, Ira Weiny wrote:
>>> The PKRS MSR is defined as a per-core register.

Just to be clear, PKRS is a per-logical-processor register, just like
PKRU. The "per-core" thing here is a typo.

2020-07-14 19:08:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

On Tue, Jul 14, 2020 at 11:53:22AM -0700, Ira Weiny wrote:
> On Tue, Jul 14, 2020 at 10:27:01AM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 14, 2020 at 12:02:09AM -0700, [email protected] wrote:
> > > From: Ira Weiny <[email protected]>
> > >
> > > The PKRS MSR is defined as a per-core register. This isolates memory
> > > access by CPU. Unfortunately, the MSR is not preserved by XSAVE.
> > > Therefore, We must preserve the protections for individual tasks even if
> > > they are context switched out and placed on another cpu later.
> >
> > This is a contradiction and utter trainwreck.
>
> I don't understand where there is a contradiction? Perhaps I should have said
> the MSR is not XSAVE managed vs 'preserved'?

You're stating the MSR is per-*CORE*, and then continue to talk about
per-task state.

We've had a bunch of MSRs have exactly that problem recently, and it's
not fun. We're not going to do that again.

2020-07-14 19:10:45

by Ira Weiny

[permalink] [raw]
Subject: Re: [RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch

On Tue, Jul 14, 2020 at 09:05:39PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 14, 2020 at 11:53:22AM -0700, Ira Weiny wrote:
> > On Tue, Jul 14, 2020 at 10:27:01AM +0200, Peter Zijlstra wrote:
> > > On Tue, Jul 14, 2020 at 12:02:09AM -0700, [email protected] wrote:
> > > > From: Ira Weiny <[email protected]>
> > > >
> > > > The PKRS MSR is defined as a per-core register. This isolates memory
> > > > access by CPU. Unfortunately, the MSR is not preserved by XSAVE.
> > > > Therefore, We must preserve the protections for individual tasks even if
> > > > they are context switched out and placed on another cpu later.
> > >
> > > This is a contradiction and utter trainwreck.
> >
> > I don't understand where there is a contradiction? Perhaps I should have said
> > the MSR is not XSAVE managed vs 'preserved'?
>
> You're stating the MSR is per-*CORE*, and then continue to talk about
> per-task state.
>
> We've had a bunch of MSRs have exactly that problem recently, and it's
> not fun. We're not going to do that again.

Ah sorry, my mistake yes I meant 'per-logical-processor' like Dave said. I'll
update the commit message.

Ira

2020-07-16 08:40:08

by Chen, Rong A

[permalink] [raw]
Subject: [x86/pks] 061e3e0b92: leaking-addresses.dmesg.uncheckedMSRaccesserror:WRMSRto0x6e1(triedtowrite0x0000000055555554)atrIP:(write_pkrs+0x15/0x2b)

Greeting,

FYI, we noticed the following commit (built with gcc-9):

commit: 061e3e0b92e26642bf69bf520767adecec855b6f ("[RFC PATCH 04/15] x86/pks: Preserve the PKRS MSR on context switch")
url: https://github.com/0day-ci/linux/commits/ira-weiny-intel-com/PKS-Add-Protection-Keys-Supervisor-PKS-support/20200714-151030
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 0dc589da873b58b70f4caf4b070fb0cf70fdd1dc

in testcase: leaking-addresses
with following parameters:

ucode: 0x21



on test machine: 8 threads Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz with 16G memory

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):



If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>



2020-07-15 05:21:03 ./leaking_addresses.pl --output-raw result/scan.out
2020-07-15 05:21:29 ./leaking_addresses.pl --input-raw result/scan.out --squash-by-filename

Total number of results from scan (incl dmesg): 150520

dmesg output:
[ 0.027642] mapped APIC to ffffffffff5fc000 ( fee00000)
[ 0.039863] mapped IOAPIC to ffffffffff5fb000 (fec00000)
[ 7.259462] unchecked MSR access error: WRMSR to 0x6e1 (tried to write 0x0000000055555554) at rIP: 0xffffffff810743d5 (write_pkrs+0x15/0x2b)
[ 7.905588] RSP: 0018:ffffffff82603e58 EFLAGS: 00000202
[ 7.905589] RAX: ffff88841f22b1c0 RBX: ffff88841f236100 RCX: 000000000000001f
[ 7.905590] RBP: ffffffff82763a80 R08: 00000001d734dad2 R09: 000000007fffffff
[ 7.905591] R10: 0000000000000001 R11: ffff88841f22a324 R12: 00000001d734dad2
[ 7.905591] R13: 0000000000000004 R14: 0000000000000004 R15: ffffffff82612840


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp run job.yaml



Thanks,
Rong Chen


Attachments:
(No filename) (1.90 kB)
config-5.8.0-rc5-00012-g061e3e0b92e26 (160.80 kB)
job-script (5.26 kB)
kmsg.xz (20.81 kB)
leaking-addresses (3.33 kB)
job.yaml (4.17 kB)
reproduce (126.00 B)
Download all attachments