2016-10-19 02:03:56

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 0/6] x86/arch_prctl Add ARCH_[GET|SET]_CPUID for controlling the CPUID instruction

rr (http://rr-project.org/), a userspace record-and-replay reverse-
execution debugger, would like to trap and emulate the CPUID instruction.
This would allow us to a) mask away certain hardware features that rr does
not support (e.g. RDRAND) and b) enable trace portability across machines
by providing constant results.

Newer Intel CPUs (Ivy Bridge and later) can fault when CPUID is executed at
CPL > 0. Expose this capability to userspace as a new pair of arch_prctls,
ARCH_GET_CPUID and ARCH_SET_CPUID, with two values, ARCH_CPUID_ENABLE and
ARCH_CPUID_SIGSEGV.

v6 did not get any feedback, so I included the changes since v5.

Since v6:
- Rebased to 4.9-rc1 (primarily over the new VDSO arch_prctls and the new pkey
entries in the syscall table)

Since v5:

Patch 3:
- do_arch_prctl is again do_arch_prctl_common

Patch 6:
- Added static qualifiers on get/set_cpuid_mode
- Braced ifs in disable/enable_cpuid
- Renamed arch_post_exec to arch_setup_new_exec


2016-10-19 02:04:04

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 3/6] x86/arch_prctl: Add do_arch_prctl_common

Add do_arch_prctl_common to handle arch_prctls that are not specific to 64
bits. Call it from the syscall entry point, but not any of the other
callsites in the kernel, which all want one of the existing 64 bit only
arch_prctls.

Signed-off-by: Kyle Huey <[email protected]>
---
arch/x86/include/asm/proto.h | 1 +
arch/x86/kernel/process.c | 5 +++++
arch/x86/kernel/process_64.c | 8 +++++++-
3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 95c3e51..f72b551 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -30,6 +30,7 @@ void x86_report_nx(void);

extern int reboot_force;

+long do_arch_prctl_common(struct task_struct *task, int code, unsigned long arg2);
#ifdef CONFIG_X86_64
long do_arch_prctl_64(struct task_struct *task, int code, unsigned long arg2);
#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a87..d0126b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -584,3 +584,8 @@ unsigned long get_wchan(struct task_struct *p)
put_task_stack(p);
return ret;
}
+
+long do_arch_prctl_common(struct task_struct *task, int code, unsigned long arg2)
+{
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 611df20..bf75d26 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -617,7 +617,13 @@ long do_arch_prctl_64(struct task_struct *task, int code, unsigned long arg2)

SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
{
- return do_arch_prctl_64(current, code, arg2);
+ long ret;
+
+ ret = do_arch_prctl_64(current, code, arg2);
+ if (ret == -EINVAL)
+ ret = do_arch_prctl_common(current, code, arg2);
+
+ return ret;
}

unsigned long KSTK_ESP(struct task_struct *task)
--
2.10.1

2016-10-19 02:04:19

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 2/6] x86/arch_prctl/64: Rename do_arch_prctl to do_arch_prctl_64

In order to introduce new arch_prctls that are not 64 bit only, rename the
existing 64 bit implementation to do_arch_prctl_64. Also rename the second
argument to arch_prctl, which will no longer always be an address.

Signed-off-by: Kyle Huey <[email protected]>
Reviewed-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/proto.h | 4 +++-
arch/x86/kernel/process_64.c | 32 +++++++++++++++++---------------
arch/x86/kernel/ptrace.c | 8 ++++----
arch/x86/um/syscalls_64.c | 4 ++--
4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 9b9b30b..95c3e51 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -30,6 +30,8 @@ void x86_report_nx(void);

extern int reboot_force;

-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+#ifdef CONFIG_X86_64
+long do_arch_prctl_64(struct task_struct *task, int code, unsigned long arg2);
+#endif

#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2718cf9..611df20 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -198,7 +198,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
(struct user_desc __user *)tls, 0);
else
#endif
- err = do_arch_prctl(p, ARCH_SET_FS, tls);
+ err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
@@ -539,7 +539,7 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
}
#endif

-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int code, unsigned long arg2)
{
int ret = 0;
int doit = task == current;
@@ -547,62 +547,64 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)

switch (code) {
case ARCH_SET_GS:
- if (addr >= TASK_SIZE_MAX)
+ if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.gsindex = 0;
- task->thread.gsbase = addr;
+ task->thread.gsbase = arg2;
if (doit) {
load_gs_index(0);
- ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
+ ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
}
put_cpu();
break;
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
- if (addr >= TASK_SIZE_MAX)
+ if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.fsindex = 0;
- task->thread.fsbase = addr;
+ task->thread.fsbase = arg2;
if (doit) {
/* set the selector to 0 to not confuse __switch_to */
loadsegment(fs, 0);
- ret = wrmsrl_safe(MSR_FS_BASE, addr);
+ ret = wrmsrl_safe(MSR_FS_BASE, arg2);
}
put_cpu();
break;
case ARCH_GET_FS: {
unsigned long base;
+
if (doit)
rdmsrl(MSR_FS_BASE, base);
else
base = task->thread.fsbase;
- ret = put_user(base, (unsigned long __user *)addr);
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
unsigned long base;
+
if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gsbase;
- ret = put_user(base, (unsigned long __user *)addr);
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
case ARCH_MAP_VDSO_X32:
- return prctl_map_vdso(&vdso_image_x32, addr);
+ return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case ARCH_MAP_VDSO_32:
- return prctl_map_vdso(&vdso_image_32, addr);
+ return prctl_map_vdso(&vdso_image_32, arg2);
# endif
case ARCH_MAP_VDSO_64:
- return prctl_map_vdso(&vdso_image_64, addr);
+ return prctl_map_vdso(&vdso_image_64, arg2);
#endif

default:
@@ -613,9 +615,9 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
return ret;
}

-SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, addr)
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
{
- return do_arch_prctl(current, code, addr);
+ return do_arch_prctl_64(current, code, arg2);
}

unsigned long KSTK_ESP(struct task_struct *task)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0e63c02..5004302 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -395,12 +395,12 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
/*
- * When changing the segment base, use do_arch_prctl
+ * When changing the segment base, use do_arch_prctl_64
* to set either thread.fs or thread.fsindex and the
* corresponding GDT slot.
*/
if (child->thread.fsbase != value)
- return do_arch_prctl(child, ARCH_SET_FS, value);
+ return do_arch_prctl_64(child, ARCH_SET_FS, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
/*
@@ -409,7 +409,7 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
if (child->thread.gsbase != value)
- return do_arch_prctl(child, ARCH_SET_GS, value);
+ return do_arch_prctl_64(child, ARCH_SET_GS, value);
return 0;
#endif
}
@@ -868,7 +868,7 @@ long arch_ptrace(struct task_struct *child, long request,
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
- ret = do_arch_prctl(child, data, addr);
+ ret = do_arch_prctl_64(child, data, addr);
break;
#endif

diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index ab3f7f4..3362c4e 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -73,9 +73,9 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
return ret;
}

-SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, addr)
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
{
- return arch_prctl(current, code, (unsigned long __user *) addr);
+ return arch_prctl(current, code, (unsigned long __user *) arg2);
}

void arch_switch_to(struct task_struct *to)
--
2.10.1

2016-10-19 02:04:16

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 1/6] x86/arch_prctl/64: Use SYSCALL_DEFINE2 to define sys_arch_prctl

Signed-off-by: Kyle Huey <[email protected]>
---
arch/x86/kernel/process_64.c | 3 ++-
arch/x86/um/syscalls_64.c | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3..2718cf9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -35,6 +35,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
+#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -612,7 +613,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
return ret;
}

-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, addr)
{
return do_arch_prctl(current, code, addr);
}
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index e655227..ab3f7f4 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -6,6 +6,7 @@
*/

#include <linux/sched.h>
+#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <os.h>
@@ -72,7 +73,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
return ret;
}

-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, addr)
{
return arch_prctl(current, code, (unsigned long __user *) addr);
}

base-commit: 1001354ca34179f3db924eb66672442a173147dc
--
2.10.1

2016-10-19 02:04:59

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 6/6] x86/arch_prctl: Add ARCH_[GET|SET]_CPUID

Intel supports faulting on the CPUID instruction beginning with Ivy Bridge.
When enabled, the processor will fault on attempts to execute the CPUID
instruction with CPL>0. Exposing this feature to userspace will allow a
ptracer to trap and emulate the CPUID instruction.

When supported, this feature is controlled by toggling bit 0 of
MSR_MISC_FEATURES_ENABLES. It is documented in detail in Section 2.3.2 of
http://www.intel.com/content/dam/www/public/us/en/documents/application-notes/virtualization-technology-flexmigration-application-note.pdf

Implement a new pair of arch_prctls, available on both x86-32 and x86-64.

ARCH_GET_CPUID: Returns the current CPUID faulting state, either
ARCH_CPUID_ENABLE or ARCH_CPUID_SIGSEGV. arg2 must be 0.

ARCH_SET_CPUID: Set the CPUID faulting state to arg2, which must be either
ARCH_CPUID_ENABLE or ARCH_CPUID_SIGSEGV. Returns -EINVAL if arg2 is
another value or CPUID faulting is not supported on this system.

The state of the CPUID faulting flag is propagated across forks, but reset
upon exec.

Signed-off-by: Kyle Huey <[email protected]>
---
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/include/asm/thread_info.h | 6 +-
arch/x86/include/uapi/asm/prctl.h | 6 +
arch/x86/kernel/process.c | 96 ++++++++++++-
fs/exec.c | 1 +
include/linux/thread_info.h | 4 +
tools/testing/selftests/x86/Makefile | 2 +-
tools/testing/selftests/x86/cpuid-fault.c | 231 ++++++++++++++++++++++++++++++
8 files changed, 344 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/x86/cpuid-fault.c

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 39aa563..cddefdd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -53,6 +53,7 @@
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+#define MSR_MISC_FEATURES_ENABLES 0x00000140

#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2aaca53..59dd761 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -78,6 +78,7 @@ struct task_struct;
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
+#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
@@ -101,6 +102,7 @@ struct task_struct;
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_NOHZ (1 << TIF_NOHZ)
@@ -129,7 +131,7 @@ struct task_struct;

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)

#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
@@ -230,6 +232,8 @@ static inline int arch_within_stack_frames(const void * const stack,
extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
#endif /* !__ASSEMBLY__ */

#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index ae135de..0f6389c 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,6 +6,12 @@
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004

+/* Get/set the process' ability to use the CPUID instruction */
+#define ARCH_GET_CPUID 0x1005
+#define ARCH_SET_CPUID 0x1006
+# define ARCH_CPUID_ENABLE 1 /* allow the use of the CPUID instruction */
+# define ARCH_CPUID_SIGSEGV 2 /* throw a SIGSEGV instead of reading the CPUID */
+
#ifdef CONFIG_CHECKPOINT_RESTORE
# define ARCH_MAP_VDSO_X32 0x2001
# define ARCH_MAP_VDSO_32 0x2002
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d0126b2..4bbb2eb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -33,6 +33,7 @@
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
+#include <asm/prctl.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -192,6 +193,70 @@ int set_tsc_mode(unsigned int val)
return 0;
}

+static void switch_cpuid_faulting(bool on)
+{
+ if (on)
+ msr_set_bit(MSR_MISC_FEATURES_ENABLES, 0);
+ else
+ msr_clear_bit(MSR_MISC_FEATURES_ENABLES, 0);
+}
+
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ switch_cpuid_faulting(true);
+ }
+ preempt_enable();
+}
+
+static void enable_cpuid(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ switch_cpuid_faulting(false);
+ }
+ preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+ return test_thread_flag(TIF_NOCPUID) ? ARCH_CPUID_SIGSEGV : ARCH_CPUID_ENABLE;
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long val)
+{
+ /* Only disable_cpuid() if it is supported on this hardware. */
+ bool cpuid_fault_supported = static_cpu_has(X86_FEATURE_CPUID_FAULT);
+
+ if (val == ARCH_CPUID_ENABLE)
+ enable_cpuid();
+ else if (val == ARCH_CPUID_SIGSEGV && cpuid_fault_supported)
+ disable_cpuid();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid();
+}
+
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
@@ -211,6 +276,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
update_debugctlmsr(debugctl);
}

+ if (test_tsk_thread_flag(prev_p, TIF_NOCPUID) ^
+ test_tsk_thread_flag(next_p, TIF_NOCPUID)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOCPUID))
+ switch_cpuid_faulting(true);
+ else
+ switch_cpuid_faulting(false);
+ }
+
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -587,5 +661,25 @@ unsigned long get_wchan(struct task_struct *p)

long do_arch_prctl_common(struct task_struct *task, int code, unsigned long arg2)
{
- return -EINVAL;
+ int ret = 0;
+
+ switch (code) {
+ case ARCH_GET_CPUID: {
+ if (arg2 != 0)
+ ret = -EINVAL;
+ else
+ ret = get_cpuid_mode();
+ break;
+ }
+ case ARCH_SET_CPUID: {
+ ret = set_cpuid_mode(task, arg2);
+ break;
+ }
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
}
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f..f7ba1f5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1287,6 +1287,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);

+ arch_setup_new_exec();
perf_event_exec();
__set_task_comm(current, kbasename(bprm->filename), true);

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 45f004e..70aba0d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -145,6 +145,10 @@ static inline void check_object_size(const void *ptr, unsigned long n,
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

+#ifndef arch_setup_new_exec
+static inline void arch_setup_new_exec(void) {}
+#endif
+
#endif /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index a89f80a..3744f28 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -6,7 +6,7 @@ include ../lib.mk

TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
check_initial_reg_state sigreturn ldt_gdt iopl \
- protection_keys
+ protection_keys cpuid-fault
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/cpuid-fault.c b/tools/testing/selftests/x86/cpuid-fault.c
new file mode 100644
index 0000000..6c257a8
--- /dev/null
+++ b/tools/testing/selftests/x86/cpuid-fault.c
@@ -0,0 +1,231 @@
+
+/*
+ * Tests for arch_prctl(ARCH_GET_CPUID, ...) / arch_prctl(ARCH_SET_CPUID, ...)
+ *
+ * Basic test to test behaviour of ARCH_GET_CPUID and ARCH_SET_CPUID
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <cpuid.h>
+#include <err.h>
+#include <errno.h>
+#include <sys/wait.h>
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+const char *cpuid_names[] = {
+ [0] = "[not set]",
+ [ARCH_CPUID_ENABLE] = "ARCH_CPUID_ENABLE",
+ [ARCH_CPUID_SIGSEGV] = "ARCH_CPUID_SIGSEGV",
+};
+
+int arch_prctl(int code, unsigned long arg2)
+{
+ return syscall(SYS_arch_prctl, code, arg2);
+}
+
+int cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
+ unsigned int *edx)
+{
+ return __get_cpuid(0, eax, ebx, ecx, edx);
+}
+
+int do_child_exec_test(int eax, int ebx, int ecx, int edx)
+{
+ int cpuid_val = 0, child = 0, status = 0;
+
+ printf("arch_prctl(ARCH_GET_CPUID); ");
+
+ cpuid_val = arch_prctl(ARCH_GET_CPUID, 0);
+ if (cpuid_val < 0)
+ errx(1, "ARCH_GET_CPUID fails now, but not before?");
+
+ printf("cpuid_val == %s\n", cpuid_names[cpuid_val]);
+ if (cpuid_val != ARCH_CPUID_SIGSEGV)
+ errx(1, "How did cpuid get re-enabled on fork?");
+
+ if ((child = fork()) == 0) {
+ cpuid_val = arch_prctl(ARCH_GET_CPUID, 0);
+ if (cpuid_val < 0)
+ errx(1, "ARCH_GET_CPUID fails now, but not before?");
+
+ printf("cpuid_val == %s\n", cpuid_names[cpuid_val]);
+ if (cpuid_val != ARCH_CPUID_SIGSEGV)
+ errx(1, "How did cpuid get re-enabled on fork?");
+
+ printf("exec\n");
+ execl("/proc/self/exe", "cpuid-fault", "-early-return", NULL);
+ }
+
+ if (child != waitpid(child, &status, 0))
+ errx(1, "waitpid failed!?");
+
+ if (WEXITSTATUS(status) != 0)
+ errx(1, "Execed child exited abnormally");
+
+ return 0;
+}
+
+int child_received_signal;
+
+void child_sigsegv_cb(int sig)
+{
+ int cpuid_val = 0;
+
+ child_received_signal = 1;
+ printf("[ SIG_SEGV ]\n");
+ printf("arch_prctl(ARCH_GET_CPUID); ");
+
+ cpuid_val = arch_prctl(ARCH_GET_CPUID, 0);
+ if (cpuid_val < 0)
+ errx(1, "ARCH_GET_CPUID fails now, but not before?");
+
+ printf("cpuid_val == %s\n", cpuid_names[cpuid_val]);
+ printf("arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_ENABLE)\n");
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_ENABLE) != 0)
+ exit(errno);
+
+ printf("cpuid() == ");
+}
+
+int do_child_test(void)
+{
+ unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+
+ signal(SIGSEGV, child_sigsegv_cb);
+
+ /* the child starts out with cpuid disabled, the signal handler
+ * attempts to enable and retry
+ */
+ printf("cpuid() == ");
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("{%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ return child_received_signal ? 0 : 42;
+}
+
+int signal_count;
+
+void sigsegv_cb(int sig)
+{
+ int cpuid_val = 0;
+
+ signal_count++;
+ printf("[ SIG_SEGV ]\n");
+ printf("arch_prctl(ARCH_GET_CPUID); ");
+
+ cpuid_val = arch_prctl(ARCH_GET_CPUID, 0);
+ if (cpuid_val < 0)
+ errx(1, "ARCH_GET_CPUID fails now, but not before?");
+
+ printf("cpuid_val == %s\n", cpuid_names[cpuid_val]);
+ printf("arch_prctl(ARC_SET_CPUID, ARCH_CPUID_ENABLE)\n");
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_ENABLE) != 0)
+ errx(1, "ARCH_SET_CPUID failed!");
+
+ printf("cpuid() == ");
+}
+
+int main(int argc, char **argv)
+{
+ int cpuid_val = 0, child = 0, status = 0;
+ unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+
+ signal(SIGSEGV, sigsegv_cb);
+ setvbuf(stdout, NULL, _IONBF, 0);
+
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("cpuid() == {%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ printf("arch_prctl(ARCH_GET_CPUID); ");
+
+ cpuid_val = arch_prctl(ARCH_GET_CPUID, 0);
+ if (cpuid_val < 0) {
+ if (errno == EINVAL) {
+ printf("ARCH_GET_CPUID is unsupported on this system.\n");
+ fflush(stdout);
+ exit(0); /* no ARCH_GET_CPUID on this system */
+ } else {
+ errx(errno, "ARCH_GET_CPUID failed unexpectedly!");
+ }
+ }
+
+ printf("cpuid_val == %s\n", cpuid_names[cpuid_val]);
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("cpuid() == {%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ printf("arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_ENABLE)\n");
+
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_ENABLE) != 0) {
+ if (errno == EINVAL) {
+ printf("ARCH_SET_CPUID is unsupported on this system.");
+ exit(0); /* no ARCH_SET_CPUID on this system */
+ } else {
+ errx(errno, "ARCH_SET_CPUID failed unexpectedly!");
+ }
+ }
+
+
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("cpuid() == {%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ printf("arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV)\n");
+ fflush(stdout);
+
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV) == -1)
+ errx(1, "ARCH_SET_CPUID failed!");
+
+ printf("cpuid() == ");
+ eax = ebx = ecx = edx = 0;
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("{%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ printf("arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV)\n");
+
+ if (signal_count != 1)
+ errx(1, "cpuid didn't fault!");
+
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV) == -1)
+ errx(1, "ARCH_SET_CPUID failed!");
+
+ if (argc > 1)
+ exit(0); /* Don't run the whole test again if we were execed */
+
+ printf("do_child_test\n");
+ if ((child = fork()) == 0)
+ return do_child_test();
+
+ if (child != waitpid(child, &status, 0))
+ errx(1, "waitpid failed!?");
+
+ if (WEXITSTATUS(status) != 0)
+ errx(1, "Child exited abnormally!");
+
+ /* The child enabling cpuid should not have affected us */
+ printf("cpuid() == ");
+ eax = ebx = ecx = edx = 0;
+ cpuid(&eax, &ebx, &ecx, &edx);
+ printf("{%x, %x, %x, %x}\n", eax, ebx, ecx, edx);
+ printf("arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV)\n");
+
+ if (signal_count != 2)
+ errx(1, "cpuid didn't fault!");
+
+ if (arch_prctl(ARCH_SET_CPUID, ARCH_CPUID_SIGSEGV) == -1)
+ errx(1, "ARCH_SET_CPUID failed!");
+
+ /* Our ARCH_CPUID_SIGSEGV should not propagate through exec */
+ printf("do_child_exec_test\n");
+ fflush(stdout);
+ if ((child = fork()) == 0)
+ return do_child_exec_test(eax, ebx, ecx, edx);
+
+ if (child != waitpid(child, &status, 0))
+ errx(1, "waitpid failed!?");
+
+ if (WEXITSTATUS(status) != 0)
+ errx(1, "Child exited abnormally!");
+
+ printf("All tests passed!\n");
+ exit(EXIT_SUCCESS);
+}
--
2.10.1

2016-10-19 02:05:13

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 5/6] x86/cpufeature: Detect CPUID faulting support

Intel supports faulting on the CPUID instruction beginning with Ivy Bridge.
When enabled, the processor will fault on attempts to execute the CPUID
instruction with CPL>0. This will allow a ptracer to emulate the CPUID
instruction.

Bit 31 of MSR_PLATFORM_INFO advertises support for this feature. It is
documented in detail in Section 2.3.2 of
http://www.intel.com/content/dam/www/public/us/en/documents/application-notes/virtualization-technology-flexmigration-application-note.pdf

Detect support for this feature and expose it as X86_FEATURE_CPUID_FAULT.

Signed-off-by: Kyle Huey <[email protected]>
Reviewed-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/kernel/cpu/scattered.c | 13 +++++++++++++
3 files changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1188bc8..a918f98 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,7 @@

#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 4) /* Intel CPUID faulting */

#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 56f4c66..39aa563 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -41,6 +41,7 @@
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
#define MSR_PLATFORM_INFO 0x000000ce
+#define PLATINFO_CPUID_FAULT (1UL << 31)

#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 8cb57df..7901481 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -24,6 +24,16 @@ enum cpuid_regs {
CR_EBX
};

+static bool supports_cpuid_faulting(void)
+{
+ unsigned int lo, hi;
+
+ if (rdmsr_safe(MSR_PLATFORM_INFO, &lo, &hi))
+ return false;
+
+ return lo & PLATINFO_CPUID_FAULT;
+}
+
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{
u32 max_level;
@@ -54,4 +64,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
+
+ if (supports_cpuid_faulting())
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
}
--
2.10.1

2016-10-19 02:05:20

by Kyle Huey

[permalink] [raw]
Subject: [PATCH v7 4/6] x86/syscalls/32: Wire up arch_prctl on x86-32

Hook up arch_prctl to call do_arch_prctl_common on x86-32, and in 32 bit
compat mode on x86-64. This allows us to have arch_prctls that are not
specific to 64 bits.

On UML, simply stub out this syscall.

Signed-off-by: Kyle Huey <[email protected]>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/kernel/process_32.c | 7 +++++++
arch/x86/kernel/process_64.c | 7 +++++++
arch/x86/um/Makefile | 2 +-
arch/x86/um/syscalls_32.c | 7 +++++++
include/linux/compat.h | 2 ++
6 files changed, 25 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/um/syscalls_32.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ff6ef7b..804456f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
382 i386 pkey_free sys_pkey_free
#383 i386 pkey_get sys_pkey_get
#384 i386 pkey_set sys_pkey_set
+385 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8e..95d3adc 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -35,6 +35,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/ldt.h>
@@ -54,6 +55,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
+#include <asm/proto.h>

void __show_regs(struct pt_regs *regs, int all)
{
@@ -301,3 +303,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)

return prev_p;
}
+
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, code, arg2);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bf75d26..3a2a84d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -626,6 +626,13 @@ SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
return ret;
}

+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, code, arg2);
+}
+#endif
+
unsigned long KSTK_ESP(struct task_struct *task)
{
return task_pt_regs(task)->sp;
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index e7e7055..69f0827 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \

ifeq ($(CONFIG_X86_32),y)

-obj-y += checksum_32.o
+obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o

subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644
index 0000000..ccf0598
--- /dev/null
+++ b/arch/x86/um/syscalls_32.c
@@ -0,0 +1,7 @@
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, code, unsigned long, arg2)
+{
+ return -EINVAL;
+}
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6360939..500cdb3 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -721,6 +721,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
int, const char __user *);

+asmlinkage long compat_sys_arch_prctl(int, unsigned long);
+
/*
* For most but not all architectures, "am I in a compat syscall?" and
* "am I a compat task?" are the same question. For architectures on which
--
2.10.1

2016-10-25 05:31:03

by Kyle Huey

[permalink] [raw]
Subject: Re: [PATCH v7 0/6] x86/arch_prctl Add ARCH_[GET|SET]_CPUID for controlling the CPUID instruction

On Tue, Oct 18, 2016 at 7:03 PM, Kyle Huey <[email protected]> wrote:
> rr (http://rr-project.org/), a userspace record-and-replay reverse-
> execution debugger, would like to trap and emulate the CPUID instruction.
> This would allow us to a) mask away certain hardware features that rr does
> not support (e.g. RDRAND) and b) enable trace portability across machines
> by providing constant results.
>
> Newer Intel CPUs (Ivy Bridge and later) can fault when CPUID is executed at
> CPL > 0. Expose this capability to userspace as a new pair of arch_prctls,
> ARCH_GET_CPUID and ARCH_SET_CPUID, with two values, ARCH_CPUID_ENABLE and
> ARCH_CPUID_SIGSEGV.
>
> v6 did not get any feedback, so I included the changes since v5.
>
> Since v6:
> - Rebased to 4.9-rc1 (primarily over the new VDSO arch_prctls and the new pkey
> entries in the syscall table)
>
> Since v5:
>
> Patch 3:
> - do_arch_prctl is again do_arch_prctl_common
>
> Patch 6:
> - Added static qualifiers on get/set_cpuid_mode
> - Braced ifs in disable/enable_cpuid
> - Renamed arch_post_exec to arch_setup_new_exec
>

A friendly ping. Has anybody had an opportunity to look at this patch
series again? Any feedback, positive or negative, would be greatly
appreciated.

- Kyle

2016-10-25 16:51:41

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v7 0/6] x86/arch_prctl Add ARCH_[GET|SET]_CPUID for controlling the CPUID instruction

On Mon, 24 Oct 2016, Kyle Huey wrote:
>
> A friendly ping. Has anybody had an opportunity to look at this patch
> series again? Any feedback, positive or negative, would be greatly
> appreciated.

It's on my list, but I'm stuck in bug hunting at the moment.

2016-10-27 14:06:53

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v7 5/6] x86/cpufeature: Detect CPUID faulting support

On Tue, 18 Oct 2016, Kyle Huey wrote:
>
> +static bool supports_cpuid_faulting(void)
> +{
> + unsigned int lo, hi;
> +
> + if (rdmsr_safe(MSR_PLATFORM_INFO, &lo, &hi))
> + return false;
> +
> + return lo & PLATINFO_CPUID_FAULT;
> +}
> +
> void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
> {
> u32 max_level;
> @@ -54,4 +64,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
> if (regs[cb->reg] & (1 << cb->bit))
> set_cpu_cap(c, cb->feature);
> }
> +
> + if (supports_cpuid_faulting())
> + set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
> }

Instead of specific magic for this feature I'd rather like to see something
like the completely untested patch below. That allows us to add MSR based
features trivially in the future.

Thanks,

tglx

8<------------------------

--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -24,11 +24,18 @@ enum cpuid_regs {
CR_EBX
};

+struct msr_bit {
+ u16 feature;
+ u16 msr;
+ u8 bit;
+};
+
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{
- u32 max_level;
- u32 regs[4];
const struct cpuid_bit *cb;
+ const struct msr_bit *mb;
+ u32 max_level, regs[4];
+ u64 msrval;

static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
@@ -42,6 +49,11 @@ void init_scattered_cpuid_features(struc
{ 0, 0, 0, 0, 0 }
};

+ static const struct msr_bit msr_bits[] = {
+ { X86_FEATURE_CPUID_FAULT, MSR_PLATFORM_INFO, 31 },
+ { 0, 0, 0 }
+ };
+
for (cb = cpuid_bits; cb->feature; cb++) {

/* Verify that the level is valid */
@@ -56,4 +68,12 @@ void init_scattered_cpuid_features(struc
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
+
+ for (mb = msr_bits; mb->feature; mb++) {
+ if (rdmsrl_safe(mb->msr, &msrval))
+ continue;
+ if (msrval & (1ULL << mb->bit))
+ set_cpu_cap(c, mb->feature);
+ }
+
}

2016-10-27 15:04:42

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v7 6/6] x86/arch_prctl: Add ARCH_[GET|SET]_CPUID

On Tue, 18 Oct 2016, Kyle Huey wrote:
> When supported, this feature is controlled by toggling bit 0 of
> MSR_MISC_FEATURES_ENABLES. It is documented in detail in Section 2.3.2 of
> http://www.intel.com/content/dam/www/public/us/en/documents/application-notes/virtualization-technology-flexmigration-application-note.pdf
>

Bah. I really dislike these document URLs. They often change before a patch
hits Linus tree just because $corps think that restructuring their web
sites every two month is enhancing user experience.

I really wonder if we should start collecting such information in a k.org
wiki or just stick the PDF into the k.org bugzilla so we get a permanent
reference.

<RANT AT INHELL>

Can Intel please get its act together and stop sticking stuff in random
application notes instead of properly documenting it in the SDM?

The SDM Sept 2016 still marks bit 31 of the PLATFORM_INFO MSR as reserved
and the MISC_FEATURE_ENABLES MSR is not mentioned at all. Really useful, NOT!

</RANT>

> +static void switch_cpuid_faulting(bool on)
> +{
> + if (on)
> + msr_set_bit(MSR_MISC_FEATURES_ENABLES, 0);

We really want this '0' to be a proper define and not just a hardcoded
value.

> + else
> + msr_clear_bit(MSR_MISC_FEATURES_ENABLES, 0);
> +}
> +
> +static void disable_cpuid(void)
> +{
> + preempt_disable();
> + if (!test_and_set_thread_flag(TIF_NOCPUID)) {
> + /*
> + * Must flip the CPU state synchronously with
> + * TIF_NOCPUID in the current running context.
> + */
> + switch_cpuid_faulting(true);
> + }
> + preempt_enable();
> +}
> +
> +static void enable_cpuid(void)
> +{
> + preempt_disable();
> + if (test_and_clear_thread_flag(TIF_NOCPUID)) {
> + /*
> + * Must flip the CPU state synchronously with
> + * TIF_NOCPUID in the current running context.
> + */
> + switch_cpuid_faulting(false);
> + }
> + preempt_enable();
> +}
> +
> +static int get_cpuid_mode(void)
> +{
> + return test_thread_flag(TIF_NOCPUID) ? ARCH_CPUID_SIGSEGV : ARCH_CPUID_ENABLE;
> +}
> +
> +static int set_cpuid_mode(struct task_struct *task, unsigned long val)
> +{
> + /* Only disable_cpuid() if it is supported on this hardware. */
> + bool cpuid_fault_supported = static_cpu_has(X86_FEATURE_CPUID_FAULT);

Errm. Why are you restricting this to disable_cpuid()? If that feature is
off then there is no guarantee that MSR_MISC_FEATURES_ENABLES is
accessible. This wants to have:

if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
return -ENODEV;

or some other sensible error code.

> + if (val == ARCH_CPUID_ENABLE)
> + enable_cpuid();
> + else if (val == ARCH_CPUID_SIGSEGV && cpuid_fault_supported)
> + disable_cpuid();
> + else
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +/*
> + * Called immediately after a successful exec.
> + */
> +void arch_setup_new_exec(void)
> +{
> + /* If cpuid was previously disabled for this task, re-enable it. */
> + if (test_thread_flag(TIF_NOCPUID))
> + enable_cpuid();
> +}
> +
> void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
> struct tss_struct *tss)
> {
> @@ -211,6 +276,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
> update_debugctlmsr(debugctl);
> }
>
> + if (test_tsk_thread_flag(prev_p, TIF_NOCPUID) ^
> + test_tsk_thread_flag(next_p, TIF_NOCPUID)) {
> + /* prev and next are different */

Why commenting the obvious? That's clear from the code.

> + if (test_tsk_thread_flag(next_p, TIF_NOCPUID))
> + switch_cpuid_faulting(true);
> + else
> + switch_cpuid_faulting(false);

This is insane. The compiler makes that a conditional jump and then in
switch_cpuid_faulting we get another one. Further switch_cpuid_faulting()
calls into lib/msr which is adding even more overhead.

msr_set/clear_bit() are nice for random driver code, but complete overkill
for the context switch hotpath.

That's just not acceptable for switch_to(). We keep adding cruft and then
wonder why context switches slow down despite machines getting faster.

This can and needs to be done smarter. See untested patch below. The
resulting code has a single conditional jump, which is obviously the check
for a change between prev and next. Everything else is done with straight
linear shift,add,and,rdmsr,wrmsr instructions.

> @@ -587,5 +661,25 @@ unsigned long get_wchan(struct task_struct *p)
>
> long do_arch_prctl_common(struct task_struct *task, int code, unsigned long arg2)
> {
> - return -EINVAL;
> + int ret = 0;
> +
> + switch (code) {
> + case ARCH_GET_CPUID: {
> + if (arg2 != 0)
> + ret = -EINVAL;
> + else
> + ret = get_cpuid_mode();
> + break;
> + }
> + case ARCH_SET_CPUID: {
> + ret = set_cpuid_mode(task, arg2);
> + break;
> + }
> +
> + default:
> + ret = -EINVAL;
> + break;
> + }
> +
> + return ret;

If you try hard, you might come up with a variant which takes even more
lines.

The delta patch below applies on top of this patch and is completely
untested. If you find bugs, you own them :)

Thanks,

tglx

8<-----------------------

--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -54,6 +54,7 @@
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
#define MSR_MISC_FEATURES_ENABLES 0x00000140
+#define CPUID_FAULT_ENABLE (1ULL << 0)

#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -193,12 +193,17 @@ int set_tsc_mode(unsigned int val)
return 0;
}

-static void switch_cpuid_faulting(bool on)
+#define CPUID_FAULT_ON_MASK (~0ULL)
+#define CPUID_FAULT_OFF_MASK (~CPUID_FAULT_ENABLE)
+
+static void cpuid_fault_ctrl(u64 msk)
{
- if (on)
- msr_set_bit(MSR_MISC_FEATURES_ENABLES, 0);
- else
- msr_clear_bit(MSR_MISC_FEATURES_ENABLES, 0);
+ u64 msrval;
+
+ rdmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+ msrval |= CPUID_FAULT_ENABLE;
+ msrval &= msk;
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
@@ -209,7 +214,7 @@ static void disable_cpuid(void)
* Must flip the CPU state synchronously with
* TIF_NOCPUID in the current running context.
*/
- switch_cpuid_faulting(true);
+ cpuid_fault_ctrl(CPUID_FAULT_OFF_MASK);
}
preempt_enable();
}
@@ -222,7 +227,7 @@ static void enable_cpuid(void)
* Must flip the CPU state synchronously with
* TIF_NOCPUID in the current running context.
*/
- switch_cpuid_faulting(false);
+ cpuid_fault_ctrl(CPUID_FAULT_ON_MASK);
}
preempt_enable();
}
@@ -234,16 +239,18 @@ static int get_cpuid_mode(void)

static int set_cpuid_mode(struct task_struct *task, unsigned long val)
{
- /* Only disable_cpuid() if it is supported on this hardware. */
- bool cpuid_fault_supported = static_cpu_has(X86_FEATURE_CPUID_FAULT);
+ if (static_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;

- if (val == ARCH_CPUID_ENABLE)
+ switch (val) {
+ case ARCH_CPUID_ENABLE:
enable_cpuid();
- else if (val == ARCH_CPUID_SIGSEGV && cpuid_fault_supported)
+ break;
+ case ARCH_CPUID_SIGSEGV:
disable_cpuid();
- else
- return -EINVAL;
-
+ break;
+ default: return -EINVAL;
+ }
return 0;
}

@@ -278,11 +285,10 @@ void __switch_to_xtra(struct task_struct

if (test_tsk_thread_flag(prev_p, TIF_NOCPUID) ^
test_tsk_thread_flag(next_p, TIF_NOCPUID)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOCPUID))
- switch_cpuid_faulting(true);
- else
- switch_cpuid_faulting(false);
+ bool set = test_tsk_thread_flag(next_p, TIF_NOCPUID);
+ u64 msk = set ? CPUID_FAULT_ON_MASK : CPUID_FAULT_OFF_MASK;
+
+ cpuid_fault_ctrl(msk);
}

if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
@@ -661,25 +667,11 @@ unsigned long get_wchan(struct task_stru

long do_arch_prctl_common(struct task_struct *task, int code, unsigned long arg2)
{
- int ret = 0;
-
switch (code) {
- case ARCH_GET_CPUID: {
- if (arg2 != 0)
- ret = -EINVAL;
- else
- ret = get_cpuid_mode();
- break;
- }
- case ARCH_SET_CPUID: {
- ret = set_cpuid_mode(task, arg2);
- break;
+ case ARCH_GET_CPUID:
+ return arg2 ? -EINVAL : get_cpuid_mode();
+ case ARCH_SET_CPUID:
+ return set_cpuid_mode(task, arg2);
}
-
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
+ return -EINVAL;
}

2016-10-27 22:35:17

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v7 6/6] x86/arch_prctl: Add ARCH_[GET|SET]_CPUID

On Thu, Oct 27, 2016 at 4:15 AM, Thomas Gleixner <[email protected]> wrote:
> This is insane. The compiler makes that a conditional jump and then in
> switch_cpuid_faulting we get another one. Further switch_cpuid_faulting()
> calls into lib/msr which is adding even more overhead.
>
> msr_set/clear_bit() are nice for random driver code, but complete overkill
> for the context switch hotpath.
>
> That's just not acceptable for switch_to(). We keep adding cruft and then
> wonder why context switches slow down despite machines getting faster.
>
> This can and needs to be done smarter. See untested patch below. The
> resulting code has a single conditional jump, which is obviously the check
> for a change between prev and next. Everything else is done with straight
> linear shift,add,and,rdmsr,wrmsr instructions.
>

...

> #define MSR_IA32_SYSENTER_ESP 0x00000175
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -193,12 +193,17 @@ int set_tsc_mode(unsigned int val)
> return 0;
> }
>
> -static void switch_cpuid_faulting(bool on)
> +#define CPUID_FAULT_ON_MASK (~0ULL)
> +#define CPUID_FAULT_OFF_MASK (~CPUID_FAULT_ENABLE)
> +
> +static void cpuid_fault_ctrl(u64 msk)
> {
> - if (on)
> - msr_set_bit(MSR_MISC_FEATURES_ENABLES, 0);
> - else
> - msr_clear_bit(MSR_MISC_FEATURES_ENABLES, 0);
> + u64 msrval;
> +
> + rdmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
> + msrval |= CPUID_FAULT_ENABLE;
> + msrval &= msk;
> + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
> }

Let's just do this right from day one:

static void set_cpuid_faulting(bool on)
{
u64 msrval;

DEBUG_LOCKS_WARN_ON(!irqs_disabled());

msrval = this_cpu_read(msr_misc_features_enables_shadow);
msrval &= CPUID_FAULT_ENABLE;
msrval |= (on << CPUID_FAULT_ENABLE_BIT);
this_cpu_write(msr_misc_features_enables_shadow, msrval);
wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

RDMSR may be considerably faster than WRMSR, but that doesn't mean it's *fast*.

Obviously this needs some initialization code, but that's fine IMO.

--Andy

2016-10-27 22:42:45

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v7 6/6] x86/arch_prctl: Add ARCH_[GET|SET]_CPUID

On Thu, 27 Oct 2016, Andy Lutomirski wrote:
> On Thu, Oct 27, 2016 at 4:15 AM, Thomas Gleixner <[email protected]> wrote:
> Let's just do this right from day one:
>
> static void set_cpuid_faulting(bool on)
> {
> u64 msrval;
>
> DEBUG_LOCKS_WARN_ON(!irqs_disabled());
>
> msrval = this_cpu_read(msr_misc_features_enables_shadow);
> msrval &= CPUID_FAULT_ENABLE;
> msrval |= (on << CPUID_FAULT_ENABLE_BIT);
> this_cpu_write(msr_misc_features_enables_shadow, msrval);
> wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
> }
>
> RDMSR may be considerably faster than WRMSR, but that doesn't mean it's *fast*.

Good point!