Based on the feedback from [1], it was requested that the first few
patches be made ready as soon as possible. To pave the way for FSGSBASE,
some helper functions and refactoring work are incorporated. Besides
that, the series includes Andy's fix for accurate FS/GS base reads and a
cleanup of the vDSO initialization.
[1] FSGSBASE patch set V2: https://lkml.org/lkml/2018/5/31/686
Andy Lutomirski (1):
x86/fsgsbase/64: Make ptrace read FS/GS base accurately
Chang S. Bae (5):
x86/fsgsbase/64: Introduce FS/GS base helper functions
x86/fsgsbase/64: Use FS/GS base helpers in core dump
x86/fsgsbase/64: Factor out load FS/GS segments from __switch_to
x86/msr: write_rdtscp_aux() to use wrmsr_safe()
x86/vdso: Move out the CPU number store
arch/x86/entry/vdso/vgetcpu.c | 4 +-
arch/x86/entry/vdso/vma.c | 38 +--------
arch/x86/include/asm/elf.h | 6 +-
arch/x86/include/asm/fsgsbase.h | 47 +++++++++++
arch/x86/include/asm/msr.h | 2 +-
arch/x86/include/asm/segment.h | 29 ++++++-
arch/x86/include/asm/vgtod.h | 2 -
arch/x86/kernel/cpu/common.c | 5 ++
arch/x86/kernel/process_64.c | 181 +++++++++++++++++++++++++++++++---------
arch/x86/kernel/ptrace.c | 28 ++-----
arch/x86/kernel/setup_percpu.c | 17 +++-
11 files changed, 250 insertions(+), 109 deletions(-)
create mode 100644 arch/x86/include/asm/fsgsbase.h
--
2.7.4
When the new FSGSBASE instructions are enabled, these reads will
become faster.
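For illustration, a minimal sketch of what such an accelerated read
could look like once the feature is wired up (hypothetical helper,
assuming an assembler that knows the instruction; not part of this
patch):

	/*
	 * Hypothetical sketch: with FSGSBASE enabled, the FS base can be
	 * read with the unprivileged RDFSBASE instruction instead of an
	 * MSR access.
	 */
	static inline unsigned long rdfsbase_sketch(void)
	{
		unsigned long fsbase;

		asm volatile("rdfsbase %0" : "=r" (fsbase));
		return fsbase;
	}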
Based-on-code-from: Andy Lutomirski <[email protected]>
Signed-off-by: Chang S. Bae <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Reviewed-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/elf.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 0d157d2..49acefb 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -10,6 +10,7 @@
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
+#include <asm/fsgsbase.h>
typedef unsigned long elf_greg_t;
@@ -205,7 +206,6 @@ void set_personality_ia32(bool);
#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
- unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -228,8 +228,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
- rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
+ (pr_reg)[21] = read_fsbase(); \
+ (pr_reg)[22] = read_inactive_gsbase(); \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
--
2.7.4
Instead of open-coding it, load_fsgs() cleans up __switch_to() and is
symmetric with the FS/GS segment save. When FSGSBASE is enabled, an
X86_FEATURE_FSGSBASE check will be incorporated.
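A hedged sketch of where that check could go (illustrative only;
load_seg_fsgsbase() is a hypothetical helper and not part of this
patch):

	static __always_inline void load_fsgs_sketch(struct thread_struct *prev,
						     struct thread_struct *next)
	{
		if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
			/* Hypothetical fast path once FSGSBASE support lands. */
			load_seg_fsgsbase(prev, next);
			return;
		}
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}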
Signed-off-by: Chang S. Bae <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Reviewed-by: Andy Lutomirski <[email protected]>
---
arch/x86/kernel/process_64.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e498671..cebf240 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -279,6 +279,15 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+static __always_inline void load_fsgs(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+}
+
static unsigned long task_seg_base(struct task_struct *task,
unsigned short selector)
{
@@ -588,10 +597,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
- load_seg_legacy(prev->fsindex, prev->fsbase,
- next->fsindex, next->fsbase, FS);
- load_seg_legacy(prev->gsindex, prev->gsbase,
- next->gsindex, next->gsbase, GS);
+ load_fsgs(prev, next);
switch_fpu_finish(next_fpu, cpu);
--
2.7.4
Using wrmsr_safe() makes the code a bit simpler by removing some
condition checks.
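A hedged sketch of the effect (hypothetical caller; illustrative
only): since wrmsr_safe() turns a faulting MSR write into an error
return, the caller no longer needs a feature check up front.

	/*
	 * Sketch: a #GP on the write (e.g. no RDTSCP) comes back as an
	 * error instead of faulting the kernel.
	 */
	static void write_tscp_aux_sketch(u32 val)
	{
		if (write_rdtscp_aux(val))
			pr_warn_once("TSC_AUX not writable on this CPU\n");
	}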
Suggested-by: H. Peter Anvin <[email protected]>
Signed-off-by: Chang S. Bae <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/msr.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 04addd6..09bec62 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -317,7 +317,7 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
-#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
+#define write_rdtscp_aux(val) wrmsr_safe(MSR_TSC_AUX, (val), 0)
struct msr *msrs_alloc(void);
void msrs_free(struct msr *msrs);
--
2.7.4
The CPU (and node) number is written, early enough, into the segment
limit of the per-CPU data GDT entry and into the TSC_AUX MSR. The
information is retrieved by vgetcpu in user space and, when FSGSBASE
is enabled, will also be loaded from the paranoid entry path. So the
store is moved out of the vDSO into the CPU initialization path, where
the IST setup is serialized.
The now-redundant segment setup in entry/vdso/vma.c is removed; a
substantial code removal. It eliminates a hotplug notifier, makes a
facility useful to both the kernel and user space unconditionally
available much sooner, and unifies the code with i386. (Thanks to HPA
for suggesting the cleanup.)
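For reference, a hedged sketch of the consumer side (mirroring the
vDSO's __getcpu(); illustrative only): LSL on the per-CPU segment
selector returns the segment limit, which holds the packed value from
make_lsl_tscp().

	static inline unsigned int getcpu_lsl_sketch(void)
	{
		unsigned int p;

		/* The descriptor limit encodes (node << 12) | cpu. */
		asm volatile("lsl %1, %0" : "=r" (p) : "r" (__PER_CPU_SEG));
		return p; /* decode with lsl_tscp_to_cpu()/lsl_tscp_to_node() */
	}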
Signed-off-by: Chang S. Bae <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/entry/vdso/vgetcpu.c | 4 ++--
arch/x86/entry/vdso/vma.c | 38 +-------------------------------------
arch/x86/include/asm/segment.h | 29 +++++++++++++++++++++++++++--
arch/x86/include/asm/vgtod.h | 2 --
arch/x86/kernel/cpu/common.c | 5 +++++
arch/x86/kernel/setup_percpu.c | 17 +++++++++++++----
6 files changed, 48 insertions(+), 47 deletions(-)
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f..3284069 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -18,9 +18,9 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
p = __getcpu();
if (cpu)
- *cpu = p & VGETCPU_CPU_MASK;
+ *cpu = lsl_tscp_to_cpu(p);
if (node)
- *node = p >> 12;
+ *node = lsl_tscp_to_node(p);
return 0;
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556..3f9d43f 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -332,40 +332,6 @@ static __init int vdso_setup(char *s)
return 0;
}
__setup("vdso=", vdso_setup);
-#endif
-
-#ifdef CONFIG_X86_64
-static void vgetcpu_cpu_init(void *arg)
-{
- int cpu = smp_processor_id();
- struct desc_struct d = { };
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /*
- * Store cpu number in limit so that it can be loaded
- * quickly in user space in vgetcpu. (12 bits for the CPU
- * and 8 bits for the node)
- */
- d.limit0 = cpu | ((node & 0xf) << 12);
- d.limit1 = node >> 4;
- d.type = 5; /* RO data, expand down, accessed */
- d.dpl = 3; /* Visible to user code */
- d.s = 1; /* Not a system segment */
- d.p = 1; /* Present */
- d.d = 1; /* 32-bit */
-
- write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
-
-static int vgetcpu_online(unsigned int cpu)
-{
- return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
-}
static int __init init_vdso(void)
{
@@ -375,9 +341,7 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
#endif
- /* notifier priority > KVM */
- return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
- "x86/vdso/vma:online", vgetcpu_online, NULL);
+ return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 8f09012b..596112c 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -187,7 +187,7 @@
#define GDT_ENTRY_TLS_MAX 14
/* Abused to load per CPU data from limit */
-#define GDT_ENTRY_PER_CPU 15
+#define GDT_ENTRY_PERCPU 15
/*
* Number of entries in the GDT table:
@@ -207,7 +207,7 @@
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER32_DS __USER_DS
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
-#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
+#define __PER_CPU_SEG (GDT_ENTRY_PERCPU*8 + 3)
#endif
@@ -225,6 +225,31 @@
#define GDT_ENTRY_TLS_ENTRIES 3
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
+/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
+#define LSL_TSCP_CPU_SIZE 12
+#define LSL_TSCP_CPU_MASK 0xfff
+
+#ifndef __ASSEMBLY__
+
+/* Helper functions to store/load CPU and node numbers */
+
+static inline unsigned long make_lsl_tscp(int cpu, unsigned long node)
+{
+ return ((node << LSL_TSCP_CPU_SIZE) | cpu);
+}
+
+static inline unsigned int lsl_tscp_to_cpu(unsigned long x)
+{
+ return (x & LSL_TSCP_CPU_MASK);
+}
+
+static inline unsigned int lsl_tscp_to_node(unsigned long x)
+{
+ return (x >> LSL_TSCP_CPU_SIZE);
+}
+
+#endif
+
#ifdef __KERNEL__
/*
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index fb856c9..1cc9d30 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -79,8 +79,6 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
#ifdef CONFIG_X86_64
-#define VGETCPU_CPU_MASK 0xfff
-
static inline unsigned int __getcpu(void)
{
unsigned int p;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 38276f5..c7b54f0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1665,6 +1665,11 @@ void cpu_init(void)
wrmsrl(MSR_FS_BASE, 0);
wrmsrl(MSR_KERNEL_GS_BASE, 0);
+#ifdef CONFIG_NUMA
+ write_rdtscp_aux(make_lsl_tscp(cpu, early_cpu_to_node(cpu)));
+#else
+ write_rdtscp_aux(make_lsl_tscp(cpu, 0));
+#endif
barrier();
x86_configure_nx();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ea554f8..e716e94 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -155,12 +155,21 @@ static void __init pcpup_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
-#ifdef CONFIG_X86_32
- struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
- 0xFFFFF);
+#ifdef CONFIG_NUMA
+ unsigned long node = early_cpu_to_node(cpu);
+#else
+ unsigned long node = 0;
+#endif
+ struct desc_struct d = GDT_ENTRY_INIT(0x0, per_cpu_offset(cpu),
+ make_lsl_tscp(cpu, node));
+
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
-#endif
}
void __init setup_per_cpu_areas(void)
--
2.7.4
With the new helpers, FS/GS base access is centralized. Eventually,
when the FSGSBASE instructions are enabled, these accesses will be
faster.
The helpers are used by the ptrace APIs (PTRACE_ARCH_PRCTL,
PTRACE_SETREGS, PTRACE_GETREGS, etc.). The idea is to keep the
FS/GS update mechanism organized.
The "inactive" GS base refers to the base that is backed up at kernel
entry and belongs to the inactive (user) task.
The bug that returns a stale FS/GS base value (when the index is
nonzero) is preserved here and will be fixed by the next patch.
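A hedged usage sketch of the resulting API (hypothetical caller, not
part of the patch):

	/* Move a (stopped) task's GS base via the new helpers. */
	static int set_gsbase_sketch(struct task_struct *task,
				     unsigned long newbase)
	{
		unsigned long old = read_task_gsbase(task);
		int err = write_task_gsbase(task, newbase); /* -EPERM if too large */

		if (!err)
			pr_debug("gsbase: %#lx -> %#lx\n", old, newbase);
		return err;
	}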
Based-on-code-from: Andy Lutomirski <[email protected]>
Signed-off-by: Chang S. Bae <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/fsgsbase.h | 47 ++++++++++++++
arch/x86/kernel/process_64.c | 132 +++++++++++++++++++++++++++++-----------
arch/x86/kernel/ptrace.c | 28 +++------
3 files changed, 153 insertions(+), 54 deletions(-)
create mode 100644 arch/x86/include/asm/fsgsbase.h
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
new file mode 100644
index 0000000..f00a8a6
--- /dev/null
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_FSGSBASE_H
+#define _ASM_FSGSBASE_H 1
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_64
+
+#include <asm/msr-index.h>
+
+/*
+ * Read/write a task's fsbase or gsbase. This returns the value that
+ * the FS/GS base would have (if the task were to be resumed). These
+ * work on current or on a different non-running task.
+ */
+unsigned long read_task_fsbase(struct task_struct *task);
+unsigned long read_task_gsbase(struct task_struct *task);
+int write_task_fsbase(struct task_struct *task, unsigned long fsbase);
+int write_task_gsbase(struct task_struct *task, unsigned long gsbase);
+
+/* Helper functions for reading/writing FS/GS base */
+
+static inline unsigned long read_fsbase(void)
+{
+ unsigned long fsbase;
+
+ rdmsrl(MSR_FS_BASE, fsbase);
+ return fsbase;
+}
+
+void write_fsbase(unsigned long fsbase);
+
+static inline unsigned long read_inactive_gsbase(void)
+{
+ unsigned long gsbase;
+
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ return gsbase;
+}
+
+void write_inactive_gsbase(unsigned long gsbase);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_FSGSBASE_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 12bb445..ace0158 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -54,6 +54,7 @@
#include <asm/vdso.h>
#include <asm/intel_rdt_sched.h>
#include <asm/unistd.h>
+#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
@@ -278,6 +279,94 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+void write_fsbase(unsigned long fsbase)
+{
+ /* set the selector to 0 to not confuse __switch_to */
+ loadseg(FS, 0);
+ wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+void write_inactive_gsbase(unsigned long gsbase)
+{
+ /* set the selector to 0 to not confuse __switch_to */
+ loadseg(GS, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+}
+
+unsigned long read_task_fsbase(struct task_struct *task)
+{
+ unsigned long fsbase;
+
+ if (task == current) {
+ fsbase = read_fsbase();
+ } else {
+ /*
+ * XXX: This will not behave as expected if called
+ * if fsindex != 0. This preserves an existing bug
+ * that will be fixed.
+ */
+ fsbase = task->thread.fsbase;
+ }
+
+ return fsbase;
+}
+
+unsigned long read_task_gsbase(struct task_struct *task)
+{
+ unsigned long gsbase;
+
+ if (task == current) {
+ gsbase = read_inactive_gsbase();
+ } else {
+ /*
+ * XXX: This will not behave as expected if called
+ * if gsindex != 0. Same bug preservation as above
+ * read_task_fsbase.
+ */
+ gsbase = task->thread.gsbase;
+ }
+
+ return gsbase;
+}
+
+int write_task_fsbase(struct task_struct *task, unsigned long fsbase)
+{
+ int cpu;
+
+ /*
+ * Not strictly needed for fs, but do it for symmetry
+ * with gs
+ */
+ if (unlikely(fsbase >= TASK_SIZE_MAX))
+ return -EPERM;
+
+ cpu = get_cpu();
+ task->thread.fsbase = fsbase;
+ if (task == current)
+ write_fsbase(fsbase);
+ task->thread.fsindex = 0;
+ put_cpu();
+
+ return 0;
+}
+
+int write_task_gsbase(struct task_struct *task, unsigned long gsbase)
+{
+ int cpu;
+
+ if (unlikely(gsbase >= TASK_SIZE_MAX))
+ return -EPERM;
+
+ cpu = get_cpu();
+ task->thread.gsbase = gsbase;
+ if (task == current)
+ write_inactive_gsbase(gsbase);
+ task->thread.gsindex = 0;
+ put_cpu();
+
+ return 0;
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -618,54 +707,27 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
- int doit = task == current;
- int cpu;
switch (option) {
- case ARCH_SET_GS:
- if (arg2 >= TASK_SIZE_MAX)
- return -EPERM;
- cpu = get_cpu();
- task->thread.gsindex = 0;
- task->thread.gsbase = arg2;
- if (doit) {
- load_gs_index(0);
- ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
- }
- put_cpu();
+ case ARCH_SET_GS: {
+ ret = write_task_gsbase(task, arg2);
break;
- case ARCH_SET_FS:
- /* Not strictly needed for fs, but do it for symmetry
- with gs */
- if (arg2 >= TASK_SIZE_MAX)
- return -EPERM;
- cpu = get_cpu();
- task->thread.fsindex = 0;
- task->thread.fsbase = arg2;
- if (doit) {
- /* set the selector to 0 to not confuse __switch_to */
- loadsegment(fs, 0);
- ret = wrmsrl_safe(MSR_FS_BASE, arg2);
- }
- put_cpu();
+ }
+ case ARCH_SET_FS: {
+ ret = write_task_fsbase(task, arg2);
break;
+ }
case ARCH_GET_FS: {
unsigned long base;
- if (doit)
- rdmsrl(MSR_FS_BASE, base);
- else
- base = task->thread.fsbase;
+ base = read_task_fsbase(task);
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
unsigned long base;
- if (doit)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
- base = task->thread.gsbase;
+ base = read_task_gsbase(task);
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index ed5c4cd..b2f0beb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -39,6 +39,7 @@
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
#include <asm/syscall.h>
+#include <asm/fsgsbase.h>
#include "tls.h"
@@ -396,12 +397,11 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
/*
- * When changing the segment base, use do_arch_prctl_64
- * to set either thread.fs or thread.fsindex and the
- * corresponding GDT slot.
+ * When changing the FS base, use the same
+ * mechanism as for do_arch_prctl_64
*/
if (child->thread.fsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ return write_task_fsbase(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
/*
@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
if (child->thread.gsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ return write_task_gsbase(child, value);
return 0;
#endif
}
@@ -434,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return get_flags(task);
#ifdef CONFIG_X86_64
- case offsetof(struct user_regs_struct, fs_base): {
- /*
- * XXX: This will not behave as expected if called on
- * current or if fsindex != 0.
- */
- return task->thread.fsbase;
- }
- case offsetof(struct user_regs_struct, gs_base): {
- /*
- * XXX: This will not behave as expected if called on
- * current or if fsindex != 0.
- */
- return task->thread.gsbase;
- }
+ case offsetof(struct user_regs_struct, fs_base):
+ return read_task_fsbase(task);
+ case offsetof(struct user_regs_struct, gs_base):
+ return read_task_gsbase(task);
#endif
}
--
2.7.4
From: Andy Lutomirski <[email protected]>
ptrace can read the FS/GS base using the register access API
(PTRACE_PEEKUSER, etc.) or PTRACE_ARCH_PRCTL. Make both of these
mechanisms return the actual FS/GS base.
This will improve debuggability by providing the correct information
to the ptracer (GDB etc.).
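For context, a hedged user-space sketch of the PEEKUSER path whose
result this patch makes accurate (illustrative only):

	#include <stddef.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	/* Read a stopped tracee's FS base; now returns the real base. */
	static long peek_fs_base(pid_t pid)
	{
		return ptrace(PTRACE_PEEKUSER, pid,
			      (void *)offsetof(struct user_regs_struct, fs_base),
			      NULL);
	}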
Signed-off-by: Andy Lutomirski <[email protected]>
[chang: Rebase and revise patch description]
Signed-off-by: Chang S. Bae <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/kernel/process_64.c | 67 +++++++++++++++++++++++++++++++++-----------
1 file changed, 51 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ace0158..e498671 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -279,6 +279,49 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+static unsigned long task_seg_base(struct task_struct *task,
+ unsigned short selector)
+{
+ unsigned short idx = selector >> 3;
+ unsigned long base;
+
+ if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+ if (unlikely(idx >= GDT_ENTRIES))
+ return 0;
+
+ /*
+ * There are no user segments in the GDT with nonzero bases
+ * other than the TLS segments.
+ */
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return 0;
+
+ idx -= GDT_ENTRY_TLS_MIN;
+ base = get_desc_base(&task->thread.tls_array[idx]);
+ } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If performance here mattered, we could protect the LDT
+ * with RCU. This is a slow path, though, so we can just
+ * take the mutex.
+ */
+ mutex_lock(&task->mm->context.lock);
+ ldt = task->mm->context.ldt;
+ if (unlikely(idx >= ldt->nr_entries))
+ base = 0;
+ else
+ base = get_desc_base(ldt->entries + idx);
+ mutex_unlock(&task->mm->context.lock);
+#else
+ base = 0;
+#endif
+ }
+
+ return base;
+}
+
void write_fsbase(unsigned long fsbase)
{
/* set the selector to 0 to not confuse __switch_to */
@@ -297,16 +340,12 @@ unsigned long read_task_fsbase(struct task_struct *task)
{
unsigned long fsbase;
- if (task == current) {
+ if (task == current)
fsbase = read_fsbase();
- } else {
- /*
- * XXX: This will not behave as expected if called
- * if fsindex != 0. This preserves an existing bug
- * that will be fixed.
- */
+ else if (task->thread.fsindex == 0)
fsbase = task->thread.fsbase;
- }
+ else
+ fsbase = task_seg_base(task, task->thread.fsindex);
return fsbase;
}
@@ -315,16 +354,12 @@ unsigned long read_task_gsbase(struct task_struct *task)
{
unsigned long gsbase;
- if (task == current) {
+ if (task == current)
gsbase = read_inactive_gsbase();
- } else {
- /*
- * XXX: This will not behave as expected if called
- * if gsindex != 0. Same bug preservation as above
- * read_task_fsbase.
- */
+ else if (task->thread.gsindex == 0)
gsbase = task->thread.gsbase;
- }
+ else
+ gsbase = task_seg_base(task, task->thread.gsindex);
return gsbase;
}
--
2.7.4
On 06/04/2018 10:24 PM, Chang S. Bae wrote:
> The CPU (and node) number is written, early enough, into the segment
> limit of the per-CPU data GDT entry and into the TSC_AUX MSR. The
> information is retrieved by vgetcpu in user space and, when FSGSBASE
> is enabled, will also be loaded from the paranoid entry path. So the
> store is moved out of the vDSO into the CPU initialization path, where
> the IST setup is serialized.
>
> The now-redundant segment setup in entry/vdso/vma.c is removed; a
> substantial code removal. It eliminates a hotplug notifier, makes a
> facility useful to both the kernel and user space unconditionally
> available much sooner, and unifies the code with i386. (Thanks to HPA
> for suggesting the cleanup.)
>
> Signed-off-by: Chang S. Bae <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> ---
> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index ea554f8..e716e94 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -155,12 +155,21 @@ static void __init pcpup_populate_pte(unsigned long addr)
>
> static inline void setup_percpu_segment(int cpu)
> {
> -#ifdef CONFIG_X86_32
> - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
> - 0xFFFFF);
> +#ifdef CONFIG_NUMA
> + unsigned long node = early_cpu_to_node(cpu);
> +#else
> + unsigned long node = 0;
> +#endif
> + struct desc_struct d = GDT_ENTRY_INIT(0x0, per_cpu_offset(cpu),
> + make_lsl_tscp(cpu, node));
> +
> + d.type = 5; /* RO data, expand down, accessed */
> + d.dpl = 3; /* Visible to user code */
> + d.s = 1; /* Not a system segment */
> + d.p = 1; /* Present */
> + d.d = 1; /* 32-bit */
>
> write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
> -#endif
> }
This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
is a reason why the lsl based method is X86-64 only.
-Mika
>
> void __init setup_per_cpu_areas(void)
>
>> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
>> index ea554f8..e716e94 100644
>> --- a/arch/x86/kernel/setup_percpu.c
>> +++ b/arch/x86/kernel/setup_percpu.c
>> @@ -155,12 +155,21 @@ static void __init pcpup_populate_pte(unsigned long addr)
>>
>> static inline void setup_percpu_segment(int cpu)
>> {
>> -#ifdef CONFIG_X86_32
>> - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
>> - 0xFFFFF);
>> +#ifdef CONFIG_NUMA
>> + unsigned long node = early_cpu_to_node(cpu);
>> +#else
>> + unsigned long node = 0;
>> +#endif
>> + struct desc_struct d = GDT_ENTRY_INIT(0x0, per_cpu_offset(cpu),
>> + make_lsl_tscp(cpu, node));
>> +
>> + d.type = 5; /* RO data, expand down, accessed */
>> + d.dpl = 3; /* Visible to user code */
>> + d.s = 1; /* Not a system segment */
>> + d.p = 1; /* Present */
>> + d.d = 1; /* 32-bit */
>>
>> write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
>> -#endif
>> }
> This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
> is a reason why the lsl based method is X86-64 only.
The limit will be consumed in X86-64 only, while the unification with i386 was suggested for a
different reason.
Thanks,
Chang
On 06/05/2018 07:44 AM, Bae, Chang Seok wrote:
>>> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
>>> index ea554f8..e716e94 100644
>>> --- a/arch/x86/kernel/setup_percpu.c
>>> +++ b/arch/x86/kernel/setup_percpu.c
>>> @@ -155,12 +155,21 @@ static void __init pcpup_populate_pte(unsigned long addr)
>>>
>>> static inline void setup_percpu_segment(int cpu)
>>> {
>>> -#ifdef CONFIG_X86_32
>>> - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
>>> - 0xFFFFF);
>>> +#ifdef CONFIG_NUMA
>>> + unsigned long node = early_cpu_to_node(cpu);
>>> +#else
>>> + unsigned long node = 0;
>>> +#endif
>>> + struct desc_struct d = GDT_ENTRY_INIT(0x0, per_cpu_offset(cpu),
>>> + make_lsl_tscp(cpu, node));
>>> +
>>> + d.type = 5; /* RO data, expand down, accessed */
>>> + d.dpl = 3; /* Visible to user code */
>>> + d.s = 1; /* Not a system segment */
>>> + d.p = 1; /* Present */
>>> + d.d = 1; /* 32-bit */
>>>
>>> write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
>>> -#endif
>>> }
>
>> This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
>> is a reason why the lsl based method is X86-64 only.
>
> The limit will be consumed in X86-64 only, while the unification with i386 was suggested for a
> different reason.
>
> Thanks,
> Chang
>
The unification affects i386, and the limit is consumed by the processor on fs: accesses.
The limit was 0xFFFFF before; now it depends on the cpu/node, so accesses on low-numbered
CPUs are likely to fault.
--Mika
On 06/04/18 20:57, Mika Penttilä wrote:
>
> This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
> is a reason why the lsl based method is X86-64 only.
>
<thinks out loud>
Why does that matter in any shape, way, or form? The LSL instruction
doesn't touch any of the segment registers, it just uses a segment
selector number.
<looks at code>
I see... we have a VERY unfortunate name collision: the x86-64
GDT_ENTRY_PER_CPU and the i386 GDT_ENTRY_PERCPU are in fact two
completely different things, with the latter being the actual percpu
offset used by the kernel.
So yes, this patch is wrong, because the naming of the x86-64 segment is
insane, especially in the proximity of the i386 one -- it should be
something like GDT_ENTRY_CPU_NUMBER.
Unfortunately we probably can't use the same GDT entry on x86-32 and
x86-64, because entry 15 (selector 0x7b) is USER_DS, which is something
we really don't want to screw with. This means i386 programs that
execute LSL directly for whatever reason will have to understand the
difference, but most of the other segment numbers differ as well,
including user space %cs (USER_CS/USER32_CS) and %ds/%es/%ss (USER_DS).
Perhaps we could bump down segments 23-28 by one and put it as 23, that
way the CPU_NUMBER segment would always be at %ss+80 for the default
(flat, initial) user space %ss. (We want to specify using %ss rather
than %ds, because it is less likely to be changed and because 64 bits,
too, have %ss defined, but not %ds.)
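As a purely illustrative sketch of that invariant (assuming the
initial flat user %ss; hypothetical helper):

	static inline unsigned short cpu_number_selector_sketch(void)
	{
		unsigned short ss;

		asm("movw %%ss, %0" : "=r" (ss));
		return ss + 80;	/* 10 GDT entries above the flat user %ss */
	}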
<action item>
Rename the x86-64 segment to CPU_NUMBER, fixing the naming conflict.
Add 1 to GDT entry numbers 23-28 for i386 (all of these are
kernel-internal segments and so have no impact on user space).
Add i386 CPU_NUMBER equivalent to x86-64 at GDT entry 23.
Document the above relationship between segments.
OK, everyone?
-hpa
On 06/05/2018 08:36 AM, H. Peter Anvin wrote:
> On 06/04/18 20:57, Mika Penttilä wrote:
>>
>> This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
>> is a reason why the lsl based method is X86-64 only.
>>
>
> <thinks out loud>
>
> Why does that matter in any shape, way, or form? The LSL instruction
> doesn't touch any of the segment registers, it just uses a segment
> selector number.
>
> <looks at code>
>
> I see... we have a VERY unfortunate name collision: the x86-64
> GDT_ENTRY_PER_CPU and the i386 GDT_ENTRY_PERCPU are in fact two
> completely different things, with the latter being the actual percpu
> offset used by the kernel.
>
> So yes, this patch is wrong, because the naming of the x86-64 segment is
> insane, especially in the proximity of the i386 one -- it should be
> something like GDT_ENTRY_CPU_NUMBER.
>
> Unfortunately we probably can't use the same GDT entry on x86-32 and
> x86-64, because entry 15 (selector 0x7b) is USER_DS, which is something
> we really don't want to screw with. This means i386 programs that
> execute LSL directly for whatever reason will have to understand the
> difference, but most of the other segment numbers differ as well,
> including user space %cs (USER_CS/USER32_CS) and %ds/%es/%ss (USER_DS).
> Perhaps we could bump down segments 23-28 by one and put it as 23, that
> way the CPU_NUMBER segment would always be at %ss+80 for the default
> (flat, initial) user space %ss. (We want to specify using %ss rather
> than %ds, because it is less likely to be changed and because 64 bits,
> too, have %ss defined, but not %ds.)
>
> <action item>
>
> Rename the x86-64 segment to CPU_NUMBER, fixing the naming conflict.
> Add 1 to GDT entry numbers 23-28 for i386 (all of these are
> kernel-internal segments and so have no impact on user space).
> Add i386 CPU_NUMBER equivalent to x86-64 at GDT entry 23.
> Document the above relationship between segments.
>
> OK, everyone?
>
> -hpa
>
Yes, GDT_ENTRY_PER_CPU and GDT_ENTRY_PERCPU meaning two totally different things is really
confusing; the proposal seems ok to me!
--Mika
On Mon, Jun 4, 2018 at 10:36 PM H. Peter Anvin <[email protected]> wrote:
>
> On 06/04/18 20:57, Mika Penttilä wrote:
> >
> > This won't work on X86-32 because it actually uses the segment limit with fs: access. So there
> > is a reason why the lsl based method is X86-64 only.
> >
>
> <thinks out loud>
>
> Why does that matter in any shape, way, or form? The LSL instruction
> doesn't touch any of the segment registers, it just uses a segment
> selector number.
>
> <looks at code>
>
> I see... we have a VERY unfortunate name collision: the x86-64
> GDT_ENTRY_PER_CPU and the i386 GDT_ENTRY_PERCPU are in fact two
> completely different things, with the latter being the actual percpu
> offset used by the kernel.
>
> So yes, this patch is wrong, because the naming of the x86-64 segment is
> insane, especially in the proximity of the i386 one -- it should be
> something like GDT_ENTRY_CPU_NUMBER.
>
> Unfortunately we probably can't use the same GDT entry on x86-32 and
> x86-64, because entry 15 (selector 0x7b) is USER_DS, which is something
> we really don't want to screw with. This means i386 programs that
> execute LSL directly for whatever reason will have to understand the
> difference, but most of the other segment numbers differ as well,
> including user space %cs (USER_CS/USER32_CS) and %ds/%es/%ss (USER_DS).
> Perhaps we could bump down segments 23-28 by one and put it as 23, that
> way the CPU_NUMBER segment would always be at %ss+80 for the default
> (flat, initial) user space %ss. (We want to specify using %ss rather
> than %ds, because it is less likely to be changed and because 64 bits,
> too, have %ss defined, but not %ds.)
>
> <action item>
>
> Rename the x86-64 segment to CPU_NUMBER, fixing the naming conflict.
Yes, agreed. Probably as its own patch *before* the rest of this cleanup.
> Add 1 to GDT entry numbers 23-28 for i386 (all of these are
> kernel-internal segments and so have no impact on user space).
> Add i386 CPU_NUMBER equivalent to x86-64 at GDT entry 23.
> Document the above relationship between segments.
Sure, but also as a standalone patch, please.
>
> OK, everyone?
>
> -hpa
FYI, we noticed the following commit (built with gcc-4.9):
commit: ab1bcc442070315bd0ce963331d5bb93d5c5476e ("x86/vdso: Move out the CPU number store")
url: https://github.com/0day-ci/linux/commits/Chang-S-Bae/x86-fsgsbase-64-Introduce-FS-GS-base-helper-functions/20180605-095329
in testcase: boot
on test machine: qemu-system-i386 -enable-kvm -cpu Haswell,+smep,+smap -m 360M
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+-------------------------------+------------+------------+
| | 053d1414b3 | ab1bcc4420 |
+-------------------------------+------------+------------+
| boot_successes | 8 | 0 |
| boot_failures | 0 | 8 |
| BUG:kernel_hang_in_boot_stage | 0 | 8 |
+-------------------------------+------------+------------+
[ 0.000000] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
[ 0.000000] setup_percpu: NR_CPUS:8 nr_cpumask_bits:8 nr_cpu_ids:1 nr_node_ids:1
[ 0.000000] percpu: Embedded 342 pages/cpu @(ptrval) s1371088 r0 d29744 u1400832
[ 0.000000] pcpu-alloc: s1371088 r0 d29744 u1400832 alloc=342*4096
[ 0.000000] pcpu-alloc: [0] 0
BUG: kernel hang in boot stage
Linux version 4.17.0-rc3-00290-gab1bcc4 #1
Command line: ip=::::vm-vp-quantal-i386-50::dhcp root=/dev/ram0 user=lkp job=/lkp/scheduled/vm-vp-quantal-i386-50/boot-1-quantal-core-i386.cgz-ab1bcc442070315bd0ce963331d5bb93d5c5476e-20180610-7731-3wa87x-0.yaml ARCH=i386 kconfig=i386-randconfig-h1-06101053 branch=linux-devel/devel-catchup-201806101444 commit=ab1bcc442070315bd0ce963331d5bb93d5c5476e BOOT_IMAGE=/pkg/linux/i386-randconfig-h1-06101053/gcc-4.9/ab1bcc442070315bd0ce963331d5bb93d5c5476e/vmlinuz-4.17.0-rc3-00290-gab1bcc4 max_uptime=600 RESULT_ROOT=/result/boot/1/vm-vp-quantal-i386/quantal-core-i386.cgz/i386-randconfig-h1-06101053/gcc-4.9/ab1bcc442070315bd0ce963331d5bb93d5c5476e/0 LKP_SERVER=inn debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 earlyprintk=ttyS0,115200 console=ttyS0,115200 vga=normal rw drbd.minor_count=8 rcuperf.shutdown=0
Elapsed time: 560
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email
Thanks,
Xiaolong