2007-11-29 00:39:13

by Roland McGrath

Subject: [PATCH x86/mm 1/6] x86-64 ia32 ptrace pt_regs cleanup


This cleans up the getreg32/putreg32 functions to use struct pt_regs in a
straightforward fashion, instead of equivalent ugly pointer arithmetic.
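
For reference, the old offsetof()/8 indexing only worked because every member of the x86-64 struct pt_regs is an 8-byte long, so a __u64 view of the structure lines up field-for-field with named member access. A minimal standalone sketch of that equivalence, using an abridged pt_regs_like stand-in rather than the real layout (build on a 64-bit host):

#include <assert.h>
#include <stddef.h>

/* abridged stand-in; the real struct pt_regs has more members */
struct pt_regs_like {
        unsigned long bx, cx, dx, si, di, ip, sp, flags;
};

int main(void)
{
        struct pt_regs_like regs = { 0 };
        unsigned long *stack = (unsigned long *)&regs;

        /* old style: index a word array by byte offset / 8 */
        stack[offsetof(struct pt_regs_like, ip) / 8] = 0x1234;

        /* new style: plain member access reaches the same slot */
        assert(regs.ip == 0x1234);
        return 0;
}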

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/ia32/ptrace32.c | 21 +++++++++------------
1 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
index 1e382e3..c52d066 100644
--- a/arch/x86/ia32/ptrace32.c
+++ b/arch/x86/ia32/ptrace32.c
@@ -37,11 +37,11 @@

#define R32(l,q) \
case offsetof(struct user32, regs.l): \
- stack[offsetof(struct pt_regs, q) / 8] = val; break
+ regs->q = val; break;

static int putreg32(struct task_struct *child, unsigned regno, u32 val)
{
- __u64 *stack = (__u64 *)task_pt_regs(child);
+ struct pt_regs *regs = task_pt_regs(child);

switch (regno) {
case offsetof(struct user32, regs.fs):
@@ -65,12 +65,12 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
case offsetof(struct user32, regs.ss):
if ((val & 3) != 3)
return -EIO;
- stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
+ regs->ss = val & 0xffff;
break;
case offsetof(struct user32, regs.cs):
if ((val & 3) != 3)
return -EIO;
- stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
+ regs->cs = val & 0xffff;
break;

R32(ebx, bx);
@@ -84,9 +84,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
R32(eip, ip);
R32(esp, sp);

- case offsetof(struct user32, regs.eflags): {
- __u64 *flags = &stack[offsetof(struct pt_regs, flags)/8];
-
+ case offsetof(struct user32, regs.eflags):
val &= FLAG_MASK;
/*
* If the user value contains TF, mark that
@@ -97,9 +95,8 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
clear_tsk_thread_flag(child, TIF_FORCED_TF);
else if (test_tsk_thread_flag(child, TIF_FORCED_TF))
val |= X86_EFLAGS_TF;
- *flags = val | (*flags & ~FLAG_MASK);
+ regs->flags = val | (regs->flags & ~FLAG_MASK);
break;
- }

case offsetof(struct user32, u_debugreg[0]) ...
offsetof(struct user32, u_debugreg[7]):
@@ -123,11 +120,11 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)

#define R32(l,q) \
case offsetof(struct user32, regs.l): \
- *val = stack[offsetof(struct pt_regs, q)/8]; break
+ *val = regs->q; break

static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
{
- __u64 *stack = (__u64 *)task_pt_regs(child);
+ struct pt_regs *regs = task_pt_regs(child);

switch (regno) {
case offsetof(struct user32, regs.fs):
@@ -160,7 +157,7 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
/*
* If the debugger set TF, hide it from the readout.
*/
- *val = stack[offsetof(struct pt_regs, flags)/8];
+ *val = regs->flags;
if (test_tsk_thread_flag(child, TIF_FORCED_TF))
*val &= ~X86_EFLAGS_TF;
break;


2007-11-29 00:41:17

by Roland McGrath

Subject: [PATCH x86/mm 3/6] x86-32 ptrace whitespace


This canonicalizes the indentation in the getreg and putreg functions.

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/kernel/ptrace_32.c | 110 +++++++++++++++++++++---------------------
1 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
index f81e2f1..5aca84e 100644
--- a/arch/x86/kernel/ptrace_32.c
+++ b/arch/x86/kernel/ptrace_32.c
@@ -51,37 +51,37 @@ static int putreg(struct task_struct *child,
struct pt_regs *regs = task_pt_regs(child);
regno >>= 2;
switch (regno) {
- case GS:
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.gs = value;
- return 0;
- case DS:
- case ES:
- case FS:
- if (value && (value & 3) != 3)
- return -EIO;
- value &= 0xffff;
- break;
- case SS:
- case CS:
- if ((value & 3) != 3)
- return -EIO;
- value &= 0xffff;
- break;
- case EFL:
- value &= FLAG_MASK;
- /*
- * If the user value contains TF, mark that
- * it was not "us" (the debugger) that set it.
- * If not, make sure it stays set if we had.
- */
- if (value & X86_EFLAGS_TF)
- clear_tsk_thread_flag(child, TIF_FORCED_TF);
- else if (test_tsk_thread_flag(child, TIF_FORCED_TF))
- value |= X86_EFLAGS_TF;
- value |= regs->flags & ~FLAG_MASK;
- break;
+ case GS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.gs = value;
+ return 0;
+ case DS:
+ case ES:
+ case FS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case SS:
+ case CS:
+ if ((value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case EFL:
+ value &= FLAG_MASK;
+ /*
+ * If the user value contains TF, mark that
+ * it was not "us" (the debugger) that set it.
+ * If not, make sure it stays set if we had.
+ */
+ if (value & X86_EFLAGS_TF)
+ clear_tsk_thread_flag(child, TIF_FORCED_TF);
+ else if (test_tsk_thread_flag(child, TIF_FORCED_TF))
+ value |= X86_EFLAGS_TF;
+ value |= regs->flags & ~FLAG_MASK;
+ break;
}
*pt_regs_access(regs, regno) = value;
return 0;
@@ -94,26 +94,26 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)

regno >>= 2;
switch (regno) {
- case EFL:
- /*
- * If the debugger set TF, hide it from the readout.
- */
- retval = regs->flags;
- if (test_tsk_thread_flag(child, TIF_FORCED_TF))
- retval &= ~X86_EFLAGS_TF;
- break;
- case GS:
- retval = child->thread.gs;
- break;
- case DS:
- case ES:
- case FS:
- case SS:
- case CS:
- retval = 0xffff;
- /* fall through */
- default:
- retval &= *pt_regs_access(regs, regno);
+ case EFL:
+ /*
+ * If the debugger set TF, hide it from the readout.
+ */
+ retval = regs->flags;
+ if (test_tsk_thread_flag(child, TIF_FORCED_TF))
+ retval &= ~X86_EFLAGS_TF;
+ break;
+ case GS:
+ retval = child->thread.gs;
+ break;
+ case DS:
+ case ES:
+ case FS:
+ case SS:
+ case CS:
+ retval = 0xffff;
+ /* fall through */
+ default:
+ retval &= *pt_regs_access(regs, regno);
}
return retval;
}
@@ -190,7 +190,7 @@ static int ptrace_set_debugreg(struct task_struct *child,
* Make sure the single step bit is not set.
*/
void ptrace_disable(struct task_struct *child)
-{
+{
user_disable_single_step(child);
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
}
@@ -203,7 +203,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)

switch (request) {
/* when I and D space are separate, these will need to be fixed. */
- case PTRACE_PEEKTEXT: /* read word at location addr. */
+ case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
ret = generic_ptrace_peekdata(child, addr, data);
break;
@@ -213,7 +213,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
unsigned long tmp;

ret = -EIO;
- if ((addr & 3) || addr < 0 ||
+ if ((addr & 3) || addr < 0 ||
addr > sizeof(struct user) - 3)
break;

@@ -238,7 +238,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)

case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
- if ((addr & 3) || addr < 0 ||
+ if ((addr & 3) || addr < 0 ||
addr > sizeof(struct user) - 3)
break;

2007-11-29 00:41:31

by Roland McGrath

Subject: [PATCH x86/mm 2/6] x86-64 ptrace whitespace


This canonicalizes the indentation in the getreg and putreg functions.

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/kernel/ptrace_64.c | 224 +++++++++++++++++++++---------------------
1 files changed, 112 insertions(+), 112 deletions(-)

diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
index 56b31cd..2427548 100644
--- a/arch/x86/kernel/ptrace_64.c
+++ b/arch/x86/kernel/ptrace_64.c
@@ -2,7 +2,7 @@
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <[email protected]>, May 2000
- *
+ *
* x86-64 port 2000-2002 Andi Kleen
*/

@@ -48,7 +48,7 @@
* Make sure the single step bit is not set.
*/
void ptrace_disable(struct task_struct *child)
-{
+{
user_disable_single_step(child);
}

@@ -63,69 +63,69 @@ static int putreg(struct task_struct *child,
{
struct pt_regs *regs = task_pt_regs(child);
switch (regno) {
- case offsetof(struct user_regs_struct,fs):
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.fsindex = value & 0xffff;
- return 0;
- case offsetof(struct user_regs_struct,gs):
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.gsindex = value & 0xffff;
- return 0;
- case offsetof(struct user_regs_struct,ds):
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.ds = value & 0xffff;
- return 0;
- case offsetof(struct user_regs_struct,es):
- if (value && (value & 3) != 3)
- return -EIO;
- child->thread.es = value & 0xffff;
- return 0;
- case offsetof(struct user_regs_struct,ss):
- if ((value & 3) != 3)
- return -EIO;
- value &= 0xffff;
- return 0;
- case offsetof(struct user_regs_struct,fs_base):
- if (value >= TASK_SIZE_OF(child))
- return -EIO;
- /*
- * When changing the segment base, use do_arch_prctl
- * to set either thread.fs or thread.fsindex and the
- * corresponding GDT slot.
- */
- if (child->thread.fs != value)
- return do_arch_prctl(child, ARCH_SET_FS, value);
- return 0;
- case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
- if (value >= TASK_SIZE_OF(child))
- return -EIO;
- if (child->thread.gs != value)
- return do_arch_prctl(child, ARCH_SET_GS, value);
- return 0;
- case offsetof(struct user_regs_struct,flags):
- value &= FLAG_MASK;
- /*
- * If the user value contains TF, mark that
- * it was not "us" (the debugger) that set it.
- * If not, make sure it stays set if we had.
- */
- if (value & X86_EFLAGS_TF)
- clear_tsk_thread_flag(child, TIF_FORCED_TF);
- else if (test_tsk_thread_flag(child, TIF_FORCED_TF))
- value |= X86_EFLAGS_TF;
- value |= regs->flags & ~FLAG_MASK;
- break;
- case offsetof(struct user_regs_struct,cs):
- if ((value & 3) != 3)
- return -EIO;
- value &= 0xffff;
- break;
+ case offsetof(struct user_regs_struct,fs):
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.fsindex = value & 0xffff;
+ return 0;
+ case offsetof(struct user_regs_struct,gs):
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.gsindex = value & 0xffff;
+ return 0;
+ case offsetof(struct user_regs_struct,ds):
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.ds = value & 0xffff;
+ return 0;
+ case offsetof(struct user_regs_struct,es):
+ if (value && (value & 3) != 3)
+ return -EIO;
+ child->thread.es = value & 0xffff;
+ return 0;
+ case offsetof(struct user_regs_struct,ss):
+ if ((value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ return 0;
+ case offsetof(struct user_regs_struct,fs_base):
+ if (value >= TASK_SIZE_OF(child))
+ return -EIO;
+ /*
+ * When changing the segment base, use do_arch_prctl
+ * to set either thread.fs or thread.fsindex and the
+ * corresponding GDT slot.
+ */
+ if (child->thread.fs != value)
+ return do_arch_prctl(child, ARCH_SET_FS, value);
+ return 0;
+ case offsetof(struct user_regs_struct,gs_base):
+ /*
+ * Exactly the same here as the %fs handling above.
+ */
+ if (value >= TASK_SIZE_OF(child))
+ return -EIO;
+ if (child->thread.gs != value)
+ return do_arch_prctl(child, ARCH_SET_GS, value);
+ return 0;
+ case offsetof(struct user_regs_struct,flags):
+ value &= FLAG_MASK;
+ /*
+ * If the user value contains TF, mark that
+ * it was not "us" (the debugger) that set it.
+ * If not, make sure it stays set if we had.
+ */
+ if (value & X86_EFLAGS_TF)
+ clear_tsk_thread_flag(child, TIF_FORCED_TF);
+ else if (test_tsk_thread_flag(child, TIF_FORCED_TF))
+ value |= X86_EFLAGS_TF;
+ value |= regs->flags & ~FLAG_MASK;
+ break;
+ case offsetof(struct user_regs_struct,cs):
+ if ((value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
}
*pt_regs_access(regs, regno) = value;
return 0;
@@ -136,49 +136,49 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
struct pt_regs *regs = task_pt_regs(child);
unsigned long val;
switch (regno) {
- case offsetof(struct user_regs_struct, fs):
- return child->thread.fsindex;
- case offsetof(struct user_regs_struct, gs):
- return child->thread.gsindex;
- case offsetof(struct user_regs_struct, ds):
- return child->thread.ds;
- case offsetof(struct user_regs_struct, es):
- return child->thread.es;
- case offsetof(struct user_regs_struct, fs_base):
- /*
- * do_arch_prctl may have used a GDT slot instead of
- * the MSR. To userland, it appears the same either
- * way, except the %fs segment selector might not be 0.
- */
- if (child->thread.fs != 0)
- return child->thread.fs;
- if (child->thread.fsindex != FS_TLS_SEL)
- return 0;
- return get_desc_base(&child->thread.tls_array[FS_TLS]);
- case offsetof(struct user_regs_struct, gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
- if (child->thread.gs != 0)
- return child->thread.gs;
- if (child->thread.gsindex != GS_TLS_SEL)
- return 0;
- return get_desc_base(&child->thread.tls_array[GS_TLS]);
- case offsetof(struct user_regs_struct, flags):
- /*
- * If the debugger set TF, hide it from the readout.
- */
- val = regs->flags;
- if (test_tsk_thread_flag(child, TIF_IA32))
- val &= 0xffffffff;
- if (test_tsk_thread_flag(child, TIF_FORCED_TF))
- val &= ~X86_EFLAGS_TF;
- return val;
- default:
- val = *pt_regs_access(regs, regno);
- if (test_tsk_thread_flag(child, TIF_IA32))
- val &= 0xffffffff;
- return val;
+ case offsetof(struct user_regs_struct, fs):
+ return child->thread.fsindex;
+ case offsetof(struct user_regs_struct, gs):
+ return child->thread.gsindex;
+ case offsetof(struct user_regs_struct, ds):
+ return child->thread.ds;
+ case offsetof(struct user_regs_struct, es):
+ return child->thread.es;
+ case offsetof(struct user_regs_struct, fs_base):
+ /*
+ * do_arch_prctl may have used a GDT slot instead of
+ * the MSR. To userland, it appears the same either
+ * way, except the %fs segment selector might not be 0.
+ */
+ if (child->thread.fs != 0)
+ return child->thread.fs;
+ if (child->thread.fsindex != FS_TLS_SEL)
+ return 0;
+ return get_desc_base(&child->thread.tls_array[FS_TLS]);
+ case offsetof(struct user_regs_struct, gs_base):
+ /*
+ * Exactly the same here as the %fs handling above.
+ */
+ if (child->thread.gs != 0)
+ return child->thread.gs;
+ if (child->thread.gsindex != GS_TLS_SEL)
+ return 0;
+ return get_desc_base(&child->thread.tls_array[GS_TLS]);
+ case offsetof(struct user_regs_struct, flags):
+ /*
+ * If the debugger set TF, hide it from the readout.
+ */
+ val = regs->flags;
+ if (test_tsk_thread_flag(child, TIF_IA32))
+ val &= 0xffffffff;
+ if (test_tsk_thread_flag(child, TIF_FORCED_TF))
+ val &= ~X86_EFLAGS_TF;
+ return val;
+ default:
+ val = *pt_regs_access(regs, regno);
+ if (test_tsk_thread_flag(child, TIF_IA32))
+ val &= 0xffffffff;
+ return val;
}

}
@@ -244,7 +244,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)

switch (request) {
/* when I and D space are separate, these will need to be fixed. */
- case PTRACE_PEEKTEXT: /* read word at location addr. */
+ case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
ret = generic_ptrace_peekdata(child, addr, data);
break;
@@ -310,10 +310,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
(struct user_desc __user *) data, 0);
break;
#endif
- /* normal 64bit interface to access TLS data.
+ /* normal 64bit interface to access TLS data.
Works just like arch_prctl, except that the arguments
are reversed. */
- case PTRACE_ARCH_PRCTL:
+ case PTRACE_ARCH_PRCTL:
ret = do_arch_prctl(child, data, addr);
break;

@@ -386,7 +386,7 @@ static void syscall_trace(struct pt_regs *regs)
printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
current->comm,
regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
- current_thread_info()->flags, current->ptrace);
+ current_thread_info()->flags, current->ptrace);
#endif

ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)

2007-11-29 00:41:43

by Roland McGrath

Subject: [PATCH x86/mm 4/6] x86-64 ptrace get/putreg current task


This generalizes the getreg and putreg functions so they can be used on the
current task, as well as on a task stopped in TASK_TRACED and switched off.
This lays the groundwork to share this code for all kinds of user-mode
machine state access, not just ptrace.
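
Condensed, the pattern the diff below adds looks like the following sketch (illustrative only; the real code handles each register case in place). The selector copies in the thread struct are only refreshed at context switch, so when the target is current the live register is the authoritative value:

#include <linux/sched.h>

/* sketch only -- mirrors the child == current pattern in the diff below */
static unsigned int user_ds_selector(struct task_struct *child)
{
        unsigned int seg = child->thread.ds;    /* saved at last context switch */

        if (child == current)
                asm("movl %%ds,%0" : "=r" (seg));       /* read the live register */
        return seg;
}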

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/kernel/ptrace_64.c | 36 ++++++++++++++++++++++++++++++++++--
1 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
index 2427548..5979dbe 100644
--- a/arch/x86/kernel/ptrace_64.c
+++ b/arch/x86/kernel/ptrace_64.c
@@ -67,21 +67,29 @@ static int putreg(struct task_struct *child,
if (value && (value & 3) != 3)
return -EIO;
child->thread.fsindex = value & 0xffff;
+ if (child == current)
+ loadsegment(fs, child->thread.fsindex);
return 0;
case offsetof(struct user_regs_struct,gs):
if (value && (value & 3) != 3)
return -EIO;
child->thread.gsindex = value & 0xffff;
+ if (child == current)
+ load_gs_index(child->thread.gsindex);
return 0;
case offsetof(struct user_regs_struct,ds):
if (value && (value & 3) != 3)
return -EIO;
child->thread.ds = value & 0xffff;
+ if (child == current)
+ loadsegment(ds, child->thread.ds);
return 0;
case offsetof(struct user_regs_struct,es):
if (value && (value & 3) != 3)
return -EIO;
child->thread.es = value & 0xffff;
+ if (child == current)
+ loadsegment(es, child->thread.es);
return 0;
case offsetof(struct user_regs_struct,ss):
if ((value & 3) != 3)
@@ -135,14 +143,32 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
{
struct pt_regs *regs = task_pt_regs(child);
unsigned long val;
+ unsigned int seg;
switch (regno) {
case offsetof(struct user_regs_struct, fs):
+ if (child == current) {
+ /* Older gas can't assemble movq %?s,%r?? */
+ asm("movl %%fs,%0" : "=r" (seg));
+ return seg;
+ }
return child->thread.fsindex;
case offsetof(struct user_regs_struct, gs):
+ if (child == current) {
+ asm("movl %%gs,%0" : "=r" (seg));
+ return seg;
+ }
return child->thread.gsindex;
case offsetof(struct user_regs_struct, ds):
+ if (child == current) {
+ asm("movl %%ds,%0" : "=r" (seg));
+ return seg;
+ }
return child->thread.ds;
case offsetof(struct user_regs_struct, es):
+ if (child == current) {
+ asm("movl %%es,%0" : "=r" (seg));
+ return seg;
+ }
return child->thread.es;
case offsetof(struct user_regs_struct, fs_base):
/*
@@ -152,7 +178,10 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
*/
if (child->thread.fs != 0)
return child->thread.fs;
- if (child->thread.fsindex != FS_TLS_SEL)
+ seg = child->thread.fsindex;
+ if (child == current)
+ asm("movl %%fs,%0" : "=r" (seg));
+ if (seg != FS_TLS_SEL)
return 0;
return get_desc_base(&child->thread.tls_array[FS_TLS]);
case offsetof(struct user_regs_struct, gs_base):
@@ -161,7 +190,10 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
*/
if (child->thread.gs != 0)
return child->thread.gs;
- if (child->thread.gsindex != GS_TLS_SEL)
+ seg = child->thread.gsindex;
+ if (child == current)
+ asm("movl %%gs,%0" : "=r" (seg));
+ if (seg != GS_TLS_SEL)
return 0;
return get_desc_base(&child->thread.tls_array[GS_TLS]);
case offsetof(struct user_regs_struct, flags):

2007-11-29 00:42:22

by Roland McGrath

Subject: [PATCH x86/mm 5/6] x86-32 ptrace get/putreg current task


This generalizes the getreg and putreg functions so they can be used on the
current task, as well as on a task stopped in TASK_TRACED and switched off.
This lays the groundwork to share this code for all kinds of user-mode
machine state access, not just ptrace.

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/kernel/ptrace_32.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
index 5aca84e..2607130 100644
--- a/arch/x86/kernel/ptrace_32.c
+++ b/arch/x86/kernel/ptrace_32.c
@@ -55,6 +55,12 @@ static int putreg(struct task_struct *child,
if (value && (value & 3) != 3)
return -EIO;
child->thread.gs = value;
+ if (child == current)
+ /*
+ * The user-mode %gs is not affected by
+ * kernel entry, so we must update the CPU.
+ */
+ loadsegment(gs, value);
return 0;
case DS:
case ES:
@@ -104,6 +110,8 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
break;
case GS:
retval = child->thread.gs;
+ if (child == current)
+ savesegment(gs, retval);
break;
case DS:
case ES:

2007-11-29 00:42:41

by Roland McGrath

Subject: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task


This generalizes the getreg32 and putreg32 functions so they can be used on
the current task, as well as on a task stopped in TASK_TRACED and switched
off. This lays the groundwork to share this code for all kinds of
user-mode machine state access, not just ptrace.

Signed-off-by: Roland McGrath <[email protected]>
---
arch/x86/ia32/ptrace32.c | 16 ++++++++++++++++
1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c
index c52d066..d5663e2 100644
--- a/arch/x86/ia32/ptrace32.c
+++ b/arch/x86/ia32/ptrace32.c
@@ -48,19 +48,27 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
if (val && (val & 3) != 3)
return -EIO;
child->thread.fsindex = val & 0xffff;
+ if (child == current)
+ loadsegment(fs, child->thread.fsindex);
break;
case offsetof(struct user32, regs.gs):
if (val && (val & 3) != 3)
return -EIO;
child->thread.gsindex = val & 0xffff;
+ if (child == current)
+ load_gs_index(child->thread.gsindex);
break;
case offsetof(struct user32, regs.ds):
if (val && (val & 3) != 3)
return -EIO;
child->thread.ds = val & 0xffff;
+ if (child == current)
+ loadsegment(ds, child->thread.ds);
break;
case offsetof(struct user32, regs.es):
child->thread.es = val & 0xffff;
+ if (child == current)
+ loadsegment(es, child->thread.ds);
break;
case offsetof(struct user32, regs.ss):
if ((val & 3) != 3)
@@ -129,15 +137,23 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
switch (regno) {
case offsetof(struct user32, regs.fs):
*val = child->thread.fsindex;
+ if (child == current)
+ asm("movl %%fs,%0" : "=r" (*val));
break;
case offsetof(struct user32, regs.gs):
*val = child->thread.gsindex;
+ if (child == current)
+ asm("movl %%gs,%0" : "=r" (*val));
break;
case offsetof(struct user32, regs.ds):
*val = child->thread.ds;
+ if (child == current)
+ asm("movl %%ds,%0" : "=r" (*val));
break;
case offsetof(struct user32, regs.es):
*val = child->thread.es;
+ if (child == current)
+ asm("movl %%es,%0" : "=r" (*val));
break;

R32(cs, cs);

2007-11-29 10:40:19

by Ingo Molnar

Subject: Re: [PATCH x86/mm 1/6] x86-64 ia32 ptrace pt_regs cleanup


thanks - i've queued up all 6 patches of yours. They applied fine to
x86.git and passed a quick build & boot test as well.

Ingo

2007-11-29 17:35:13

by Chuck Ebbert

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

On 11/28/2007 07:42 PM, Roland McGrath wrote:
> --- a/arch/x86/ia32/ptrace32.c
> +++ b/arch/x86/ia32/ptrace32.c
> @@ -48,19 +48,27 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
> if (val && (val & 3) != 3)
> return -EIO;
> child->thread.fsindex = val & 0xffff;
> + if (child == current)
> + loadsegment(fs, child->thread.fsindex);
> break;
> case offsetof(struct user32, regs.gs):
> if (val && (val & 3) != 3)
> return -EIO;
> child->thread.gsindex = val & 0xffff;
> + if (child == current)
> + load_gs_index(child->thread.gsindex);
> break;
> case offsetof(struct user32, regs.ds):
> if (val && (val & 3) != 3)
> return -EIO;
> child->thread.ds = val & 0xffff;
> + if (child == current)
> + loadsegment(ds, child->thread.ds);
> break;
> case offsetof(struct user32, regs.es):
> child->thread.es = val & 0xffff;
> + if (child == current)
> + loadsegment(es, child->thread.ds);

child->thread.es ??

> @@ -129,15 +137,23 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
> switch (regno) {
> case offsetof(struct user32, regs.fs):
> *val = child->thread.fsindex;
> + if (child == current)
> + asm("movl %%fs,%0" : "=r" (*val));
> break;
> case offsetof(struct user32, regs.gs):
> *val = child->thread.gsindex;
> + if (child == current)
> + asm("movl %%gs,%0" : "=r" (*val));

Won't this return the kernel's GS instead of the user's?

2007-11-29 17:39:59

by Christoph Hellwig

Subject: Re: [PATCH x86/mm 4/6] x86-64 ptrace get/putreg current task

On Wed, Nov 28, 2007 at 04:41:18PM -0800, Roland McGrath wrote:
>
> This generalizes the getreg and putreg functions so they can be used on the
> current task, as well as on a task stopped in TASK_TRACED and switched off.
> This lays the groundwork to share this code for all kinds of user-mode
> machine state access, not just ptrace.

Not sure that's such a good idea, as it means almost every case now has
a branch for two entirely separate situations. Wouldn't it be better to
have a separate helper for the current-process case?

Either way, support for looking at the current thread should only be
merged together with an actual user for that.

2007-11-29 18:10:54

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



Chuck seems to have caught a bug, although the wrong one:

On Thu, 29 Nov 2007, Chuck Ebbert wrote:
>
> On 11/28/2007 07:42 PM, Roland McGrath wrote:
> > --- a/arch/x86/ia32/ptrace32.c
> > +++ b/arch/x86/ia32/ptrace32.c
> > ...
> > + if (child == current)
> > + load_gs_index(child->thread.gsindex);

This is correct.

But the ones that do the same thing for fs/es/ds are *not*. Those three
registers are kernel mode registers (ds/es are the regular kernel data
segment, fs is the per-cpu data segment), and restored on return to user
space from the stack.

For similar reasons, this is wrong:

> > @@ -129,15 +137,23 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
> > switch (regno) {
> > case offsetof(struct user32, regs.fs):
> > *val = child->thread.fsindex;
> > + if (child == current)
> > + asm("movl %%fs,%0" : "=r" (*val));
> > break;

That %fs is the kernel per-cpu thing, not the user %fs.

But this one is correct:

> > case offsetof(struct user32, regs.gs):
> > *val = child->thread.gsindex;
> > + if (child == current)
> > + asm("movl %%gs,%0" : "=r" (*val));
>
> Won't this return the kernel's GS instead of the user's?

No, %gs is untouched by the kernel, so it contains user space version, and
getting the value directly from %gs looks correct.

Linus

2007-11-29 18:17:33

by H. Peter Anvin

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

Linus Torvalds wrote:
>
> But this one is correct:
>
>>> case offsetof(struct user32, regs.gs):
>>> *val = child->thread.gsindex;
>>> + if (child == current)
>>> + asm("movl %%gs,%0" : "=r" (*val));
>> Won't this return the kernel's GS instead of the user's?
>
> No, %gs is untouched by the kernel, so it contains user space version, and
> getting the value directly from %gs looks correct.
>

Brief summary/reminder:

The kernel uses %fs in 32-bit mode and %gs in 64-bit mode.
User space TLS uses %gs in 32-bit mode and %fs in 64-bit mode.

The 64-bit kernel has to use %gs in order for SWAPGS to be available to
it (by which time the 32-bit ABI was already fixed). It is advantageous
for user space to use the register the kernel typically won't, in order
to speed up system call entry/exit.

-hpa
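
(For reference, the user-space half of this split is easy to see directly; the small program below, a sketch rather than anything posted in this thread, just prints the %fs and %gs selectors. Built 32-bit, %gs holds a nonzero TLS selector; built 64-bit, both selectors are normally 0 and the thread pointer lives in the FS base set via arch_prctl instead.)

#include <stdio.h>

int main(void)
{
        unsigned int fs, gs;

        asm("movl %%fs,%0" : "=r" (fs));
        asm("movl %%gs,%0" : "=r" (gs));
        printf("fs selector = %#x, gs selector = %#x\n", fs, gs);
        return 0;
}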

2007-11-29 18:18:49

by Chuck Ebbert

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

On 11/29/2007 01:09 PM, Linus Torvalds wrote:
>>> case offsetof(struct user32, regs.gs):
>>> *val = child->thread.gsindex;
>>> + if (child == current)
>>> + asm("movl %%gs,%0" : "=r" (*val));
>> Won't this return the kernel's GS instead of the user's?
>
> No, %gs is untouched by the kernel, so it contains user space version, and
> getting the value directly from %gs looks correct.
>

But this is x86_64, where swapgs is done on kernel entry.

2007-11-29 18:23:58

by H. Peter Anvin

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

Chuck Ebbert wrote:
> On 11/29/2007 01:09 PM, Linus Torvalds wrote:
>>>> case offsetof(struct user32, regs.gs):
>>>> *val = child->thread.gsindex;
>>>> + if (child == current)
>>>> + asm("movl %%gs,%0" : "=r" (*val));
>>> Won't this return the kernel's GS instead of the user's?
>> No, %gs is untouched by the kernel, so it contains user space version, and
>> getting the value directly from %gs looks correct.
>>
>
> But this is x86_64, where swapgs is done on kernel entry.
>

For i386-x86_64 sharing, getting to the user segments probably should be
macroized. I'm thinking something like
get_user_[cs|ds|es|fs|gs|ss](thread) in <asm/processor.h> doing the
appropriate thing for different configurations.

-hpa
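
(A rough sketch of what such a helper could look like, drawing on the idioms already used in the patches above; the name and placement are only hpa's suggestion, nothing here was posted as a patch:)

#include <linux/sched.h>

/* hypothetical helper -- hpa's get_user_gs() idea, sketched */
static inline unsigned int get_user_gs(struct task_struct *task)
{
#ifdef CONFIG_X86_32
        unsigned int seg = task->thread.gs;             /* 32-bit: saved user %gs */
#else
        unsigned int seg = task->thread.gsindex;        /* 64-bit: saved selector */
#endif
        if (task == current)
                asm("movl %%gs,%0" : "=r" (seg));       /* live value for current */
        return seg;
}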

2007-11-29 18:33:32

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



On Thu, 29 Nov 2007, H. Peter Anvin wrote:
>
> The kernel uses %fs in 32-bit mode and %gs in 64-bit mode.

Yeah, thanks for reminding me about this particular insanity.

We should just make the kernel always use %gs for the percpu data. On
32-bit x86 there really is no reason to use %fs over %gs, other than the
purely historical one, and the fact that "f" is alphabetically before "g",
so it's the one you use first if you have no other preferences.

That way we can more easily share code, if the rules for fs/gs are the
same for 32-bit/64-bit code.

However, you also say:

> It is advantageous for user space to use the register the kernel
> typically won't, in order to speed up system call entry/exit.

but I'm not seeing the reason for that one. Care to comment more? (Yes,
there is often a latency from segment reload to use, but the reload
latency for system call exit *should* be entirely covered by the cost of
doing the system call return itself, no?)

Linus

2007-11-29 18:47:43

by H. Peter Anvin

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

Linus Torvalds wrote:
>
> However, you also say:
>
>> It is advantageous for user space to use the register the kernel
>> typically won't, in order to speed up system call entry/exit.
>
> but I'm not seeing the reason for that one. Care to comment more? (Yes,
> there is often a latency from segment reload to use, but the reload
> latency for system call exit *should* be entirely covered by the cost of
> doing the system call return itself, no?)
>

I do seem to recall that some processor implementations can load a NULL
segment faster than a non-NULL segment. This was significant enough
that we wanted to use %fs in x86-64 userspace, as opposed to the
original ABI which used %gs both in userspace and in the kernel.

-hpa

2007-11-29 19:10:26

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



On Thu, 29 Nov 2007, H. Peter Anvin wrote:
> Linus Torvalds wrote:
> >
> > > It is advantageous for user space to use the register the kernel typically
> > > won't, in order to speed up system call entry/exit.
> >
> > but I'm not seeing the reason for that one. Care to comment more? (Yes,
> > there is often a latency from segment reload to use, but the reload latency
> > for system call exit *should* be entirely covered by the cost of doing the
> > system call return itself, no?)
>
> I do seem to recall that some processor implementations can load a NULL
> segment faster than a non-NULL segment. This was significant enough that we
> wanted to use %fs in x86-64 userspace, as opposed to the original ABI which
> used %gs both in userspace and in the kernel.

Ahh, I think you may be right for some CPUs. The zero selector is indeed
potentially faster to load, since it doesn't have to even bother looking
at the GDT/LDT.

That said, I doubt it's very noticeable. I just ran tests on both an old
P4 and on a more modern Core 2 machine, and for both of those the
performance was identical between loading a NUL selector and loading it
with a non-zero one.

But I could well imagine that it matters a few cycles on other CPU's. But
from my testing, it definitely isn't noticeable, and I think the
maintenance advantage of using the same segment setup would more than make
up for the fact that maybe some odd CPU can see a difference.

Linus
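
(Not the test that was run above, but a crude userspace harness along the same lines, for a 64-bit build where clobbering the %gs selector is harmless: it times NUL selector loads against loads of whatever flat data selector %ss currently holds. A sketch only; no serialization around rdtsc.)

#include <stdint.h>
#include <stdio.h>

static inline uint64_t rdtsc(void)
{
        uint32_t lo, hi;
        asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
}

static uint64_t time_gs_loads(uint16_t sel, int iters)
{
        uint64_t t0 = rdtsc();
        for (int i = 0; i < iters; i++)
                asm volatile("movw %0, %%gs" : : "r" (sel));
        return rdtsc() - t0;
}

int main(void)
{
        uint16_t flat;
        int iters = 1000000;

        asm("movw %%ss, %0" : "=r" (flat));     /* a known-valid GDT data selector */

        printf("NUL  selector: %llu cycles\n",
               (unsigned long long)time_gs_loads(0, iters));
        printf("flat selector: %llu cycles\n",
               (unsigned long long)time_gs_loads(flat, iters));
        return 0;
}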

2007-11-29 19:17:59

by H. Peter Anvin

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

Andi, do you happen to remember the details on this?

-hpa


Linus Torvalds wrote:
>
> On Thu, 29 Nov 2007, H. Peter Anvin wrote:
>> Linus Torvalds wrote:
>>>> It is advantageous for user space to use the register the kernel typically
>>>> won't, in order to speed up system call entry/exit.
>>> but I'm not seeing the reason for that one. Care to comment more? (Yes,
>>> there is often a latency from segment reload to use, but the reload latency
>>> for system call exit *should* be entirely covered by the cost of doing the
>>> system call return itself, no?)
>> I do seem to recall that some processor implementations can load a NULL
>> segment faster than a non-NULL segment. This was significant enough that we
>> wanted to use %fs in x86-64 userspace, as opposed to the original ABI which
>> used %gs both in userspace and in the kernel.
>
> Ahh, I think you may be right for some CPUs. The zero selector is indeed
> potentially faster to load, since it doesn't have to even bother looking
> at the GDT/LDT.
>
> That said, I doubt it's very noticeable. I just ran tests on both an old
> P4 and on a more modern Core 2 machine, and for both of those the
> performance was identical between loading a NUL selector and loading it
> with a non-zero one.
>
> But I could well imagine that it matters a few cycles on other CPU's. But
> from my testing, it definitely isn't noticeable, and I think the
> maintenance advantage of using the same segment setup would more than make
> up for the fact that maybe some odd CPU can see a difference.
>
> Linus

2007-11-29 19:27:48

by Andi Kleen

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

On Thu, Nov 29, 2007 at 11:16:55AM -0800, H. Peter Anvin wrote:
> Andi, do you happen to remember the details on this?

x86-64 has to use GS because there is no SWAPFS.
We decided to make user space the opposite back then, but not based on
benchmarks (there were only simulators at the time). Oh yes, the reason
was that the GS context switch is slightly more expensive than the FS
one, and the code does lazy optimization.

For i386 iirc Jeremy/Zach did the benchmarking and they settled
on %fs because it was faster for something (originally it was %gs too)

-Andi

2007-11-29 19:44:58

by Ingo Molnar

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task


* Andi Kleen <[email protected]> wrote:

> For i386 iirc Jeremy/Zach did the benchmarking and they settled on %fs
> because it was faster for something (originally it was %gs too)

yep. IIRC, some CPUs only optimize %fs because that's what Windows uses
and leaves Linux with %gs out in the cold. There's also a performance
penalty for overlapping segment use, if the segment cache is single
entry only with an additional optimization for NULL [which just hides
the segment cache].

But if it's good for unification we could switch that to %gs again on
32-bit. I was one of the people who advocated the use of the 'other'
segment register, so that the hardware has less overlap, but clean and
unified code trumps this concern. It shouldn't be an issue on reasonably
modern CPUs anyway.

Ingo

2007-11-29 19:51:14

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



On Thu, 29 Nov 2007, Andi Kleen wrote:
>
> For i386 iirc Jeremy/Zach did the benchmarking and they settled
> on %fs because it was faster for something (originally it was %gs too)

Hmm. Context switching ends up having to switch the segment that we do
*not* use for the kernel, and the context switching can be faster for the
case where

(a) the segment selector is identical in both source and destination
*AND*
(b) the selector is either zero or points to a GDT entry.

So yes, context switching can be faster for a NUL selector if both old and
new threads use it, and in fact that's the only case we check right now:

/*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
loadsegment(gs, next->gs);

for the 32-bit case. That is faster if "gs" is normally zero, since that
means that we can avoid that loadsegment.

HOWEVER. That is actually not the right (well, "complete") conditional,
since it's only one sub-case of the thing that matters. The right
conditional is probably

/*
* Restore %gs if needed (which is common).
* We can avoid it if they are identical, and
* point to the GDT.
*/
if ((prev->gs ^ next->gs) | (next->gs & 4))
loadsegment(gs, next->gs);

At that point, we would only have to reload stuff if the user actually
uses a local descriptor table entry (which does happen for threaded apps,
I guess, but at least we avoid it for all the common traditional UNIX
processes).

Linus
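
(For reference, the "next->gs & 4" test above keys off the selector's table-indicator bit; a small sketch of the selector layout, illustrative only:)

#include <stdio.h>

/* x86 segment selector: bits 0-1 = RPL, bit 2 = TI (0 GDT / 1 LDT),
 * bits 3-15 = descriptor table index */
static void decode(unsigned int sel)
{
        printf("%#06x: index=%u table=%s rpl=%u\n",
               sel, sel >> 3, (sel & 4) ? "LDT" : "GDT", sel & 3);
}

int main(void)
{
        decode(0x2b);   /* a typical GDT data selector (entry 5, RPL 3) */
        decode(0x0f);   /* an LDT selector (entry 1, RPL 3) -- forces a reload above */
        decode(0);      /* the NUL selector */
        return 0;
}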

2007-11-29 20:03:16

by H. Peter Anvin

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

Ingo Molnar wrote:
> * Andi Kleen <[email protected]> wrote:
>
>> For i386 iirc Jeremy/Zach did the benchmarking and they settled on %fs
>> because it was faster for something (originally it was %gs too)
>
> yep. IIRC, some CPUs only optimize %fs because that's what Windows uses
> and leaves Linux with %gs out in the cold. There's also a performance
> penalty for overlapping segment use, if the segment cache is single
> entry only with an additional optimization for NULL [which just hides
> the segment cache].
>

For the 32-bit case, which is the only one that can be changed at all:

I guess, specifically, that assuming a sysenter implementation (meaning
CS is handled ad hoc by the sysenter/sysexit instructions) we have
USER_DS, KERNEL_DS, and the kernel thread pointer. If the segments
don't overlap, the user thread pointer gets loaded once per exec or task
switch, and doesn't change in between. If they do, the user thread
pointer has to be reloaded on system call exit.

A nonzero segment load involves a memory reference followed by
data-dependent traps on that reference, so the amount of reordering the
CPU can do to hide that latency is limited. A zero segment load doesn't
perform the memory reference at all.

Note that a segment cache (a proper cache, not the segment descriptor
registers that the Intel docs bogusly call a "cache") does *not* save
the memory reference, since if the descriptor has changed in memory it
*has* to be honoured; it only allows it to be performed lazily (assume
the cache is valid, then throw an internal exception and don't commit
state if the descriptor stored in the cache tag doesn't match the
descriptor loaded from memory.)

-hpa

2007-11-29 20:11:43

by Andi Kleen

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task


> HOWEVER. That is actually not the right (well, "complete") conditional,
> since it's only one sub-case of the thing that matters. The right
> conditional is probably
>
> /*
> * Restore %gs if needed (which is common).
> * We can avoid it if they are identical, and
> * point to the GDT.

How would you catch the (common) case of them having different bases in the
GDT TLS entries? At some point the selector has to be reloaded, otherwise
it won't be picked up by the CPU.

I think the original condition is correct. You could maybe merge it with the
TLS entry rewrite and only do it if something changes there. Not
sure it is worth it though.

-Andi
> */
> if ((prev->gs ^ next->gs) | (next->gs & 4))
> loadsegment(gs, next->gs);
>
> At that point, we would only have to reload stuff if the user actually
> uses a local descriptor table entry (which does happen for threaded apps,
> I guess, but at least we avoid it for all the common traditional UNIX
> processes).
>
> Linus
>


2007-11-29 20:26:01

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



On Thu, 29 Nov 2007, Andi Kleen wrote:
>
> How would you catch the (common) case of them having different bases in the
> GDT TLS entries? At some point the selector has to be reloaded, otherwise
> it won't be picked up by the CPU.

You're right. I somehow thought we were using the LDT for TLS entries,
which I guess we did back before kernel support, but yeah, we're using
per-CPU GDT entries, and would need to check that those match too.

So yeah, the only safe ones are the ones we don't ever change, and that is
basically just the kernel entries (that users cannot load anyway).

So scratch that. You're right - the NUL selector is the only special one
for the context switcher.

Linus

2007-11-29 22:22:52

by Roland McGrath

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

> > case offsetof(struct user32, regs.gs):
> > *val = child->thread.gsindex;
> > + if (child == current)
> > + asm("movl %%gs,%0" : "=r" (*val));
>
> Won't this return the kernel's GS instead of the user's?
[...]
> But this is x86_64, where swapgs is done on kernel entry.

As I understand it, and from what the documentation I have says, swapgs has
nothing to do with the %gs selector. It affects the "GS base register",
i.e. the MSR.


Thanks,
Roland
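
(A sketch of the distinction being drawn, in kernel context on x86-64: SWAPGS exchanges the GS base in MSR_GS_BASE with the value parked in MSR_KERNEL_GS_BASE, while the visible %gs selector is a separate 16-bit register it never touches. Illustrative only, not from any posted patch:)

#include <linux/kernel.h>
#include <asm/msr.h>

static void show_gs_state(void)
{
        unsigned int sel;
        unsigned long base, shadow;

        asm("movl %%gs,%0" : "=r" (sel));       /* the selector: unchanged by SWAPGS */
        rdmsrl(MSR_GS_BASE, base);              /* base in use (the kernel's, after entry) */
        rdmsrl(MSR_KERNEL_GS_BASE, shadow);     /* the swapped-out user base */

        printk(KERN_DEBUG "gs sel=%#x base=%#lx shadow base=%#lx\n",
               sel, base, shadow);
}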

2007-11-29 22:25:49

by Roland McGrath

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

> But the ones that do the same thing for fs/es/ds are *not*. Those three
> registers are kernel mode registers (ds/es are the regular kernel data
> segment, fs is the per-cpu data segment), and restored on return to user
> space from the stack.

Um, really? This is x86-64 code. AIUI those values don't have any effect
at all in 64-bit mode (as the kernel is). I haven't found any code in
entry_64.S or ia32entry.S that touches them. __switch_to uses direct
access to the segment registers just as I've done.


Thanks,
Roland

2007-11-29 22:36:15

by Linus Torvalds

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task



On Thu, 29 Nov 2007, Roland McGrath wrote:
>
> Um, really? This is x86-64 code. AIUI those values don't have any effect
> at all in 64-bit mode (as the kernel is). I haven't found any code in
> entry_64.S or ia32entry.S that touches them. __switch_to uses direct
> access to the segment registers just as I've done.

Yes, you're right, I just looked at the ia32 part and didn't read your
patches right.

Linus

2007-11-29 23:01:53

by Chuck Ebbert

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task

On 11/29/2007 05:21 PM, Roland McGrath wrote:
>>> case offsetof(struct user32, regs.gs):
>>> *val = child->thread.gsindex;
>>> + if (child == current)
>>> + asm("movl %%gs,%0" : "=r" (*val));
>> Won't this return the kernel's GS instead of the user's?
> [...]
>> But this is x86_64, where swapgs is done on kernel entry.
>
> As I understand it, and from what the documentation I have says, swapgs has
> nothing to do with the %gs selector. It affects the "GS base register",
> i.e. the MSR.
>

Yep, I confused the GS selector with the base address in the descriptor.

2007-12-01 23:45:58

by Jeremy Fitzhardinge

Subject: Re: [PATCH x86/mm 6/6] x86-64 ia32 ptrace get/putreg32 current task


On Nov 29, 2007, at 2:44 PM, Ingo Molnar wrote:

>
> * Andi Kleen <[email protected]> wrote:
>
>> For i386 iirc Jeremy/Zach did the benchmarking and they settled on %fs
>> because it was faster for something (originally it was %gs too)
>
> yep. IIRC, some CPUs only optimize %fs because that's what Windows uses
> and leaves Linux with %gs out in the cold.

I did measure some anomalies with the AMD K6+ (or something like
that), in which %gs was faster than %fs. It was pretty much
inexplicable, but also unique - all other processors I tested (which
was a range from Pentium MMX to current) had identical performance.

> There's also a performance
> penalty for overlapping segment use, if the segment cache is single
> entry only with an additional optimization for NULL [which just hides
> the segment cache].

Some processors do perform slightly better with null selector loads
than GDT/LDT ones, but it wasn't really noticeable for modern
processors. The Intel architecture guy I asked about this said that
it might be worth doing, but it would likely be swamped by a GDT
cache miss. I looked at rearranging the kernel's GDT to pack all the
kernel entry/exit entries into as few cachelines as possible, but it
was surprisingly fiddly.

> But if it's good for unification we could switch that to %gs again on
> 32-bit. I was one of the people who advocated the use of the 'other'
> segment register, so that the hardware has less overlap, but clean and
> unified code trumps this concern. It shouldn't be an issue on reasonably
> modern CPUs anyway.

Well, overall it should be fairly easy to make the two arches use
their own segment registers with a simple #define. But things like
ptrace and vm86 were tricky, though I guess the latter isn't an issue
for 64-bit.

I originally chose %gs for the kernel, partly in the hope that
compiler support for TLS would be helpful in the kernel, though that
doesn't seem like a good idea in retrospect. %gs for the sake of
consistency would be reasonable, and wouldn't have a measurable
downside.

J