2011-05-26 18:50:35

by Thomas Gleixner

[permalink] [raw]
Subject: [GIT pull] x86 vdso updates

Linus,

Please pull the latest x86-vdso-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86-vdso-for-linus

Thanks,

tglx

------------------>
Andy Lutomirski (8):
x86-64: Clean up vdso/kernel shared variables
x86-64: Remove unnecessary barrier in vread_tsc
x86-64: Don't generate cmov in vread_tsc
x86-64: Vclock_gettime(CLOCK_MONOTONIC) can't ever see nsec < 0
x86-64: Move vread_tsc into a new file with sensible options
x86-64: Turn off -pg and turn on -foptimize-sibling-calls for vDSO
x86-64: Add time to vDSO
x86-64: Optimize vDSO time()

Thomas Gleixner (1):
x86: vdso: Remove unused variable


arch/x86/include/asm/tsc.h | 4 ++
arch/x86/include/asm/vdso.h | 14 -------
arch/x86/include/asm/vgtod.h | 2 -
arch/x86/include/asm/vsyscall.h | 12 +-----
arch/x86/include/asm/vvar.h | 52 +++++++++++++++++++++++++++
arch/x86/kernel/Makefile | 8 +++--
arch/x86/kernel/time.c | 2 +-
arch/x86/kernel/tsc.c | 19 ----------
arch/x86/kernel/vmlinux.lds.S | 34 ++++++------------
arch/x86/kernel/vread_tsc_64.c | 36 +++++++++++++++++++
arch/x86/kernel/vsyscall_64.c | 46 ++++++++++--------------
arch/x86/vdso/Makefile | 17 ++++++++-
arch/x86/vdso/vclock_gettime.c | 74 ++++++++++++++++++++++++++++-----------
arch/x86/vdso/vdso.lds.S | 9 +----
arch/x86/vdso/vextern.h | 16 --------
arch/x86/vdso/vgetcpu.c | 3 +-
arch/x86/vdso/vma.c | 27 --------------
arch/x86/vdso/vvar.c | 12 ------
18 files changed, 202 insertions(+), 185 deletions(-)
create mode 100644 arch/x86/include/asm/vvar.h
create mode 100644 arch/x86/kernel/vread_tsc_64.c
delete mode 100644 arch/x86/vdso/vextern.h
delete mode 100644 arch/x86/vdso/vvar.c

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd..9db5583 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);

+#ifdef CONFIG_X86_64
+extern cycles_t vread_tsc(void);
+#endif
+
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052..bb05228 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
#ifndef _ASM_X86_VDSO_H
#define _ASM_X86_VDSO_H

-#ifdef CONFIG_X86_64
-extern const char VDSO64_PRELINK[];
-
-/*
- * Given a pointer to the vDSO image, find the pointer to VDSO64_name
- * as that symbol is defined in the vDSO sources or linker script.
- */
-#define VDSO64_SYMBOL(base, name) \
-({ \
- extern const char VDSO64_##name[]; \
- (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
-})
-#endif
-
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
extern const char VDSO32_PRELINK[];

diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e20..646b4c1 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -23,8 +23,6 @@ struct vsyscall_gtod_data {
struct timespec wall_to_monotonic;
struct timespec wall_time_coarse;
};
-extern struct vsyscall_gtod_data __vsyscall_gtod_data
-__section_vsyscall_gtod_data;
extern struct vsyscall_gtod_data vsyscall_gtod_data;

#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d2..d555973 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,19 @@ enum vsyscall_num {
#ifdef __KERNEL__
#include <linux/seqlock.h>

-#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
-#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
-
/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __section_vsyscall_gtod_data __attribute__ \
- ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
-#define __section_vsyscall_clock __attribute__ \
- ((unused, __section__ (".vsyscall_clock"),aligned(16)))
#define __vsyscall_fn \
__attribute__ ((unused, __section__(".vsyscall_fn"))) notrace

#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2

-extern int __vgetcpu_mode;
-extern volatile unsigned long __jiffies;
-
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;

+#include <asm/vvar.h>
+
extern void map_vsyscall(void);

#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 0000000..341b355
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,52 @@
+/*
+ * vvar.h: Shared vDSO/kernel variable declarations
+ * Copyright (c) 2011 Andy Lutomirski
+ * Subject to the GNU General Public License, version 2
+ *
+ * A handful of variables are accessible (read-only) from userspace
+ * code in the vsyscall page and the vdso. They are declared here.
+ * Some other file must define them with DEFINE_VVAR.
+ *
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+ * Each of these variables lives in the vsyscall page, and each
+ * one needs a unique offset within the little piece of the page
+ * reserved for vvars. Specify that offset in DECLARE_VVAR.
+ * (There are 896 bytes available. If you mess up, the linker will
+ * catch it.)
+ */
+
+/* Offset of vars within vsyscall page */
+#define VSYSCALL_VARS_OFFSET (3072 + 128)
+
+#if defined(__VVAR_KERNEL_LDS)
+
+/* The kernel linker script defines its own magic to put vvars in the
+ * right place.
+ */
+#define DECLARE_VVAR(offset, type, name) \
+ EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
+
+#else
+
+#define DECLARE_VVAR(offset, type, name) \
+ static type const * const vvaraddr_ ## name = \
+ (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
+
+#define DEFINE_VVAR(type, name) \
+ type __vvar_ ## name \
+ __attribute__((section(".vsyscall_var_" #name), aligned(16)))
+
+#define VVAR(name) (*vvaraddr_ ## name)
+
+#endif
+
+/* DECLARE_VVAR(offset, type, name) */
+
+DECLARE_VVAR(0, volatile unsigned long, jiffies)
+DECLARE_VVAR(8, int, vgetcpu_mode)
+DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+
+#undef DECLARE_VVAR
+#undef VSYSCALL_VARS_OFFSET
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2508064..f5abe3a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)

ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
-CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,13 +23,16 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
+CFLAGS_vread_tsc_64.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
GCOV_PROFILE_tsc.o := n
GCOV_PROFILE_paravirt.o := n

+# vread_tsc_64 is hot and should be fully optimized:
+CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
+
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a2..00cbb27 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
#include <asm/time.h>

#ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif

unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7..6cc6922 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
ret : clocksource_tsc.cycle_last;
}

-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret;
-
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
-
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-#endif
-
static void resume_tsc(struct clocksource *cs)
{
clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 49927a8..74b5ad4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,6 +161,12 @@ SECTIONS

#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
+ ADDR(.vsyscall_0) + offset \
+ : AT(VLOAD(.vsyscall_var_ ## x)) { \
+ *(.vsyscall_var_ ## x) \
+ } \
+ x = VVIRT(.vsyscall_var_ ## x);

. = ALIGN(4096);
__vsyscall_0 = .;
@@ -175,18 +181,6 @@ SECTIONS
*(.vsyscall_fn)
}

- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
-
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
- }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
*(.vsyscall_1)
}
@@ -194,21 +188,14 @@ SECTIONS
*(.vsyscall_2)
}

- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
- }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
- }
- jiffies = VVIRT(.jiffies);
-
.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
*(.vsyscall_3)
}

+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+
. = __vsyscall_0 + PAGE_SIZE;

#undef VSYSCALL_ADDR
@@ -216,6 +203,7 @@ SECTIONS
#undef VLOAD
#undef VVIRT_OFFSET
#undef VVIRT
+#undef EMIT_VVAR

#endif /* CONFIG_X86_64 */

diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 0000000..a81aa9e
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
+/* This code runs in userspace. */
+
+#define DISABLE_BRANCH_PROFILING
+#include <asm/vgtod.h>
+
+notrace cycle_t __vsyscall_fn vread_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a funciton of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c..5f6ad03 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,15 +49,8 @@
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"

-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
*/
static __always_inline void do_get_tz(struct timezone * tz)
{
- *tz = __vsyscall_gtod_data.sys_tz;
+ *tz = VVAR(vsyscall_gtod_data).sys_tz;
}

static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);

- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ vread = VVAR(vsyscall_gtod_data).clock.vread;
+ if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
+ !vread)) {
gettimeofday(tv,NULL);
return;
}

now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
+ base = VVAR(vsyscall_gtod_data).clock.cycle_last;
+ mask = VVAR(vsyscall_gtod_data).clock.mask;
+ mult = VVAR(vsyscall_gtod_data).clock.mult;
+ shift = VVAR(vsyscall_gtod_data).clock.shift;

- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
+ nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
+ } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));

/* calculate interval: */
cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
{
unsigned seq;
time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
+ if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
return time_syscall(t);

do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+ seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);

- result = __vsyscall_gtod_data.wall_time_sec;
+ result = VVAR(vsyscall_gtod_data).wall_time_sec;

- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+ } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));

if (t)
*t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
We do this here because otherwise user space would do it on
its own in a likely inferior way (no access to jiffies).
If you don't like it pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
+ if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b1..bef0bc9 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)


# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o

# files to link into kernel
obj-$(VDSO64-y) += vma.o vdso.o
@@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)

+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector)
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls

$(vobjs): KBUILD_CFLAGS += $(CFL)

+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vvar.o = -pg
+
targets += vdso-syms.lds
obj-$(VDSO64-y) += vdso-syms.lds

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754..a724905 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,7 +2,7 @@
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
- * Fast user context implementation of clock_gettime and gettimeofday.
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
*
* The code should have no internal unresolved relocations.
* Check with readelf after changing.
@@ -22,9 +22,8 @@
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
-#include "vextern.h"

-#define gtod vdso_vsyscall_gtod_data
+#define gtod (&VVAR(vsyscall_gtod_data))

notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
return 0;
}

-/* Copy of the version in kernel/time.c which we cannot directly access */
-notrace static void
-vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
-{
- while (nsec >= NSEC_PER_SEC) {
- nsec -= NSEC_PER_SEC;
- ++sec;
- }
- while (nsec < 0) {
- nsec += NSEC_PER_SEC;
- --sec;
- }
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
-}
-
notrace static noinline int do_monotonic(struct timespec *ts)
{
unsigned long seq, ns, secs;
@@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
+ * are all guaranteed to be nonnegative.
+ */
+ while (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}

@@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
secs += gtod->wall_to_monotonic.tv_sec;
ns += gtod->wall_to_monotonic.tv_nsec;
} while (unlikely(read_seqretry(&gtod->lock, seq)));
- vset_normalized_timespec(ts, secs, ns);
+
+ /* wall_time_nsec and wall_to_monotonic.tv_nsec are
+ * guaranteed to be between 0 and NSEC_PER_SEC.
+ */
+ if (ns >= NSEC_PER_SEC) {
+ ns -= NSEC_PER_SEC;
+ ++secs;
+ }
+ ts->tv_sec = secs;
+ ts->tv_nsec = ns;
+
return 0;
}

@@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
+
+/* This will break when the xtime seconds get inaccurate, but that is
+ * unlikely */
+
+static __always_inline long time_syscall(long *t)
+{
+ long secs;
+ asm volatile("syscall"
+ : "=a" (secs)
+ : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
+ return secs;
+}
+
+notrace time_t __vdso_time(time_t *t)
+{
+ time_t result;
+
+ if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
+ return time_syscall(t);
+
+ /* This is atomic on x86_64 so we don't need any locks. */
+ result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
+
+ if (t)
+ *t = result;
+ return result;
+}
+int time(time_t *t)
+ __attribute__((weak, alias("__vdso_time")));
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b..b96b267 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
__vdso_gettimeofday;
getcpu;
__vdso_getcpu;
+ time;
+ __vdso_time;
local: *;
};
}

VDSO64_PRELINK = VDSO_PRELINK;
-
-/*
- * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
- */
-#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
-#include "vextern.h"
-#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2..0000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef VEXTERN
-#include <asm/vsyscall.h>
-#define VEXTERN(x) \
- extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
-#endif
-
-#define VMAGIC 0xfeedbabeabcdefabUL
-
-/* Any kernel variables used in the vDSO must be exported in the main
- kernel's vmlinux.lds.S/vsyscall.h/proper __section and
- put into vextern.h and be referenced as a pointer with vdso prefix.
- The main kernel later fills in the values. */
-
-VEXTERN(jiffies)
-VEXTERN(vgetcpu_mode)
-VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b2..5463ad5 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
#include <linux/time.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
-#include "vextern.h"

notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;

- if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+ if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f..7abd2be 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,9 +15,6 @@
#include <asm/proto.h>
#include <asm/vdso.h>

-#include "vextern.h" /* Just for VMAGIC. */
-#undef VEXTERN
-
unsigned int __read_mostly vdso_enabled = 1;

extern char vdso_start[], vdso_end[];
@@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid;
static struct page **vdso_pages;
static unsigned vdso_size;

-static inline void *var_ref(void *p, char *name)
-{
- if (*(void **)p != (void *)VMAGIC) {
- printk("VDSO: variable %s broken\n", name);
- vdso_enabled = 0;
- }
- return p;
-}
-
static int __init init_vdso_vars(void)
{
int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
int i;
- char *vbase;

vdso_size = npages << PAGE_SHIFT;
vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
@@ -54,20 +41,6 @@ static int __init init_vdso_vars(void)
copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
}

- vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
- if (!vbase)
- goto oom;
-
- if (memcmp(vbase, "\177ELF", 4)) {
- printk("VDSO: I'm broken; not ELF\n");
- vdso_enabled = 0;
- }
-
-#define VEXTERN(x) \
- *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
-#include "vextern.h"
-#undef VEXTERN
- vunmap(vbase);
return 0;

oom:
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703..0000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Define pointer to external vDSO variables.
- These are part of the vDSO. The kernel fills in the real addresses
- at boot time. This is done because when the vdso is linked the
- kernel isn't yet and we don't know the final addresses. */
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/vgtod.h>
-
-#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
-#include "vextern.h"


2011-05-26 21:26:42

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On 05/26/2011 02:50 PM, Thomas Gleixner wrote:
> x86-64: Add time to vDSO

And here's the glibc bug:

http://sourceware.org/bugzilla/show_bug.cgi?id=12813

Let the deprecation of the vsyscall page begin :)

--Andy

2011-05-27 06:12:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates


* Andy Lutomirski <[email protected]> wrote:

> On 05/26/2011 02:50 PM, Thomas Gleixner wrote:
> > x86-64: Add time to vDSO
>
> And here's the glibc bug:
>
> http://sourceware.org/bugzilla/show_bug.cgi?id=12813
>
> Let the deprecation of the vsyscall page begin :)

Yeah :-)

On a related note, now that these bits are upstream, what are your
rough plans for doing the int81 vsyscall emulation patch?

The int81 patch would actually be (much!) more important to the
average Linux user than getting rid of the vsyscall from static
binaries: the vsyscall is the last predictable executable address in
PIE daemons with a dangerous SYSENTER instruction in it ...

I'd actually consider accelerating it into v2.6.40.

Thanks,

Ingo

2011-05-27 11:36:34

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 2:12 AM, Ingo Molnar <[email protected]> wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
>> On 05/26/2011 02:50 PM, Thomas Gleixner wrote:
>> > x86-64: Add time to vDSO
>>
>> And here's the glibc bug:
>>
>> http://sourceware.org/bugzilla/show_bug.cgi?id=12813
>>
>> Let the deprecation of the vsyscall page begin :)
>
> Yeah :-)
>
> On a related note, now that these bits are upstream, what are your
> rough plans for doing the int81 vsyscall emulation patch?
>
> The int81 patch would actually be (much!) more important to the
> average Linux user than getting rid of the vsyscall from static
> binaries: the vsyscall is the last predictable executable address in
> PIE daemons with a dangerous SYSENTER instruction in it ...
>
> I'd actually consider accelerating it into v2.6.40.

Working on it slowly. In principle I'm writing my thesis right now on
things that have very little to do with software :)

My rough plans for the near-term stuff are:

1. Move vvars out of the vsyscall page. Otherwise we have at least
one predictable syscall instruction every 1<<16 seconds because the
time is executable. It's prettier that way, too.

2. Remove the vsyscall64 sysctl. That will reduce the number of
vsyscalls that require the kernel's help to one. (This is a bit
unfortunate for UML users, but I'm not sure what to do about that.
UML vgetcpu is already terminally broken.)

3. Add int 0xcc and use it from vgettimeofday. It will SIGSEGV if
called from a user address (so it has no risk of ever becoming ABI)
and it will do gettimeofday if called from the right address. (I like
0xcc better than 0x81 because then I don't have to wonder whether any
syscall-like instructions start with 0x81.) I'm not convinced that
the existing syscall entries are usable, because syscall itself has a
different calling convention and int 0x80 is a compat syscall.


I might get this done in something resembling time for 2.6.40, but no
guarantees.

For later, the whole vsyscall page could become 0xcd <0xcc * 1023>,
repeated four times.

To save me a bunch of spec reading and code dissection, can you
explain the difference between zeroentry and paranoidzeroentry?

>
> Thanks,
>
> Ingo
>

2011-05-27 11:59:55

by Richard Weinberger

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 1:36 PM, Andrew Lutomirski <[email protected]> wrote:
> On Fri, May 27, 2011 at 2:12 AM, Ingo Molnar <[email protected]> wrote:
>>
>> * Andy Lutomirski <[email protected]> wrote:
>>
>>> On 05/26/2011 02:50 PM, Thomas Gleixner wrote:
>>> > x86-64: Add time to vDSO
>>>
>>> And here's the glibc bug:
>>>
>>> http://sourceware.org/bugzilla/show_bug.cgi?id=12813
>>>
>>> Let the deprecation of the vsyscall page begin :)
>>
>> Yeah :-)
>>
>> On a related note, now that these bits are upstream, what are your
>> rough plans for doing the int81 vsyscall emulation patch?
>>
>> The int81 patch would actually be (much!) more important to the
>> average Linux user than getting rid of the vsyscall from static
>> binaries: the vsyscall is the last predictable executable address in
>> PIE daemons with a dangerous SYSENTER instruction in it ...
>>
>> I'd actually consider accelerating it into v2.6.40.
>
> Working on it slowly. In principle I'm writing my thesis right now on
> things that have very little to do with software :)
>
> My rough plans for the near-term stuff are:
>
> 1. Move vvars out of the vsyscall page. Otherwise we have at least
> one predictable syscall instruction every 1<<16 seconds because the
> time is executable. It's prettier that way, too.
>
> 2. Remove the vsyscall64 sysctl. That will reduce the number of
> vsyscalls that require the kernel's help to one. (This is a bit
> unfortunate for UML users, but I'm not sure what to do about that.
> UML vgetcpu is already terminally broken.)

Can you please be a bit more precise on that?
When there is anything I can do, let me know...

--
Thanks,
//richard

2011-05-27 12:11:00

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
<[email protected]> wrote:
> On Fri, May 27, 2011 at 1:36 PM, Andrew Lutomirski <[email protected]> wrote:
>> 2. Remove the vsyscall64 sysctl. That will reduce the number of
>> vsyscalls that require the kernel's help to one. (This is a bit
>> unfortunate for UML users, but I'm not sure what to do about that.
>> UML vgetcpu is already terminally broken.)
>
> Can you please be a bit more precise on that?
> When there is anything I can do, let me know...

glibc's implementation of time, sched_getcpu, and (for static
binaries) gettimeofday calls a fixed address in kernel space which
returns the answer. On UML, that address is presumably in *host*
kernel space and the host will return what the host thinks the answer
is.

That's a bit unfortunate if the guest has a different idea of cpu
numbering or what time it is. Right now, setting kernel.vsyscall64=0
will turn gettimeofday and time (but not sched_getcpu) into a real
syscall. The problem is that I want to remove the vsyscall64 sysctl.

If this is considered enough of a regression, then I guess we can
leave vsyscall64 around for awhile, but it will require extra work in
the soon-to-be syscall emulation hack to make sure that UML can still
trap the syscall.

The real solution is to fix glibc to use the vDSO which should avoid
this problem entirely.

--Andy

>
> --
> Thanks,
> //richard
>

2011-05-27 14:55:20

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> 3. Add int 0xcc and use it from vgettimeofday. It will SIGSEGV if
> called from a user address (so it has no risk of ever becoming ABI)
> and it will do gettimeofday if called from the right address. (I like
> 0xcc better than 0x81 because then I don't have to wonder whether any
> syscall-like instructions start with 0x81.) I'm not convinced that
> the existing syscall entries are usable, because syscall itself has a
> different calling convention and int 0x80 is a compat syscall.
>

I started looking at what needs to be done and I wanted to get your
opinion before I wrote a bunch of code that you'd reject. Here are
three ideas for how the int 0xcc / int 0x81 entry could work:

*** Idea 1 ***

Make it a real syscall but with extra constraints. It would have the
same calling convention as the syscall instruction, but it would turn
into SIGKILL if the calling address isn't in the VSYSCALL page or if
the syscall number isn't __NR_clock_gettimeofday. It would BUG() if
called from kernel mode. There are two ways to implement this:

1. Have the interrupt entry check constraints, twiddle its stack frame
to look like a syscall instruction, and jump to the syscall entry.
This way there's little code duplication. (Is it safe to sysret back
to userspace from an interrupt gate? I don't see why not, but it
seems to violate the spirit of the thing.)

2. Duplicate the syscall entry. Ugly.

(int 0x80 is ia32_syscall which is unworkable because it's not there
on !COMPAT and because it calls the compat wrapper which would make
the whole thing a mess.)

Pros:
- ptrace, audit, seccomp, etc. still work. (Although what happens if
ptrace changes the syscall number?)

Cons:
- If we ever want to emulate the whole vsyscall instead of just the
fallback (i.e. stick the int 0xcc instruction at the vsyscall entry)
then it's back to the drawing board.

*** Idea 2 ***

Write the whole thing in C.

Pros: Easy to write and easy to maintain.

Cons:
- We'd have to actually think about ptrace, audit, and seccomp semantics.
- A touch slow. Probably doesn't matter.
- If we let ptrace see the entry and think it's a syscall, then
ptrace might think it can emulate the syscall and things will break
unless we're very careful.

I'm inclined to go with idea 2 with these elaborations:
- If seccomp is enabled, SIGKILL. Might as well match vDSO behavior.
- Don't audit or call ptrace. These things aren't real syscalls and
that would just be confusing. In any case, audit will never see the
non-fallback paths for the vDSO.

--Andy

2011-05-27 14:59:52

by Richard Weinberger

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 2:10 PM, Andrew Lutomirski <[email protected]> wrote:
> On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
> <[email protected]> wrote:
>> On Fri, May 27, 2011 at 1:36 PM, Andrew Lutomirski <[email protected]> wrote:
>>> 2. Remove the vsyscall64 sysctl. That will reduce the number of
>>> vsyscalls that require the kernel's help to one. (This is a bit
>>> unfortunate for UML users, but I'm not sure what to do about that.
>>> UML vgetcpu is already terminally broken.)
>>
>> Can you please be a bit more precise on that?
>> When there is anything I can do, let me know...
>
> glibc's implementation of time, sched_getcpu, and (for static
> binaries) gettimeofday calls a fixed address in kernel space which
> returns the answer. On UML, that address is presumably in *host*
> kernel space and the host will return what the host thinks the answer
> is.
>
> That's a bit unfortunate if the guest has a different idea of cpu
> numbering or what time it is. Right now, setting kernel.vsyscall64=0
> will turn gettimeofday and time (but not sched_getcpu) into a real
> syscall. The problem is that I want to remove the vsyscall64 sysctl.
>
> If this is considered enough of a regression, then I guess we can
> leave vsyscall64 around for awhile, but it will require extra work in
> the soon-to-be syscall emulation hack to make sure that UML can still
> trap the syscall.

As long as the time within UML is synchronous with the host everything is
fine, right?
So, as _last_ choice we could disable the ability to change the time within UML.

IMHO it's not a big deal when getcpu() returns a wrong CPU layout on UML.

> The real solution is to fix glibc to use the vDSO which should avoid
> this problem entirely.

Is this "bug" known? And the biggest question, will Ulrich fix it some day?

--
Thanks,
//richard

2011-05-27 15:06:12

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 10:59 AM, richard -rw- weinberger
<[email protected]> wrote:
> On Fri, May 27, 2011 at 2:10 PM, Andrew Lutomirski <[email protected]> wrote:
>> On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
>> If this is considered enough of a regression, then I guess we can
>> leave vsyscall64 around for awhile, but it will require extra work in
>> the soon-to-be syscall emulation hack to make sure that UML can still
>> trap the syscall.
>
> As long the time within UML is synchronous with the host everything is
> fine, right?

I think so. I haven't used UML in a long time.

> So, as _last_ choice we could disable the ability to change the time within UML.
>
> IMHO it's not a big deal when getcpu() returns a wrong CPU layout on UML.
>
>> The real solution is to fix glibc to use the vDSO which should avoid
>> this problem entirely.
>
> Is this "bug" known? And the biggest question, will Ulrich fix it some day?

I added the bit about UML to
http://sourceware.org/bugzilla/show_bug.cgi?id=12813. I don't know
what Ulrich will do.

--Andy

2011-05-28 15:35:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates


* Andrew Lutomirski <[email protected]> wrote:

> On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> > 3. Add int 0xcc and use it from vgettimeofday.  It will SIGSEGV if
> > called from a user address (so it has no risk of ever becoming ABI)
> > and it will do gettimeofday if called from the right address.  (I like
> > 0xcc better than 0x81 because then I don't have to wonder whether any
> > syscall-like instructions start with 0x81.)  I'm not convinced that
> > the existing syscall entries are usable, because syscall itself has a
> > different calling convention and int 0x80 is a compat syscall.
> >
>
> I started looking at what needs to be done and I wanted to get your
> opinion before I wrote a bunch of code that you'd reject. Here are
> three ideas for how the int 0xcc / int 0x81 entry could work:
>
> *** Idea 1 ***
>
> Make it a real syscall but with extra constraints. It would have the
> same calling convention as the syscall instruction, but it would turn
> into SIGKILL if the calling address isn't in the VSYSCALL page or if
> the syscall number isn't __NR_clock_gettimeofday. It would BUG() if
> called from kernel mode. There are two ways to implement this:
>
> 1. Have the interrupt entry check constraints, twiddle its stack frame
> to look like a syscall instruction, and jump to the syscall entry.
> This way there's little code duplication. (Is it safe to sysret back
> to userspace from an interrupt gate? I don't see why not, but it
> seems to violate the spirit of the thing.)

Yeah, i think it should be safe. Lets try this? It looks like the
simplest variant.

Thanks,

Ingo

2011-05-29 01:41:46

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sat, May 28, 2011 at 11:35 AM, Ingo Molnar <[email protected]> wrote:
>
> * Andrew Lutomirski <[email protected]> wrote:
>
>> On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
>> > 3. Add int 0xcc and use it from vgettimeofday.  It will SIGSEGV if
>> > called from a user address (so it has no risk of ever becoming ABI)
>> > and it will do gettimeofday if called from the right address.  (I like
>> > 0xcc better than 0x81 because then I don't have to wonder whether any
>> > syscall-like instructions start with 0x81.)  I'm not convinced that
>> > the existing syscall entries are usable, because syscall itself has a
>> > different calling convention and int 0x80 is a compat syscall.
>> >
>>
>> I started looking at what needs to be done and I wanted to get your
>> opinion before I wrote a bunch of code that you'd reject.  Here are
>> three ideas for how the int 0xcc / int 0x81 entry could work:
>>
>> *** Idea 1 ***
>>
>> Make it a real syscall but with extra constraints.  It would have the
>> same calling convention as the syscall instruction, but it would turn
>> into SIGKILL if the calling address isn't in the VSYSCALL page or if
>> the syscall number isn't __NR_clock_gettimeofday.  It would BUG() if
>> called from kernel mode.  There are two ways to implement this:
>>
>> 1. Have the interrupt entry check constraints, twiddle its stack frame
>> to look like a syscall instruction, and jump to the syscall entry.
>> This way there's little code duplication.  (Is it safe to sysret back
>> to userspace from an interrupt gate?  I don't see why not, but it
>> seems to violate the spirit of the thing.)
>
> Yeah, i think it should be safe. Lets try this? It looks like the
> simplest variant.

The code's in the thread "[PATCH 0/5] x86-64: Remove syscall
instructions at fixed addresses".

The interrupt handler ought to be reviewed especially carefully for
security since user code can call it at will. It has two glaring
problems that I've found already, and I'll send a v2 out soon.

--Andy

2011-05-29 09:51:23

by Richard Weinberger

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Fri, May 27, 2011 at 5:05 PM, Andrew Lutomirski <[email protected]> wrote:
> On Fri, May 27, 2011 at 10:59 AM, richard -rw- weinberger
> <[email protected]> wrote:
>> On Fri, May 27, 2011 at 2:10 PM, Andrew Lutomirski <[email protected]> wrote:
>>> On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
>>> If this is considered enough of a regression, then I guess we can
>>> leave vsyscall64 around for awhile, but it will require extra work in
>>> the soon-to-be syscall emulation hack to make sure that UML can still
>>> trap the syscall.
>>
>> As long the time within UML is synchronous with the host everything is
>> fine, right?
>
> I think so.  I haven't used UML in a long time.
>
>> So, as _last_ choice we could disable the ability to change the time within UML.
>>
>> IMHO it's not a big deal when getcpu() returns a wrong CPU layout on UML.
>>
>>> The real solution is to fix glibc to use the vDSO which should avoid
>>> this problem entirely.
>>

Yesterday I had a closer look at 64bit UML.
Glibc is always using vsyscalls because 64bit UML does not support the vDSO.

On 32bit, UML simply scans the ELF auxiliary vector provided by the host to
get the address of the vDSO.
How can I get this address on a 64bit host?

--
Thanks,
//richard

2011-05-29 14:39:53

by Mikael Pettersson

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

Ingo Molnar writes:
>
> * Andrew Lutomirski <[email protected]> wrote:
>
> > On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> > > 3. Add int 0xcc and use it from vgettimeofday. ?It will SIGSEGV if
> > > called from a user address (so it has no risk of ever becoming ABI)
> > > and it will do gettimeofday if called from the right address. ?(I like
...
> > Make it a real syscall but with extra constraints. It would have the
> > same calling convention as the syscall instruction, but it would turn
> > into SIGKILL if the calling address isn't in the VSYSCALL page

This will make things difficult for user-space dynamic binary instrumentation
applications, since these normally execute generated code at different
addresses than the original code.

Is there a safe fallback for this particular vsyscall?

/Mikael

2011-05-29 14:58:12

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 5:51 AM, richard -rw- weinberger
<[email protected]> wrote:
> On Fri, May 27, 2011 at 5:05 PM, Andrew Lutomirski <[email protected]> wrote:
>> On Fri, May 27, 2011 at 10:59 AM, richard -rw- weinberger
>> <[email protected]> wrote:
>>> On Fri, May 27, 2011 at 2:10 PM, Andrew Lutomirski <[email protected]> wrote:
>>>> On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
>>>> If this is considered enough of a regression, then I guess we can
>>>> leave vsyscall64 around for awhile, but it will require extra work in
>>>> the soon-to-be syscall emulation hack to make sure that UML can still
>>>> trap the syscall.
>>>
>>> As long the time within UML is synchronous with the host everything is
>>> fine, right?
>>
>> I think so.  I haven't used UML in a long time.
>>
>>> So, as _last_ choice we could disable the ability to change the time within UML.
>>>
>>> IMHO it's not a big deal when getcpu() returns a wrong CPU layout on UML.
>>>
>>>> The real solution is to fix glibc to use the vDSO which should avoid
>>>> this problem entirely.
>>>
>
> Yesterday I had a closer look at 64bit UML.
> Glibc is always using vsyscalls because 64bit UML does not support the vDSO.
>
> On 32bit UML simply scans the ELF auxiliary vector provided by the host to
> get the address of the vDSO.
> How can I get this address on a 64bit host?

I believe it's exactly the same. There's an auxv entry that points to the vDSO.

--Andy

2011-05-29 14:59:32

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 10:39 AM, Mikael Pettersson <[email protected]> wrote:
> Ingo Molnar writes:
>  >
>  > * Andrew Lutomirski <[email protected]> wrote:
>  >
>  > > On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
>  > > > 3. Add int 0xcc and use it from vgettimeofday.  It will SIGSEGV if
>  > > > called from a user address (so it has no risk of ever becoming ABI)
>  > > > and it will do gettimeofday if called from the right address.  (I like
> ...
>  > > Make it a real syscall but with extra constraints.  It would have the
>  > > same calling convention as the syscall instruction, but it would turn
>  > > into SIGKILL if the calling address isn't in the VSYSCALL page
>
> This will make things difficult for user-space dynamic binary instrumentation
> applications, since these normally execute generated code at different
> addresses than the original code.
>
> Is there a safe fallback for this particular vsyscall?

All of the vsyscalls have vDSO versions that work like any other code.

Alternatively, if the dynamic instrumentation code knew about
vsyscalls, it could just not instrument addresses in the vsyscall
page.

What existing applications would get broken?

--Andy

2011-05-29 15:10:57

by Richard Weinberger

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 4:57 PM, Andrew Lutomirski <[email protected]> wrote:
> On Sun, May 29, 2011 at 5:51 AM, richard -rw- weinberger
> <[email protected]> wrote:
>> On Fri, May 27, 2011 at 5:05 PM, Andrew Lutomirski <[email protected]> wrote:
>>> On Fri, May 27, 2011 at 10:59 AM, richard -rw- weinberger
>>> <[email protected]> wrote:
>>>> On Fri, May 27, 2011 at 2:10 PM, Andrew Lutomirski <[email protected]> wrote:
>>>>> On Fri, May 27, 2011 at 7:59 AM, richard -rw- weinberger
>>>>> If this is considered enough of a regression, then I guess we can
>>>>> leave vsyscall64 around for awhile, but it will require extra work in
>>>>> the soon-to-be syscall emulation hack to make sure that UML can still
>>>>> trap the syscall.
>>>>
>>>> As long the time within UML is synchronous with the host everything is
>>>> fine, right?
>>>
>>> I think so.  I haven't used UML in a long time.
>>>
>>>> So, as _last_ choice we could disable the ability to change the time within UML.
>>>>
>>>> IMHO it's not a big deal when getcpu() returns a wrong CPU layout on UML.
>>>>
>>>>> The real solution is to fix glibc to use the vDSO which should avoid
>>>>> this problem entirely.
>>>>
>>
>> Yesterday I had a closer look at 64bit UML.
>> Glibc is always using vsyscalls because 64bit UML does not support the vDSO.
>>
>> On 32bit UML simply scans the ELF auxiliary vector provided by the host to
>> get the address of the vDSO.
>> How can I get this address on a 64bit host?
>
> I believe it's exactly the same.  There's an auxv entry that points to the vDSO.

I don't think so.
See:
http://www.win.tue.nl/~aeb/linux/lk/lk-4.html
Section "Address space randomization".
The demo program finds the vDSO only on x86.

UML uses quite the same method to find it.
arch/um/os-Linux/elf_aux.c

--
Thanks,
//richard

2011-05-29 15:28:35

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 11:10 AM, richard -rw- weinberger
<[email protected]> wrote:
> On Sun, May 29, 2011 at 4:57 PM, Andrew Lutomirski <[email protected]> wrote:
>> On Sun, May 29, 2011 at 5:51 AM, richard -rw- weinberger
>> <[email protected]> wrote:
>>> Yesterday I had a closer look at 64bit UML.
>>> Glibc is always using vsyscalls because 64bit UML does not support the vDSO.
>>>
>>> On 32bit UML simply scans the ELF auxiliary vector provided by the host to
>>> get the address of the vDSO.
>>> How can I get this address on a 64bit host?
>>
>> I believe it's exactly the same.  There's an auxv entry that points to the vDSO.
>
> I don't think so.
> See:
> http://www.win.tue.nl/~aeb/linux/lk/lk-4.html
> Section "Address space randomization".
> The demo program finds the vDSO only on x86.
>
> UML uses quite the same method to find it.
> arch/um/os-Linux/elf_aux.c

The attached program works for me.

I don't know what this is supposed to mean, though:

    /* See if the page is under TASK_SIZE */
    if (vsyscall_ehdr < (unsigned long) envp)
        vsyscall_ehdr = 0;

First, envp != TASK_SIZE.

Second, the vDSO can be wherever it wants. On current kernels at
least it is *always* mapped below TASK_SIZE (in the unsigned sense)
because it's mapped into user address space.

--Andy


Attachments:
auxv.c (998.00 B)

2011-05-29 16:01:21

by Mikael Pettersson

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

Andrew Lutomirski writes:
> On Sun, May 29, 2011 at 10:39 AM, Mikael Pettersson <[email protected]> wrote:
> > Ingo Molnar writes:
> >  >
> >  > * Andrew Lutomirski <[email protected]> wrote:
> >  >
> >  > > On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> >  > > > 3. Add int 0xcc and use it from vgettimeofday.  It will SIGSEGV if
> >  > > > called from a user address (so it has no risk of ever becoming ABI)
> >  > > > and it will do gettimeofday if called from the right address.  (I like
> > ...
> >  > > Make it a real syscall but with extra constraints.  It would have the
> >  > > same calling convention as the syscall instruction, but it would turn
> >  > > into SIGKILL if the calling address isn't in the VSYSCALL page
> >
> > This will make things difficult for user-space dynamic binary instrumentation
> > applications, since these normally execute generated code at different
> > addresses than the original code.
> >
> > Is there a safe fallback for this particular vsyscall?
>
> All of the vsyscalls have vDSO versions that work like any other code.

Easiest would be if we can simply map int $0xcc with rAX==FOO to syscall or
int 0x80 with rAX==BAR.

We currently don't even know about the vDSO, it's all just user-space code
to us.

> Alternatively, if the dynamic instrumentation code knew about
> vsyscalls, it could just not instrument addresses in the vsyscall
> page.

Not instrumenting code is not an option, unless we can prove that the
code in question has no relevant side-effects or unexpected control-flow.
(Where "side-effects" relate both to the integrity of the instrumentation
engine and the application-specific payload it's attaching to the code.)

> What existing applications would get broken?

My concern is ThreadSpotter, but any user-space dynamic binary instrumentation
engine that instruments down to the raw kernel interface (syscall/sysenter/int
instructions) would have a problem with syscalls that only work at specific
addresses.

Anyway, if I can map that vsyscall to a plain proper syscall, then I'm OK.

/Mikael

2011-05-29 16:35:25

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 12:01 PM, Mikael Pettersson <[email protected]> wrote:
> Andrew Lutomirski writes:
> ?> On Sun, May 29, 2011 at 10:39 AM, Mikael Pettersson <[email protected]> wrote:
> ?> > Ingo Molnar writes:
> ?> > ?>
> ?> > ?> * Andrew Lutomirski <[email protected]> wrote:
> ?> > ?>
> ?> > ?> > On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> ?> > ?> > > 3. Add int 0xcc and use it from vgettimeofday. ?It will SIGSEGV if
> ?> > ?> > > called from a user address (so it has no risk of ever becoming ABI)
> ?> > ?> > > and it will do gettimeofday if called from the right address. ?(I like
> ?> > ...
> ?> > ?> > Make it a real syscall but with extra constraints. ?It would have the
> ?> > ?> > same calling convention as the syscall instruction, but it would turn
> ?> > ?> > into SIGKILL if the calling address isn't in the VSYSCALL page
> ?> >
> ?> > This will make things difficult for user-space dynamic binary instrumentation
> ?> > applications, since these normally execute generated code at different
> ?> > addresses than the original code.
> ?> >
> ?> > Is there a safe fallback for this particular vsyscall?
> ?>
> ?> All of the vsyscalls have vDSO versions that work like any other code.
>
> Easiest would be if we can simply map int $0xcc with rAX==FOO to syscall or
> int 0x80 with rAX==BAR.

Yes and no.

With the code I just posted (and am fixing up now) that will work.
But if we want to replace the entire vsyscall page with three int 0xcc
and 4090 int3 instructions, then we can't look at eax because it won't
contain anything meaningful.

--Andy

>
> We currently don't even know about the vDSO, it's all just user-space code
> to us.
>
> ?> Alternatively, if the dynamic instrumentation code knew about
> ?> vsyscalls, it could just not instrument addresses in the vsyscall
> ?> page.
>
> Not instrumenting code is not an option, unless we can prove that the
> code in question has no relevant side-effects or unexpected control-flow.
> (Where "side-effects" relate both to the integrity of the instrumentation
> engine and the application-specific payload it's attaching to the code.)

Calls to 0xffffffffff600000, 0xffffffffff600400, and
0xffffffffff600800 are syscalls, as an (unfortunate) part of the ABI.

>
> ?> What existing applications would get broken?
>
> My concern is ThreadSpotter, but any user-space dynamic binary instrumentation
> engine that instruments down to the raw kernel interface (syscall/sysenter/int
> instructions) would have a problem with syscalls that only work at specific
> addresses.

I'll look.

>
> Anyway, if I can map that vsyscall to a plain proper syscall, then I'm OK.

All three vsyscalls can be replaced with real syscalls without side
effects. Would it be possible to teach the instrumentation code to
deal with that?

--Andy

2011-05-29 16:40:38

by Richard Weinberger

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 5:28 PM, Andrew Lutomirski <[email protected]> wrote:
> On Sun, May 29, 2011 at 11:10 AM, richard -rw- weinberger
> <[email protected]> wrote:
>> On Sun, May 29, 2011 at 4:57 PM, Andrew Lutomirski <[email protected]> wrote:
>>> On Sun, May 29, 2011 at 5:51 AM, richard -rw- weinberger
>>> <[email protected]> wrote:
>>>> Yesterday I had a closer look at 64bit UML.
>>>> Glibc is always using vsyscalls because 64bit UML does not support the vDSO.
>>>>
>>>> On 32bit UML simply scans the ELF auxiliary vector provided by the host to
>>>> get the address of the vDSO.
>>>> How can I get this address on a 64bit host?
>>>
>>> I believe it's exactly the same.  There's an auxv entry that points to the vDSO.
>>
>> I don't think so.
>> See:
>> http://www.win.tue.nl/~aeb/linux/lk/lk-4.html
>> Section "Address space randomization".
>> The demo program finds the vDSO only on x86.
>>
>> UML uses quite the same method to find it.
>> arch/um/os-Linux/elf_aux.c
>
> The attached program works for me.

Shouldn't it print the addresses of both AT_SYSINFO and AT_SYSINFO_EHDR?
On my x86_64 system (2.6.37) it seems to find only AT_SYSINFO_EHDR.

> I don't know what this is supposed to mean, though:
>
>                                 /* See if the page is under TASK_SIZE */
>                                 if (vsyscall_ehdr < (unsigned long) envp)
>                                         vsyscall_ehdr = 0;

See commits a502a359 and 14251809.

>
> First, envp != TASK_SIZE.
>
> Second, the vDSO can be wherever it wants.  On current kernels at
> least it is *always* mapped below TASK_SIZE (in the unsigned sense)
> because it's mapped into user address space.

Hmm, this would explain why UML cannot find it. :-\

--
Thanks,
//richard

2011-05-29 18:06:58

by Mikael Pettersson

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

Andrew Lutomirski writes:
> On Sun, May 29, 2011 at 12:01 PM, Mikael Pettersson <[email protected]> wrote:
> > Andrew Lutomirski writes:
> > ?> On Sun, May 29, 2011 at 10:39 AM, Mikael Pettersson <[email protected]> wrote:
> > ?> > Ingo Molnar writes:
> > ?> > ?>
> > ?> > ?> * Andrew Lutomirski <[email protected]> wrote:
> > ?> > ?>
> > ?> > ?> > On Fri, May 27, 2011 at 7:36 AM, Andrew Lutomirski <[email protected]> wrote:
> > ?> > ?> > > 3. Add int 0xcc and use it from vgettimeofday. ?It will SIGSEGV if
> > ?> > ?> > > called from a user address (so it has no risk of ever becoming ABI)
> > ?> > ?> > > and it will do gettimeofday if called from the right address. ?(I like
> > ?> > ...
> > ?> > ?> > Make it a real syscall but with extra constraints. ?It would have the
> > ?> > ?> > same calling convention as the syscall instruction, but it would turn
> > ?> > ?> > into SIGKILL if the calling address isn't in the VSYSCALL page
> > ?> >
> > ?> > This will make things difficult for user-space dynamic binary instrumentation
> > ?> > applications, since these normally execute generated code at different
> > ?> > addresses than the original code.
> > ?> >
> > ?> > Is there a safe fallback for this particular vsyscall?
> > ?>
> > ?> All of the vsyscalls have vDSO versions that work like any other code.
> >
> > Easiest would be if we can simply map int $0xcc with rAX==FOO to syscall or
> > int 0x80 with rAX==BAR.
>
> Yes and no.
>
> With the code I just posted (and am fixing up now) that will work.
> But if we want to replace the entire vsyscall page with three int 0xcc
> and 4090 int3 instructions, then we can't look at eax because it won't
> contain anything meaningful.

I can relatively easily also consider the original application rIP
when decoding and translating these instructions.

>
> --Andy
>
> >
> > We currently don't even know about the vDSO, it's all just user-space code
> > to us.
> >
> > ?> Alternatively, if the dynamic instrumentation code knew about
> > ?> vsyscalls, it could just not instrument addresses in the vsyscall
> > ?> page.
> >
> > Not instrumenting code is not an option, unless we can prove that the
> > code in question has no relevant side-effects or unexpected control-flow.
> > (Where "side-effects" relate both to the integrity of the instrumentation
> > engine and the application-specific payload it's attaching to the code.)
>
> Calls to 0xffffffffff600000, 0xffffffffff600400, and
> 0xffffffffff600800 are syscalls, as an (unfortunate) part of the ABI.
>
> >
> > ?> What existing applications would get broken?
> >
> > My concern is ThreadSpotter, but any user-space dynamic binary instrumentation
> > engine that instruments down to the raw kernel interface (syscall/sysenter/int
> > instructions) would have a problem with syscalls that only work at specific
> > addresses.
>
> I'll look.
>
> >
> > Anyway, if I can map that vsyscall to a plain proper syscall, then I'm OK.
>
> All three vsyscalls can be replaced with real syscalls without side
> effects. Would it be possible to teach the instrumentation code to
> deal with that?

Yes, I just need to know how to identify them and what their equivalents are.
E.g., an int3 at <known address> becomes syscall rAX=<some constant>.

Sounds like this change will be manageable after all. Thanks.

/Mikael

2011-05-29 18:44:55

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 2:06 PM, Mikael Pettersson <[email protected]> wrote:
> Andrew Lutomirski writes:
> ?> On Sun, May 29, 2011 at 12:01 PM, Mikael Pettersson <[email protected]> wrote:
> ?> > Andrew Lutomirski writes:
> ?> > ?>
> ?> > ?> All of the vsyscalls have vDSO versions that work like any other code.
> ?> >
> ?> > Easiest would be if we can simply map int $0xcc with rAX==FOO to syscall or
> ?> > int 0x80 with rAX==BAR.
> ?>
> ?> Yes and no.
> ?>
> ?> With the code I just posted (and am fixing up now) that will work.
> ?> But if we want to replace the entire vsyscall page with three int 0xcc
> ?> and 4090 int3 instructions, then we can't look at eax because it won't
> ?> contain anything meaningful.
>
> I can relatively easily also consider the original application rIP
> when decoding and translating these instructions.
>
> ?>
> ?> --Andy
> ?>
> ?> >
> ?> > We currently don't even know about the vDSO, it's all just user-space code
> ?> > to us.
> ?> >
> ?> > ?> Alternatively, if the dynamic instrumentation code knew about
> ?> > ?> vsyscalls, it could just not instrument addresses in the vsyscall
> ?> > ?> page.
> ?> >
> ?> > Not instrumenting code is not an option, unless we can prove that the
> ?> > code in question has no relevant side-effects or unexpected control-flow.
> ?> > (Where "side-effects" relate both to the integrity of the instrumentation
> ?> > engine and the application-specific payload it's attaching to the code.)
> ?>
> ?> Calls to 0xffffffffff600000, 0xffffffffff600400, and
> ?> 0xffffffffff600800 are syscalls, as an (unfortunate) part of the ABI.
> ?>
> ?> >
> ?> > ?> What existing applications would get broken?
> ?> >
> ?> > My concern is ThreadSpotter, but any user-space dynamic binary instrumentation
> ?> > engine that instruments down to the raw kernel interface (syscall/sysenter/int
> ?> > instructions) would have a problem with syscalls that only work at specific
> ?> > addresses.
> ?>
> ?> I'll look.
> ?>
> ?> >
> ?> > Anyway, if I can map that vsyscall to a plain proper syscall, then I'm OK.
> ?>
> ?> All three vsyscalls can be replaced with real syscalls without side
> ?> effects. ?Would it be possible to teach the instrumentation code to
> ?> deal with that?
>
> Yes, I just need to know how to identify them and what their equivalents are.
> E.g., an int3 at <known address> becomes syscall rAX=<some constant>.
>
> Sounds like this change will be manageable after all. ?Thanks.

I'm not entirely sure I like that -- that way if we ever change it
again we break your stuff again.

Here are two proposals.

1. Teach your code that call 0xffffffffff600000 means
gettimeofday(rdi, rsi). That's guaranteed to never change and will
keep working even if we start to emulate vsyscalls by marking the page
not present and trapping the instruction fetch fault.

2. Use a magic incantation like:

mov $0xce,%al
int $0xcc
ret

for gettimeofday. (The other two vsyscalls could use 0xcc and 0xf4,
for example.) If I did this, I would make the 0xcc handler fault if
called from kernel space with al and rip not matching and it would log
a warning (but not fault) if called from user memory.

The idea is that, as far as a binary instrumentation tool is
concerned, int $0xcc is just a two-byte instruction without any funny
control flow. Also, if I looked everything up correctly, that magic
sequence will either fault or turn into plain ret if called at the
wrong offset.

If we went this route, then no software should assume *anything* about
int 0xcc because it could be changed again in the future. I might
even want to randomize the magic constants on each boot to make
certain that no one distributes software that cares.

--Andy

>
> /Mikael
>

2011-05-30 03:26:26

by Andrew Lutomirski

[permalink] [raw]
Subject: Re: [GIT pull] x86 vdso updates

On Sun, May 29, 2011 at 2:44 PM, Andrew Lutomirski <[email protected]> wrote:
> On Sun, May 29, 2011 at 2:06 PM, Mikael Pettersson <[email protected]> wrote:
>> Andrew Lutomirski writes:
>> ?> On Sun, May 29, 2011 at 12:01 PM, Mikael Pettersson <[email protected]> wrote:
>> ?> > Andrew Lutomirski writes:
>> ?> > ?>
>> ?> > ?> All of the vsyscalls have vDSO versions that work like any other code.
>> ?> >
>> ?> > Easiest would be if we can simply map int $0xcc with rAX==FOO to syscall or
>> ?> > int 0x80 with rAX==BAR.
>> ?>
>> ?> Yes and no.
>> ?>
>> ?> With the code I just posted (and am fixing up now) that will work.
>> ?> But if we want to replace the entire vsyscall page with three int 0xcc
>> ?> and 4090 int3 instructions, then we can't look at eax because it won't
>> ?> contain anything meaningful.
>>
>> I can relatively easily also consider the original application rIP
>> when decoding and translating these instructions.
>>
>> ?>
>> ?> --Andy
>> ?>
>> ?> >
>> ?> > We currently don't even know about the vDSO, it's all just user-space code
>> ?> > to us.
>> ?> >
>> ?> > ?> Alternatively, if the dynamic instrumentation code knew about
>> ?> > ?> vsyscalls, it could just not instrument addresses in the vsyscall
>> ?> > ?> page.
>> ?> >
>> ?> > Not instrumenting code is not an option, unless we can prove that the
>> ?> > code in question has no relevant side-effects or unexpected control-flow.
>> ?> > (Where "side-effects" relate both to the integrity of the instrumentation
>> ?> > engine and the application-specific payload it's attaching to the code.)
>> ?>
>> ?> Calls to 0xffffffffff600000, 0xffffffffff600400, and
>> ?> 0xffffffffff600800 are syscalls, as an (unfortunate) part of the ABI.
>> ?>
>> ?> >
>> ?> > ?> What existing applications would get broken?
>> ?> >
>> ?> > My concern is ThreadSpotter, but any user-space dynamic binary instrumentation
>> ?> > engine that instruments down to the raw kernel interface (syscall/sysenter/int
>> ?> > instructions) would have a problem with syscalls that only work at specific
>> ?> > addresses.
>> ?>
>> ?> I'll look.
>> ?>
>> ?> >
>> ?> > Anyway, if I can map that vsyscall to a plain proper syscall, then I'm OK.
>> ?>
>> ?> All three vsyscalls can be replaced with real syscalls without side
>> ?> effects. ?Would it be possible to teach the instrumentation code to
>> ?> deal with that?
>>
>> Yes, I just need to know how to identify them and what their equivalents are.
>> E.g., an int3 at <known address> becomes syscall rAX=<some constant>.
>>
>> Sounds like this change will be manageable after all. ?Thanks.
>
> I'm not entirely sure I like that -- that way if we ever change it
> again we break your stuff again.
>
> Here are two proposals.
>
> 1. Teach your code that call 0xffffffffff600000 means
> gettimeofday(rdi, rsi). ?That's guaranteed to never change and will
> keep working even if we start to emulate vsyscalls by marking the page
> not present and trapping the instruction fetch fault.

valgrind might be doing this. It does not appear to trace directly
into vsyscalls, or at least it doesn't copy the int 0xcc instruction
out of them.

--Andy