2002-10-18 04:27:58

by john stultz

Subject: [RFC][PATCH] linux-2.5.34_vsyscall_A0

Linus, Andrea, all,

This is a port of Andrea's x86-64 vsyscall (userspace) gettimeofday to
i386. It's fairly untested, but it works here! I'm sure it probably has a
few bugs, but since a number of folks want this, I figured I'd go
ahead and post it and just take the abuse.

I realize that this is probably in the "too late" category, but please
give any feedback you can and I'll try my best to get this ready to go
before Sunday night.

A small test application will follow shortly.

All comments/flames/etc emphatically requested.

thanks
-john

diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Thu Oct 17 21:25:02 2002
+++ b/arch/i386/kernel/Makefile Thu Oct 17 21:25:02 2002
@@ -9,7 +9,7 @@
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \
- bootflag.o
+ bootflag.o vsyscall.o

obj-y += cpu/
obj-y += timers/
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c Thu Oct 17 21:25:02 2002
+++ b/arch/i386/kernel/time.c Thu Oct 17 21:25:02 2002
@@ -69,7 +69,10 @@
unsigned long cpu_khz; /* Detected as we calibrate the TSC */

extern rwlock_t xtime_lock;
-extern unsigned long wall_jiffies;
+struct timespec __xtime __section_xtime;
+unsigned long __wall_jiffies __section_wall_jiffies;
+struct timezone __sys_tz __section_sys_tz;
+volatile unsigned long __jiffies __section_jiffies;

spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;

@@ -110,6 +113,7 @@
void do_settimeofday(struct timeval *tv)
{
write_lock_irq(&xtime_lock);
+ vxtime_lock();
/*
* This is revolting. We need to set "xtime" correctly. However, the
* value in this location is the value at the most recent update of
@@ -126,6 +130,8 @@

xtime.tv_sec = tv->tv_sec;
xtime.tv_nsec = (tv->tv_usec * 1000);
+ vxtime_unlock();
+
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
@@ -277,11 +283,11 @@
* locally disabled. -arca
*/
write_lock(&xtime_lock);
-
+ vxtime_lock();
timer->mark_offset();

do_timer_interrupt(irq, NULL, regs);
-
+ vxtime_unlock();
write_unlock(&xtime_lock);

}
diff -Nru a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
--- a/arch/i386/kernel/timers/timer_tsc.c Thu Oct 17 21:25:02 2002
+++ b/arch/i386/kernel/timers/timer_tsc.c Thu Oct 17 21:25:02 2002
@@ -10,6 +10,7 @@
#include <linux/cpufreq.h>

#include <asm/timer.h>
+#include <asm/vsyscall.h>
#include <asm/io.h>

extern int x86_udelay_tsc;
@@ -17,16 +18,16 @@

static int use_tsc;
/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
+int __delay_at_last_interrupt __section_delay_at_last_interrupt;

-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
+unsigned long __last_tsc_low __section_last_tsc_low; /* lsb 32 bits of Time Stamp Counter */

/* Cached *multiplier* to convert TSC counts to microseconds.
* (see the equation below).
* Equal to 2^32 * (1 / (clocks per usec) ).
* Initialized in time_init.
*/
-unsigned long fast_gettimeoffset_quotient;
+unsigned long __fast_gettimeoffset_quotient __section_fast_gettimeoffset_quotient;

static unsigned long get_offset_tsc(void)
{
diff -Nru a/arch/i386/kernel/vsyscall.c b/arch/i386/kernel/vsyscall.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/arch/i386/kernel/vsyscall.c Thu Oct 17 21:25:02 2002
@@ -0,0 +1,203 @@
+/*
+ * linux/arch/x86_64/kernel/vsyscall.c
+ *
+ * Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
+ *
+ * Thanks to [email protected] for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ * at virtual address -10Mbyte+1024bytes etc... There are at max 8192
+ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ * jumping out of line if necessary.
+ *
+ * $Id: vsyscall.c,v 1.9 2002/03/21 13:42:58 ak Exp $
+ *
+ * Ported to i386 by John Stultz <[email protected]>
+ */
+
+/*
+ * TODO 2001-03-20:
+ *
+ * 1) make page fault handler detect faults on page1-page-last of the vsyscall
+ * virtual space, and make it increase %rip and write -ENOSYS in %rax (so
+ * we'll be able to upgrade to a new glibc without upgrading kernel after
+ * we add more vsyscalls.
+ * 2) Possibly we need a fixmap table for the vsyscalls too if we want
+ * to avoid SIGSEGV and we want to return -EFAULT from the vsyscalls as well.
+ * Can we segfault inside a "syscall"? We can fix this anytime and those fixes
+ * won't be visible for userspace. Not fixing this is a noop for correct programs,
+ * broken programs will segfault and there's no security risk until we choose to
+ * fix it.
+ *
+ * These are not urgent things that we need to address only before shipping the first
+ * production binary kernels.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/msr.h>
+#include <asm/system.h>
+
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+
+//#define NO_VSYSCALL 1
+
+#ifdef NO_VSYSCALL
+#include <asm/unistd.h>
+
+static int errno __section_vxtime_sequence;
+
+static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz)
+
+#else
+long __vxtime_sequence[2] __section_vxtime_sequence;
+
+static inline void do_vgettimeofday(struct timeval * tv)
+{
+ long sequence;
+ unsigned long usec, sec;
+
+ do {
+ unsigned long eax, edx;
+
+ sequence = __vxtime_sequence[1];
+ rmb();
+
+ /* Read the Time Stamp Counter */
+ rdtsc(eax,edx);
+
+ /* .. relative to previous jiffy (32 bits is enough) */
+ eax -= __last_tsc_low; /* tsc_low delta */
+
+ /*
+ * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
+ * = (tsc_low delta) * (usecs_per_clock)
+ * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
+ *
+ * Using a mull instead of a divl saves up to 31 clock cycles
+ * in the critical path.
+ */
+
+
+ __asm__("mull %2"
+ :"=a" (eax), "=d" (edx)
+ :"rm" (__fast_gettimeoffset_quotient),
+ "0" (eax));
+
+ /* our adjusted time offset in microseconds */
+ usec = __delay_at_last_interrupt + edx;
+
+ {
+ unsigned long lost = __jiffies - __wall_jiffies;
+ if (lost)
+ usec += lost * (1000000 / HZ);
+ }
+ sec = __xtime.tv_sec;
+ usec += (__xtime.tv_nsec / 1000);
+
+ rmb();
+ } while (sequence != __vxtime_sequence[0]);
+
+ tv->tv_sec = sec + usec / 1000000;
+ tv->tv_usec = usec % 1000000;
+}
+
+static inline void do_get_tz(struct timezone * tz)
+{
+ long sequence;
+
+ do {
+ sequence = __vxtime_sequence[1];
+ rmb();
+
+ *tz = __sys_tz;
+
+ rmb();
+ } while (sequence != __vxtime_sequence[0]);
+}
+#endif
+
+static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+#ifdef NO_VSYSCALL
+ return gettimeofday(tv,tz);
+#else
+ if (tv)
+ do_vgettimeofday(tv);
+ if (tz)
+ do_get_tz(tz);
+ return 0;
+#endif
+}
+
+static time_t __vsyscall(1) vtime(time_t * t)
+{
+ struct timeval tv;
+ vgettimeofday(&tv,NULL);
+ if (t)
+ *t = tv.tv_sec;
+ return tv.tv_sec;
+}
+
+static long __vsyscall(2) venosys_0(void)
+{
+ return -ENOSYS;
+}
+
+static long __vsyscall(3) venosys_1(void)
+{
+ return -ENOSYS;
+}
+static void __init map_vsyscall(void)
+{
+ extern char __vsyscall_0;
+ unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - __START_KERNEL_map;
+ pgd_t* pd;
+ pmd_t* pm;
+
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+
+ /*
+ * Set vsyscall's fixmap pmd to be user readable:
+ * XXX HACK ALERT: this assumes non-vsyscall kernel space
+ * pte's will not have their userbit set. Otherwise this could
+ * be a security problem. Is this ok? Please advise [email protected]
+ */
+ pd = pgd_offset_k((unsigned long)&__last_tsc_low);
+ pm = pmd_offset(pd,(unsigned long)&__last_tsc_low);
+ pm->pmd |= _PAGE_USER;
+
+}
+
+static int __init vsyscall_init(void)
+{
+ printk("VSYSCALL: consistency checks...");
+ if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday))
+ panic("vgettimeofday link addr broken");
+ if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime))
+ panic("vtime link addr broken");
+ if (VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))
+ panic("fixmap first vsyscall %lx should be %lx", __fix_to_virt(VSYSCALL_FIRST_PAGE),
+ VSYSCALL_ADDR(0));
+ printk("passed...mapping...");
+ map_vsyscall();
+ printk("done.\n");
+
+ printk("VSYSCALL: fixmap virt addr: 0x%lx\n",
+ __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
+ return 0;
+}
+
+__initcall(vsyscall_init);
diff -Nru a/arch/i386/vmlinux.lds.S b/arch/i386/vmlinux.lds.S
--- a/arch/i386/vmlinux.lds.S Thu Oct 17 21:25:02 2002
+++ b/arch/i386/vmlinux.lds.S Thu Oct 17 21:25:02 2002
@@ -4,7 +4,7 @@
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(_start)
-jiffies = jiffies_64;
+jiffies_64 = jiffies;
SECTIONS
{
. = 0xC0000000 + 0x100000;
@@ -90,6 +90,33 @@
__bss_stop = .;

_end = . ;
+
+ . = ALIGN(64);
+ .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+
+ .vsyscall_0 0xffffe000: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
+ __vsyscall_0 = LOADADDR(.vsyscall_0);
+ . = ALIGN(64);
+ .vxtime_sequence : AT ((LOADADDR(.vsyscall_0) + SIZEOF(.vsyscall_0) + 63) & ~(63)) { *(.vxtime_sequence) }
+ vxtime_sequence = LOADADDR(.vxtime_sequence);
+ .last_tsc_low : AT (LOADADDR(.vxtime_sequence) + SIZEOF(.vxtime_sequence)) { *(.last_tsc_low) }
+ last_tsc_low = LOADADDR(.last_tsc_low);
+ .delay_at_last_interrupt : AT (LOADADDR(.last_tsc_low) + SIZEOF(.last_tsc_low)) { *(.delay_at_last_interrupt) }
+ delay_at_last_interrupt = LOADADDR(.delay_at_last_interrupt);
+ .fast_gettimeoffset_quotient : AT (LOADADDR(.delay_at_last_interrupt) + SIZEOF(.delay_at_last_interrupt)) { *(.fast_gettimeoffset_quotient) }
+ fast_gettimeoffset_quotient = LOADADDR(.fast_gettimeoffset_quotient);
+ .wall_jiffies : AT (LOADADDR(.fast_gettimeoffset_quotient) + SIZEOF(.fast_gettimeoffset_quotient)) { *(.wall_jiffies) }
+ wall_jiffies = LOADADDR(.wall_jiffies);
+ .sys_tz : AT (LOADADDR(.wall_jiffies) + SIZEOF(.wall_jiffies)) { *(.sys_tz) }
+ sys_tz = LOADADDR(.sys_tz);
+ . = ALIGN(16);
+ .jiffies : AT ((LOADADDR(.sys_tz) + SIZEOF(.sys_tz) + 15) & ~(15)) { *(.jiffies) }
+ jiffies = LOADADDR(.jiffies);
+ . = ALIGN(16);
+ .xtime : AT ((LOADADDR(.jiffies) + SIZEOF(.jiffies) + 15) & ~(15)) { *(.xtime) }
+ xtime = LOADADDR(.xtime);
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
+ . = LOADADDR(.vsyscall_0) + 4096;

/* Sections to be discarded */
/DISCARD/ : {
diff -Nru a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
--- a/include/asm-i386/fixmap.h Thu Oct 17 21:25:02 2002
+++ b/include/asm-i386/fixmap.h Thu Oct 17 21:25:02 2002
@@ -18,6 +18,7 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/vsyscall.h>
#ifdef CONFIG_HIGHMEM
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -49,6 +50,8 @@
* fix-mapped?
*/
enum fixed_addresses {
+ VSYSCALL_LAST_PAGE,
+ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
--- a/include/asm-i386/pgtable.h Thu Oct 17 21:25:02 2002
+++ b/include/asm-i386/pgtable.h Thu Oct 17 21:25:02 2002
@@ -139,11 +139,14 @@
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-
+#define __PAGE_KERNEL_VSYSCALL \
+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+
#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)

/*
* The i386 can't do page protection for execute, and considers that
diff -Nru a/include/asm-i386/vsyscall.h b/include/asm-i386/vsyscall.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-i386/vsyscall.h Thu Oct 17 21:25:02 2002
@@ -0,0 +1,49 @@
+#ifndef _ASM_i386_VSYSCALL_H_
+#define _ASM_i386_VSYSCALL_H_
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+};
+
+#define VSYSCALL_START 0xffffe000
+#define VSYSCALL_SIZE 1024
+#define VSYSCALL_END (0xffffe000 + PAGE_SIZE)
+#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
+
+#ifdef __KERNEL__
+#define __START_KERNEL_map 0xC0000000
+
+#define __section_last_tsc_low __attribute__ ((unused, __section__ (".last_tsc_low")))
+#define __section_delay_at_last_interrupt __attribute__ ((unused, __section__ (".delay_at_last_interrupt")))
+#define __section_fast_gettimeoffset_quotient __attribute__ ((unused, __section__ (".fast_gettimeoffset_quotient")))
+#define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies")))
+#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies")))
+#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz")))
+#define __section_xtime __attribute__ ((unused, __section__ (".xtime")))
+#define __section_vxtime_sequence __attribute__ ((unused, __section__ (".vxtime_sequence")))
+
+/* vsyscall space (readonly) */
+extern long __vxtime_sequence[2];
+extern int __delay_at_last_interrupt;
+extern unsigned long __last_tsc_low;
+extern unsigned long __fast_gettimeoffset_quotient;
+extern struct timespec __xtime;
+extern volatile unsigned long __jiffies;
+extern unsigned long __wall_jiffies;
+extern struct timezone __sys_tz;
+
+/* kernel space (writeable) */
+extern unsigned long last_tsc_low;
+extern int delay_at_last_interrupt;
+extern unsigned long fast_gettimeoffset_quotient;
+extern unsigned long wall_jiffies;
+extern struct timezone sys_tz;
+extern long vxtime_sequence[2];
+
+#define vxtime_lock() do { vxtime_sequence[0]++; wmb(); } while(0)
+#define vxtime_unlock() do { wmb(); vxtime_sequence[1]++; } while (0)
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_i386_VSYSCALL_H_ */


2002-10-18 04:28:51

by john stultz

Subject: [RFC] linux-2.5.34_vsyscall_A0 - Test App

All,
Attached is an application to test the vsyscall_A0 gettimeofday.

thanks
-john



Attachments:
vsyscall-test.c (295.00 B)
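
(The original attachment isn't reproduced in this archive. Below is a rough
sketch of what such a test might look like, assuming the fixed mapping from
the patch above -- vgettimeofday at 0xffffe000. It is not the attached file.)

/* Hedged sketch of a vsyscall gettimeofday test, not the original
 * vsyscall-test.c.  Assumes vgettimeofday is mapped at VSYSCALL_START
 * (0xffffe000) as in the patch above. */
#include <stdio.h>
#include <sys/time.h>

typedef int (*vgtod_t)(struct timeval *tv, struct timezone *tz);

int main(void)
{
	vgtod_t vgettimeofday = (vgtod_t)0xffffe000UL;
	struct timeval sys_tv, vsys_tv;

	gettimeofday(&sys_tv, NULL);	/* normal syscall path */
	vgettimeofday(&vsys_tv, NULL);	/* userspace vsyscall path */

	printf("syscall : %ld.%06ld\n", (long)sys_tv.tv_sec, (long)sys_tv.tv_usec);
	printf("vsyscall: %ld.%06ld\n", (long)vsys_tv.tv_sec, (long)vsys_tv.tv_usec);
	return 0;
}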

2002-10-18 11:08:41

by Andrea Arcangeli

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

Hello,

On Thu, Oct 17, 2002 at 09:25:31PM -0700, john stultz wrote:
> Linus, Andrea, all,
>
> This is a port of Andrea's x86-64 vsyscall(userspace) gettimeofday to
> i386. Its fairly untested, but it works here! I'm sure it probably has a
> few bugs, but since a number of folks are wanting this, I figured I'd go
> ahead and post and just take the abuse.
>
> I realize that this is probably in the "too late" category, but please
> give any feedback you can and I'll try my best to get this ready to go
> before sunday night.

The main reason it wasn't backported to i386 is that if glibc starts
using vgettimeofday instead of sys_gettimeofday, you won't be able
to downgrade the kernel anymore to, say, 2.4 (oh yeah, I would then backport
it to my tree, or Marcelo could apply the patches to 2.4 too, but then 2.2
would be left uncovered and the new glibc would segfault on the old kernels).
Probably the only way to avoid breaking backwards compatibility is for
glibc to check the uname at the first invocation and store the result in
a global variable in the library. Then every subsequent invocation only
costs a branch. But it would still be a slowdown for this sequence:
fork()exec()gettimeofday()exit()fork()exec()gettimeofday()exit()...
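
(A rough sketch of the one-time uname check described above; the helper
names and the version test are illustrative, this is not glibc code.)

/* Illustrative sketch: probe the running kernel once, cache the answer,
 * and pay only a branch on every later call. */
#include <string.h>
#include <sys/utsname.h>

static int kernel_has_vsyscall = -1;	/* -1 = not probed yet */

static int check_vsyscall_kernel(void)
{
	struct utsname u;

	if (kernel_has_vsyscall < 0) {		/* first invocation only */
		uname(&u);
		/* assumption: treat 2.5+ kernels as vsyscall-capable */
		kernel_has_vsyscall = (strncmp(u.release, "2.5.", 4) == 0);
	}
	return kernel_has_vsyscall;		/* later calls: one branch */
}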

Secondly, if we take this long-term 32-bit route we should first fix the
x86 calling conventions too, at the very least internally to the kernel
(that would even be completely backwards compatible, but fixing them in
userspace would matter much more than fixing them for the kernel). Fixing
the ABI should give you much more performance than vsyscalls (it depends
on the workload of course, but vsyscalls optimize only a few critical
programs like databases, while the other change would optimize them all,
and its global effect could be even more visible for databases too, we
don't know). You know the registers %eax, %ecx and %edx are clobbered by
the callee but they're not used for passing the first three parameters to
functions. That's a bug in the x86 calling conventions; I've no idea how
such a bug could ever see the light of day, maybe they expected a push +
pop to RAM to run faster than a single mov, I can't know, but it
definitely needs fixing. Such things will also be fixed automatically by
migrating to a 64-bit kernel and userspace. This is why I didn't attempt
to drop FASTCALL and compile x86 with -mregparm=3, just as I didn't
backport the vsyscalls to 32-bit.
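
(For reference, the register-passing convention being described is what GCC
exposes as regparm; a small illustration, not part of the patch:)

/* With regparm(3), GCC passes the first three integer arguments in
 * %eax, %edx and %ecx instead of on the stack -- the same mechanism
 * behind the kernel's FASTCALL annotation and -mregparm=3. */
#define fastcall __attribute__((regparm(3)))

int fastcall add3(int a, int b, int c)
{
	/* a arrives in %eax, b in %edx, c in %ecx; no stack traffic */
	return a + b + c;
}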

However this is just a reminder message; I mainly wanted to point out
why I didn't spend time on this effort myself. If Linus is excited to
include vsyscalls on 32-bit too, that's fine with me; it would be a
definite improvement, at least outside the asymmetric-multithreading
and NUMA cases where the TSC loses synchronization (unless something
mmapped like the cyclone counter or HPET is available, of course). So it's up to you ;).

Andrea

2002-10-18 16:07:45

by Stephen Hemminger

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

One reason gettimeofday ends up being important is that several
databases call it a lot. They use it to build up a transaction id. Under
big transaction loads, even the fast Linux syscall path ends up being a
bottleneck. Also, on NUMA machines the data used for time of day (xtime)
ends up being a significant portion of the cache traffic.

It would be great to rework the whole TSC time-of-day stuff to work with
per-CPU data and allow unsynchronized TSCs as on NUMA. The problem is
that for fast user-level access, there would need to be some way to find
out the current CPU and avoid preemption/migration for a short period.
It seems like the LDT stuff for per-thread data could provide the
current CPU (and maybe the current pid) somehow. And it would be possible
to avoid preemption while in a vsyscall text page; some other Unix
variants do this to implement portions of the thread library in
kernel-provided user text pages.

2002-10-18 16:40:25

by George Anzinger

[permalink] [raw]
Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

Stephen Hemminger wrote:
>
> One reason gettimeofday ends up being important is that several
> databases call it a lot. They use it to build up a transaction id. Under
> big transaction loads, even the fast linux syscall path ends up being a
> bottleneck. Also, on NUMA machines the data used for time of day (xtime)
> ends up being a significant portion of the cache traffic.
>
> It would be great to rework the whole TSC time of day stuff to work with
> per cpu data and allow unsychronized TSC's like NUMA. The problem is
> that for fast user level access, there would need to be some way to find
> out the current CPU and avoid preemption/migration for a short period.
> It seems like the LDT stuff for per-thread data could provide the
> current cpu (and maybe current pid) somehow. And it would be possible
> to avoid preemption while in a vsyscall text page, some other Unix
> variants do this to implement portions of the thread library in kernel
> provided user text pages.
>
Now there is an idea! Lock preemption in user space if and
only if the user is executing in a text page shared with the
kernel. I have seen the need for such locking, but have
always thought it was too dangerous. This convention
would introduce a much higher level of security. What is
left is to devise a way to let the kernel know that the
preemption-targeted task has left such a page so that the
preemption may proceed. Possibly the kernel could plant a
hint on the page that each function would check for on exit.
--
George Anzinger [email protected]
High-res-timers:
http://sourceforge.net/projects/high-res-timers/
Preemption patch:
http://www.kernel.org/pub/linux/kernel/people/rml

2002-10-18 16:34:05

by john stultz

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, 2002-10-18 at 04:14, Andrea Arcangeli wrote:
> the main reason it wasn't backported to i386 is that if glibc start
> using the vgettimeofday instead of sys_gettimeofday, you won't be able
> to downgrade kernel anymore to say 2.4 (oh yeah, I would then backport
> it to my tree or Marcelo could apply the patches too to 2.4 but then 2.2
> would be left uncovered, new glibc would segfault on the old kernels).
> Probably the only way to avoid breaking backwards compatibility is that
> glibc will check the uname at the first invocation and then it will
> store the information in a global variable in the library. So then for
> every second invocation it will be only an overhead of a branch. But it
> would be a slowdown for this sequence
> fork()exec()gettimeofday()exit()fork()exec()gettimeofday()exit()...

Hmmm. Yes, that is a good point. Especially since, due to TSC sync
issues, not everyone will want to convert to this right off.
Probing might do it, but that would require some fancy trapping in
glibc. Maybe users could manually LD_PRELOAD a library that aliases
gettimeofday, rather than changing glibc? That way the users of, as you
said, specialized DB apps etc. who *really* want this can get it, without
affecting others.

[snip]
> However this is just a reminder message, I mainly wanted to point out
> why I didn't spent time in this effort myself, if Linus is excited to
> include vsyscalls on 32bit too that's fine with me, it would be a
> definitive improvement at least for the non asymmetric multithreading
> nor NUMA cases where the TSC loses synchronization (unless something
> mmapped like cyclone or HPET is available of course). So it's up to you ;).

It seems you've been calling the psychic hotline recently! Actually that's
*exactly* what my plans were for the NUMA case. Very good call!

You bring up some good points. I'll try to think about it some more and
see if there isn't a better way.

thanks
-john

2002-10-18 17:28:20

by Andrea Arcangeli

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, Oct 18, 2002 at 10:19:58AM -0700, Linus Torvalds wrote:
>
> On Fri, 18 Oct 2002, Andrea Arcangeli wrote:
>
> > On Fri, Oct 18, 2002 at 09:45:41AM -0700, george anzinger wrote:
> > > Stephen Hemminger wrote:
> > > > current cpu (and maybe current pid) somehow. And it would be possible
> > > > to avoid preemption while in a vsyscall text page, some other Unix
> > > > variants do this to implement portions of the thread library in kernel
> > > > provided user text pages.
> > > >
> > > Now there is an idea! Lock preemption in user space if and
> >
> > sounds not good to me, you would miss a wakeup and you would delay the
> > schedule of 1/HZ in the worst (close to the common) case.
>
> That's not the real problem.
>
> The real problem is that somebody can jump into the middle of a function
> (or even into the middle of an instruction), causing the function to do
> something totally different from the intended effect.
>
> In particular, it can cause the function to loop forever.
>
> If you disable preemption of user space, you now killed the machine.
>
> In short - others may do it, but it's a total _DISASTER_ from a security
> and stability standpoint. Don't go there.

Agreed. Here's my idea:

Actually my idea on 64-bit was to use the high 8 bits of each 64-bit word
to carry the cpuid, so you can pull out coherent data, including the
sequence numbers that are read and written in opposite order with mb()
like now (the sequence numbers as well will become per-cpu). So it is
definitely doable without any problem and in a very performant way, just
not as easy as without the per-cpu info. Even if per-cpu segmentation
tricks were possible or available (remember long mode is pure paging, no
segmentation) it would not be worthwhile IMHO; the cpuid encoded
atomically in each 64-bit datum provided by the vsyscall seems a much
simpler and possibly more performant solution. You set up a different
per-cpu data mapping with different pte settings on each cpu. The
vsyscall bytecode remains the same, aware of the cpuid encoded in
each 64-bit word. Doing it in 32-bit is ugly (or at least much slower)
since most data is natively at least 32 bits; it would need some slow
demultiplexing.
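
(A tiny sketch of the encoding described above -- purely illustrative, not
from any posted patch.)

/* Illustrative only: pack a CPU id into the top 8 bits of a 64-bit
 * word so a single atomic 64-bit read yields both the value and which
 * CPU's copy it came from. */
#include <stdint.h>

#define CPUID_SHIFT	56
#define VALUE_MASK	((1ULL << CPUID_SHIFT) - 1)

static inline uint64_t encode_word(uint64_t value, unsigned int cpu)
{
	return (value & VALUE_MASK) | ((uint64_t)cpu << CPUID_SHIFT);
}

static inline unsigned int word_cpu(uint64_t word)
{
	return word >> CPUID_SHIFT;	/* top 8 bits: the cpuid */
}

static inline uint64_t word_value(uint64_t word)
{
	return word & VALUE_MASK;	/* low 56 bits: the datum */
}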

Andrea

2002-10-18 17:39:43

by Andrea Arcangeli

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, Oct 18, 2002 at 09:13:39AM -0700, Stephen Hemminger wrote:
> One reason gettimeofday ends up being important is that several
> databases call it a lot. They use it to build up a transaction id. Under
> big transaction loads, even the fast linux syscall path ends up being a
> bottleneck. Also, on NUMA machines the data used for time of day (xtime)
> ends up being a significant portion of the cache traffic.

Yep. However the main bottleneck is going into and out of the kernel;
xtime is one read-only L1 cacheline that can be trivially shared under
high load. I would be surprised if that were the bottleneck; today you
should see a huge bottleneck in the xtime_lock before you can remotely
see a bottleneck in the xtime data itself. (I'm speaking of HZ=100 at
least; HZ=1000 would hurt more here.)

> It would be great to rework the whole TSC time of day stuff to work with
> per cpu data and allow unsychronized TSC's like NUMA. The problem is
> that for fast user level access, there would need to be some way to find
> out the current CPU and avoid preemption/migration for a short period.
> It seems like the LDT stuff for per-thread data could provide the
> current cpu (and maybe current pid) somehow. And it would be possible
> to avoid preemption while in a vsyscall text page, some other Unix
> variants do this to implement portions of the thread library in kernel
> provided user text pages.

Actually my idea on 64-bit was to use the high 8 bits of each 64-bit word
to carry the cpuid, so you can pull out coherent data, including the
sequence numbers that are read and written in opposite order with mb()
like now (the sequence numbers as well will become per-cpu). So it is
definitely doable without any problem and in a very performant way, just
not as easy as without the per-cpu info. Even if per-cpu segmentation
tricks were possible or available (remember long mode is pure paging, no
segmentation) it would not be worthwhile IMHO; the cpuid encoded
atomically in each 64-bit datum provided by the vsyscall seems a much
simpler and possibly more performant solution. You set up a different
per-cpu data mapping with different pte settings on each cpu. The
vsyscall bytecode remains the same, aware of the cpuid encoded in
each 64-bit word. Doing it in 32-bit is ugly (or at least much slower)
since most data is natively at least 32 bits; it would need some slow
demultiplexing.

Andrea

2002-10-18 17:32:00

by Andrea Arcangeli

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, Oct 18, 2002 at 09:45:41AM -0700, george anzinger wrote:
> Stephen Hemminger wrote:
> >
> > One reason gettimeofday ends up being important is that several
> > databases call it a lot. They use it to build up a transaction id. Under
> > big transaction loads, even the fast linux syscall path ends up being a
> > bottleneck. Also, on NUMA machines the data used for time of day (xtime)
> > ends up being a significant portion of the cache traffic.
> >
> > It would be great to rework the whole TSC time of day stuff to work with
> > per cpu data and allow unsychronized TSC's like NUMA. The problem is
> > that for fast user level access, there would need to be some way to find
> > out the current CPU and avoid preemption/migration for a short period.
> > It seems like the LDT stuff for per-thread data could provide the
> > current cpu (and maybe current pid) somehow. And it would be possible
> > to avoid preemption while in a vsyscall text page, some other Unix
> > variants do this to implement portions of the thread library in kernel
> > provided user text pages.
> >
> Now there is an idea! Lock preemption in user space if and

That doesn't sound good to me: you would miss a wakeup and you would delay
the schedule by 1/HZ in the worst (close to the common) case.

My idea of encoding the cpuid in the top 8 bits of each 64-bit word that
can be read atomically (of course this trivially applies to the now
per-cpu pair of sequence numbers that define the critical section for each
cpu) sounds like a far superior solution to me for reading coherent
per-cpu data entirely locklessly and without preemption locks.

Andrea

2002-10-18 17:31:58

by Linus Torvalds

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0


On Fri, 18 Oct 2002, Andrea Arcangeli wrote:

> On Fri, Oct 18, 2002 at 09:45:41AM -0700, george anzinger wrote:
> > Stephen Hemminger wrote:
> > > current cpu (and maybe current pid) somehow. And it would be possible
> > > to avoid preemption while in a vsyscall text page, some other Unix
> > > variants do this to implement portions of the thread library in kernel
> > > provided user text pages.
> > >
> > Now there is an idea! Lock preemption in user space if and
>
> sounds not good to me, you would miss a wakeup and you would delay the
> schedule of 1/HZ in the worst (close to the common) case.

That's not the real problem.

The real problem is that somebody can jump into the middle of a function
(or even into the middle of an instruction), causing the function to do
something totally different from the intended effect.

In particular, it can cause the function to loop forever.

If you disable preemption of user space, you now killed the machine.

In short - others may do it, but it's a total _DISASTER_ from a security
and stability standpoint. Don't go there.

Linus

2002-10-18 18:15:17

by Manfred Spraul

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, 2002-10-18 at 04:14, Andrea Arcangeli wrote:
> the main reason it wasn't backported to i386 is that if glibc start
> using the vgettimeofday instead of sys_gettimeofday, you won't be able
> to downgrade kernel anymore to say 2.4 (oh yeah, I would then backport
> it to my tree or Marcelo could apply the patches too to 2.4 but then 2.2
> would be left uncovered, new glibc would segfault on the old kernels).

Does that problem actually exist?

http://marc.theaimsgroup.com/?l=linux-kernel&m=103253890431473&w=2

Jakub Jelinek <[email protected]> wrote on 2002-09-20 16:15:25

> glibc supports .note.ABI-tag notes for libraries, so there is no problem
> with having NPTL libpthread.so.0 --enable-kernel=2.5.36 in say
> /lib/i686/libpthread.so.0 and linuxthreads --enable-kernel=2.2.1 in
> /lib/libpthread.so.0. The dynamic linker will then choose based
> on currently running kernel.
> (well, ATM because of libc tsd DL_ERROR --without-tls ld.so cannot be used
> with --with-tls libs and vice versa, but that is being worked on).
>

It should be possible to have one library that supports both syscall
interfaces for gettimeofday().

--
Manfred

2002-10-18 18:31:30

by Stephen Hemminger

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0


> agreed. Hear my idea:
>
> actually my idea on 64bit was to use the high 8 bit of each 64bit word to
> give you the cpuid, to get out the coherent data, including the sequence
> number that are read and written inversely with mb() like now (the
> sequence number as well will become per-cpu), so it is definitely doable
> without any single problem and in a very performant way, just not as
> easy as without the per-cpu info. Even if segmentation per-cpu tricks
> would be possible or available (remeber long mode is pure paging, no
> segmentation) it would be not worthwhile IMHO, the cpuid encoded
> atomically in each 64bit data provided by the vsyscall seems a much
> simpler and possibly more performant solution. You set a different
> per-cpu data-mapping with different pte settings in each cpu. The
> vsyscall bytecode remains the same, aware about this cpuid encoded in
> each 64bit word. Doing it in 32bit is ugly (or at least much slower)
> since most data is natively at least 32bit, it would need some slow
> demultiplexing.

At least on IA32 you could still use XCHG64 to atomically access the
values, but that always forces a write so it isn't cache friendly. Still,
it probably is better than encoding the data in 32 bits. It all depends
on how much data is needed.
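
(Stephen's "XCHG64" presumably refers to the locked 64-bit compare-exchange,
cmpxchg8b, which is the usual way to read 64 bits atomically on IA32; a
hedged sketch of that idiom, illustrative only:)

/* Classic cmpxchg8b read idiom: whatever *p holds ends up in EDX:EAX,
 * but the locked RMW always generates a write cycle -- exactly the
 * cache-unfriendliness (and read-only mapping problem) discussed here. */
#include <stdint.h>

static inline uint64_t atomic_read64(volatile uint64_t *p)
{
	uint64_t old = 0;

	__asm__ __volatile__("lock; cmpxchg8b %1"
			     : "+A" (old), "+m" (*p)
			     : "b" (0), "c" (0));
	return old;
}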


2002-10-18 18:54:55

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, Oct 18, 2002 at 11:37:19AM -0700, Stephen Hemminger wrote:
>
> > agreed. Hear my idea:
> >
> > actually my idea on 64bit was to use the high 8 bit of each 64bit word to
> > give you the cpuid, to get out the coherent data, including the sequence
> > number that are read and written inversely with mb() like now (the
> > sequence number as well will become per-cpu), so it is definitely doable
> > without any single problem and in a very performant way, just not as
> > easy as without the per-cpu info. Even if segmentation per-cpu tricks
> > would be possible or available (remeber long mode is pure paging, no
> > segmentation) it would be not worthwhile IMHO, the cpuid encoded
> > atomically in each 64bit data provided by the vsyscall seems a much
> > simpler and possibly more performant solution. You set a different
> > per-cpu data-mapping with different pte settings in each cpu. The
> > vsyscall bytecode remains the same, aware about this cpuid encoded in
> > each 64bit word. Doing it in 32bit is ugly (or at least much slower)
> > since most data is natively at least 32bit, it would need some slow
> > demultiplexing.
>
> At least on IA32 you could still use XCHG64 to atomically access the
> values, but that always forces a write so it isn't cache friendly. Still

Yep, it would hurt scalability if it's possible at all, and I doubt
cmpxchg8b could work on a read-only piece of memory; the pte is marked
write-protected, so it should generate a SIGSEGV.

> it probably is better than encoding the data in 32bit. It all depends

yes.

> on how much data is needed.
>


Andrea

2002-10-18 18:58:45

by john stultz

Subject: [RFC] vsyscall_A0 LD_PRELOAD implementation

Here's an example use of the vsyscall via LD_PRELOAD.

Attached is an example library that can be loaded via LD_PRELOAD to alias
glibc's gettimeofday function w/ the vsyscall implementation. I've also
included a quick performance test to give a rough idea of the savings this gives.

Example run on a SMP P4 box:

[jstultz@elm3b52 vsyscall_test]$ ./run.sh
Normal gettimeofday
gettimeofday ( 1391621us / 1000000runs ) = 1.391620us
vsyscall LD_PRELOAD gettimeofday
gettimeofday ( 286567us / 1000000runs ) = 0.286567us

So it looks like a pretty big win.

thanks
-john


Attachments:
vsyscall_test.tar.gz (787.00 B)
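
(The tarball isn't reproduced in this archive. A minimal sketch of the kind
of preload shim involved -- the address and build line are assumptions, and
this is not the attached code:)

/* Hedged sketch of an LD_PRELOAD shim that overrides gettimeofday()
 * with the vsyscall entry point.  Assumes vgettimeofday is mapped at
 * 0xffffe000 as in the patch above.
 *
 * Build: gcc -O2 -shared -fPIC -o libvgtod.so vgtod.c
 * Use:   LD_PRELOAD=./libvgtod.so ./your_program
 */
#include <sys/time.h>

#define VGETTIMEOFDAY \
	((int (*)(struct timeval *, struct timezone *)) 0xffffe000UL)

int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	/* Dynamically linked callers resolve gettimeofday() to this
	 * definition and never enter the kernel. */
	return VGETTIMEOFDAY(tv, tz);
}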

2002-10-18 19:24:51

by George Anzinger

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

Linus Torvalds wrote:
>
> On Fri, 18 Oct 2002, Andrea Arcangeli wrote:
>
> > On Fri, Oct 18, 2002 at 09:45:41AM -0700, george anzinger wrote:
> > > Stephen Hemminger wrote:
> > > > current cpu (and maybe current pid) somehow. And it would be possible
> > > > to avoid preemption while in a vsyscall text page, some other Unix
> > > > variants do this to implement portions of the thread library in kernel
> > > > provided user text pages.
> > > >
> > > Now there is an idea! Lock preemption in user space if and
> >
> > sounds not good to me, you would miss a wakeup and you would delay the
> > schedule of 1/HZ in the worst (close to the common) case.
>
> That's not the real problem.
>
> The real problem is that somebody can jump into the middle of a function
> (or even into the middle of an instruction), causing the function to do
> something totally different from the intended effect.
>
> In particular, it can cause the function to loop forever.
>
> If you disable preemption of user space, you now killed the machine.
>
> In short - others may do it, but it's a total _DISASTER_ from a security
> and stability standpoint. Don't go there.

Oops, hadn't thought of that. Back out, undo, etc. :)

--
George Anzinger [email protected]
High-res-timers:
http://sourceforge.net/projects/high-res-timers/
Preemption patch:
http://www.kernel.org/pub/linux/kernel/people/rml

2002-10-21 12:57:03

by Alan

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Fri, 2002-10-18 at 17:13, Stephen Hemminger wrote:
> It would be great to rework the whole TSC time of day stuff to work with
> per cpu data and allow unsychronized TSC's like NUMA. The problem is
> that for fast user level access, there would need to be some way to find

The timer isn't even necessarily constant rate. The TSC is a nice tool
for debugging. Using it as a clock was not, in the long run, brilliant.
Don't try to take it further; we have ACPI and HPET and other
better solutions in upcoming PC hardware.


2002-10-21 17:10:00

by john stultz

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

On Mon, 2002-10-21 at 06:18, Alan Cox wrote:
> On Fri, 2002-10-18 at 17:13, Stephen Hemminger wrote:
> > It would be great to rework the whole TSC time of day stuff to work with
> > per cpu data and allow unsychronized TSC's like NUMA. The problem is
> > that for fast user level access, there would need to be some way to find
>
> The timer isnt even necessarily constant rate. The tsc is a nice tool
> for debugging. Using it as a clock was not in the long run brilliant.
> Don't try and continue it further, we have ACPI and HPET and other
> better solutions in upcoming PC hardware.

Yes, I also feel all this per-cpu TSC stuff is not the way to go (on top
of all the per-cpu mapping, etc. you'd also have to round-robin the
timer interrupt so each cpu has a last_tsc_low value, and then if cpus are
varying in frequency you'd have to recalculate that occasionally; it just
gets messy). The current vsyscall implementation uses the TSC because on
99% of the boxes out there the TSC is in sync and works fine as a time
source (i.e. normal gettimeofday uses it). On boxes that don't use the
TSC, the vsyscall shouldn't be mapped in until we have an alternate HPET
(or equivalent) vsyscall solution.

I'll see if I can clean that up later today. I also didn't mind Andrea's
suggestion of using a /proc entry to disable/check for vsyscalls, so I
might give that a whirl as well.

thanks for all the feedback, everyone!
-john

2002-10-21 22:38:43

by Stephen Hemminger

Subject: Re: [RFC][PATCH] linux-2.5.34_vsyscall_A0

This patch doesn't work if the kernel is compiled with kernel symbols
(kallsyms) enabled.
-------------------------------------------------------------------
ld -m elf_i386 -e stext -T arch/i386/vmlinux.lds.s arch/i386/kernel/head.o arch/i386/kernel/init_task.o init/built-in.o --start-group arch/i386/kernel/built-in.o arch/i386/mm/built-in.o arch/i386/mach-generic/built-in.o kernel/built-in.o mm/built-in.o fs/built-in.o ipc/built-in.o security/built-in.o lib/lib.a arch/i386/lib/lib.a drivers/built-in.o sound/built-in.o arch/i386/pci/built-in.o arch/i386/oprofile/built-in.o net/built-in.o --end-group .tmp_kallsyms1.o -o .tmp_vmlinux2
ld: section .vsyscall_0 [c0328000 -> c03280e1] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .vxtime_sequence [c0328100 -> c0328107] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .last_tsc_low [c0328108 -> c032810b] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .delay_at_last_interrupt [c032810c -> c032810f] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .fast_gettimeoffset_quotient [c0328110 -> c0328113] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .wall_jiffies [c0328114 -> c0328117] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .sys_tz [c0328118 -> c032811f] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .jiffies [c0328120 -> c0328123] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .xtime [c0328130 -> c0328137] overlaps section __kallsyms [c0327e60 -> c03afe8f]
ld: section .vsyscall_1 [c0328400 -> c0328436] overlaps section __kallsyms [c0327e60 -> c03afe8f]