2011-06-07 19:32:54

by Andrew Lutomirski

Subject: [PATCH 0/5] Remove remaining executable code from x86-64 vsyscall page

After the vsyscall emulation patches, the only real executable code left
in the vsyscall page is vread_tsc and vread_hpet. That code is only
called from the vDSO, so move it into the vDSO.

The first four patches prepare to move the code.

vread_tsc() uses rdtsc_barrier(), which contains two alternatives.
Patches 1 and 2 make alternative patching work in the vDSO. (This has a
slightly odd side effect that the vDSO image dumped from memory doesn't
quite match the debug version anymore, but it's hard to imagine that
causing problems.)
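
For context, rdtsc_barrier() in kernels of this era expands to two
alternative-patched fences, roughly the following (a sketch from
asm/system.h of the time; the exact NOP and feature-flag names may
vary by tree):

static __always_inline void rdtsc_barrier(void)
{
	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}

Each alternative() emits an entry into .altinstructions, which is why
the vDSO needs working alternative patching before vread_tsc can move
there.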

Patch 3 fixes an annoyance I found while writing this code. If you
introduce an undefined symbol into the vDSO, you get an unhelpful error
message. ld is smart enough to give a nice error if you ask it to.

Patch 4 cleans up the architecture-specific part of struct clocksource.
IA64 had its own ifdefed code in there, and the supposedly generic vread
pointer was only used by x86-64. With the patch, each arch gets to set
up its own private part of struct clocksource.
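
Concretely, the hook has this shape: an arch that wants private data
defines, in its asm/clocksource.h,

#define __ARCH_HAS_CLOCKSOURCE_DATA

struct arch_clocksource_data {
	cycle_t (*vread)(void);	/* x86-64's member; each arch picks its own */
};

and struct clocksource grows a struct arch_clocksource_data archdata
member only when __ARCH_HAS_CLOCKSOURCE_DATA is defined.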

*** Note to IA64 people: I have not even compile-tested this on IA64. ***

Patch 5 is the meat. It moves vread_tsc and vread_hpet into the vDSO
where they belong, and it's a net deletion of code because it removes a
bunch of magic needed to make regular functions accessible through the
vsyscall page.

With all of these patches applied, every single byte in the vsyscall
page is some sort of trap instruction.

This series applies to the x86/vdso branch of the -tip tree.

Andy Lutomirski (5):
x86: Make alternative instruction pointers relative
x86-64: Allow alternative patching in the vDSO
x86-64: Add --no-undefined to vDSO build
clocksource: Replace vread and fsys_mmio with generic arch data
x86-64: Move vread_tsc and vread_hpet into the vDSO

arch/ia64/include/asm/clocksource.h | 16 ++++++++++
arch/ia64/kernel/cyclone.c | 2 +-
arch/ia64/kernel/time.c | 2 +-
arch/ia64/sn/kernel/sn2/timer.c | 2 +-
arch/x86/include/asm/alternative-asm.h | 4 +-
arch/x86/include/asm/alternative.h | 8 ++--
arch/x86/include/asm/clocksource.h | 20 ++++++++++++
arch/x86/include/asm/cpufeature.h | 8 ++--
arch/x86/include/asm/tsc.h | 4 --
arch/x86/include/asm/vgtod.h | 2 +-
arch/x86/include/asm/vsyscall.h | 4 --
arch/x86/kernel/Makefile | 7 +----
arch/x86/kernel/alternative.c | 23 ++++++--------
arch/x86/kernel/hpet.c | 9 +-----
arch/x86/kernel/tsc.c | 2 +-
arch/x86/kernel/vmlinux.lds.S | 3 --
arch/x86/kernel/vread_tsc_64.c | 36 ----------------------
arch/x86/kernel/vsyscall_64.c | 2 +-
arch/x86/lib/copy_page_64.S | 9 ++----
arch/x86/lib/memmove_64.S | 11 +++----
arch/x86/vdso/Makefile | 1 +
arch/x86/vdso/vclock_gettime.c | 52 ++++++++++++++++++++++++++++----
arch/x86/vdso/vma.c | 32 +++++++++++++++++++
drivers/char/hpet.c | 2 +-
include/asm-generic/clocksource.h | 4 ++
include/linux/clocksource.h | 13 ++++----
26 files changed, 162 insertions(+), 116 deletions(-)
create mode 100644 arch/ia64/include/asm/clocksource.h
create mode 100644 arch/x86/include/asm/clocksource.h
delete mode 100644 arch/x86/kernel/vread_tsc_64.c
create mode 100644 include/asm-generic/clocksource.h

--
1.7.5.2


2011-06-07 19:33:02

by Andrew Lutomirski

Subject: [PATCH 1/5] x86: Make alternative instruction pointers relative

This saves a few bytes on x86-64 and means that future patches can
apply alternatives to unrelocated code.
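
The scheme, in sketch form (the resolving side is lifted straight
from the apply_alternatives() hunk below): struct alt_instr stores
self-relative s32 offsets instead of absolute pointers, and the
patcher recovers the addresses by adding each field's own address
back:

	/* stored at build time: offset = target - &field */
	instr = (u8 *)&a->instr_offset + a->instr_offset;
	replacement = (u8 *)&a->repl_offset + a->repl_offset;

Because the offsets are position-independent, the table entries stay
valid for code that hasn't yet been relocated to its final address,
which is exactly what the vDSO needs.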

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/alternative-asm.h | 4 ++--
arch/x86/include/asm/alternative.h | 8 ++++----
arch/x86/include/asm/cpufeature.h | 8 ++++----
arch/x86/kernel/alternative.c | 21 +++++++++++++--------
arch/x86/lib/copy_page_64.S | 9 +++------
arch/x86/lib/memmove_64.S | 11 +++++------
6 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 94d420b..4554cc6 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -17,8 +17,8 @@

.macro altinstruction_entry orig alt feature orig_len alt_len
.align 8
- .quad \orig
- .quad \alt
+ .long \orig - .
+ .long \alt - .
.word \feature
.byte \orig_len
.byte \alt_len
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bf535f9..23fb6d7 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -43,8 +43,8 @@
#endif

struct alt_instr {
- u8 *instr; /* original instruction */
- u8 *replacement;
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
@@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
"661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
+ " .long 661b - .\n" /* label */ \
+ " .long 663f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 71cc380..8a1920e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -331,8 +331,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "0\n" /* no replacement */
+ " .long 1b - .\n"
+ " .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
@@ -349,8 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
"2:\n"
".section .altinstructions,\"a\"\n"
_ASM_ALIGN "\n"
- _ASM_PTR "1b\n"
- _ASM_PTR "3f\n"
+ " .long 1b - .\n"
+ " .long 3f - .\n"
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d5..ddb207b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -263,6 +263,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
+ u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];

DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +277,29 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
- u8 *instr = a->instr;
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+
+ /* 0xe8 is a relative call; fix the offset. */
+ if (*insnbuf == 0xe8 && a->replacementlen == 5)
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+
#ifdef CONFIG_X86_64
/* vsyscall code is not mapped yet. resolve it manually. */
if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __func__, a->instr, instr);
}
#endif
- memcpy(insnbuf, a->replacement, a->replacementlen);
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
text_poke_early(instr, insnbuf, a->instrlen);
}
}
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1..01c805b 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@

#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>

ALIGN
copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
2:
.previous
.section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lcopy_page_end - copy_page
- .byte 2b - 1b
+ altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
+ .Lcopy_page_end-copy_page, 2b-1b
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index d0ec9c2..ee16461 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -9,6 +9,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>

#undef memmove

@@ -214,11 +215,9 @@ ENTRY(memmove)
.previous

.section .altinstructions,"a"
- .align 8
- .quad .Lmemmove_begin_forward
- .quad .Lmemmove_begin_forward_efs
- .word X86_FEATURE_ERMS
- .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
- .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ altinstruction_entry .Lmemmove_begin_forward, \
+ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
+ .Lmemmove_end_forward-.Lmemmove_begin_forward, \
+ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
ENDPROC(memmove)
--
1.7.5.2

2011-06-07 19:33:04

by Andrew Lutomirski

Subject: [PATCH 2/5] x86-64: Allow alternative patching in the vDSO

This code is short enough and different enough from the module
loader that it's not worth trying to share anything.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/vdso/vma.c | 32 ++++++++++++++++++++++++++++++++
1 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be..b8b074c1 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -23,11 +23,43 @@ extern unsigned short vdso_sync_cpuid;
static struct page **vdso_pages;
static unsigned vdso_size;

+static void patch_vdso(void *vdso, size_t len)
+{
+ Elf64_Ehdr *hdr = vdso;
+ Elf64_Shdr *sechdrs, *alt_sec = 0;
+ char *secstrings;
+ void *alt_data;
+ int i;
+
+ BUG_ON(len < sizeof(Elf64_Ehdr));
+ BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
+
+ sechdrs = (void *)hdr + hdr->e_shoff;
+ secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf64_Shdr *shdr = &sechdrs[i];
+ if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
+ alt_sec = shdr;
+ break;
+ }
+ }
+
+ if (!alt_sec)
+ return; /* nothing to patch */
+
+ alt_data = (void *)hdr + alt_sec->sh_offset;
+
+ apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
+}
+
static int __init init_vdso_vars(void)
{
int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
int i;

+ patch_vdso(vdso_start, vdso_end - vdso_start);
+
vdso_size = npages << PAGE_SHIFT;
vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
if (!vdso_pages)
--
1.7.5.2

2011-06-07 19:33:08

by Andrew Lutomirski

Subject: [PATCH 3/5] x86-64: Add --no-undefined to vDSO build

This gives much nicer diagnostics when something goes wrong. It's
supported at least as far back as binutils 2.15.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/vdso/Makefile | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index bef0bc9..5d17950 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
export CPPFLAGS_vdso.lds += -P -C

VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
+ -Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096

$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
--
1.7.5.2

2011-06-07 19:33:23

by Andrew Lutomirski

Subject: [PATCH 4/5] clocksource: Replace vread and fsys_mmio with generic arch data

The vread field was bloating struct clocksource everywhere except
x86_64, and I want to change the way this works on x86_64, so let's
split it out into per-arch data.

Cc: [email protected]
Cc: Clemens Ladisch <[email protected]>
Cc: [email protected]
Cc: Tony Luck <[email protected]>
Cc: Fenghua Yu <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/ia64/include/asm/clocksource.h | 16 ++++++++++++++++
arch/ia64/kernel/cyclone.c | 2 +-
arch/ia64/kernel/time.c | 2 +-
arch/ia64/sn/kernel/sn2/timer.c | 2 +-
arch/x86/include/asm/clocksource.h | 16 ++++++++++++++++
arch/x86/kernel/hpet.c | 2 +-
arch/x86/kernel/tsc.c | 2 +-
arch/x86/kernel/vsyscall_64.c | 2 +-
drivers/char/hpet.c | 2 +-
include/asm-generic/clocksource.h | 4 ++++
include/linux/clocksource.h | 13 ++++++-------
11 files changed, 49 insertions(+), 14 deletions(-)
create mode 100644 arch/ia64/include/asm/clocksource.h
create mode 100644 arch/x86/include/asm/clocksource.h
create mode 100644 include/asm-generic/clocksource.h

diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
new file mode 100644
index 0000000..453f363
--- /dev/null
+++ b/arch/ia64/include/asm/clocksource.h
@@ -0,0 +1,16 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define __ARCH_HAS_CLOCKSOURCE_DATA
+
+struct arch_clocksource_data {
+ void *fsys_mmio; /* used by fsyscall asm code */
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index f64097b..4826ff9 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -115,7 +115,7 @@ int __init init_cyclone_clock(void)
}
/* initialize last tick */
cyclone_mc = cyclone_timer;
- clocksource_cyclone.fsys_mmio = cyclone_timer;
+ clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);

return 0;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 85118df..43920de 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -468,7 +468,7 @@ void update_vsyscall(struct timespec *wall, struct timespec *wtm,
fsyscall_gtod_data.clk_mask = c->mask;
fsyscall_gtod_data.clk_mult = mult;
fsyscall_gtod_data.clk_shift = c->shift;
- fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
+ fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio;
fsyscall_gtod_data.clk_cycle_last = c->cycle_last;

/* copy kernel time structures */
diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c
index c34efda..0f8844e 100644
--- a/arch/ia64/sn/kernel/sn2/timer.c
+++ b/arch/ia64/sn/kernel/sn2/timer.c
@@ -54,7 +54,7 @@ ia64_sn_udelay (unsigned long usecs)

void __init sn_timer_init(void)
{
- clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR;
+ clocksource_sn2.archdata.fsys_mmio = RTC_COUNTER_ADDR;
clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second);

ia64_udelay = &ia64_sn_udelay;
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 0000000..a5df33f
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,16 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define __ARCH_HAS_CLOCKSOURCE_DATA
+
+struct arch_clocksource_data {
+ cycle_t (*vread)(void);
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e9f5605..0e07257 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -753,7 +753,7 @@ static struct clocksource clocksource_hpet = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
- .vread = vread_hpet,
+ .archdata = { .vread = vread_hpet },
#endif
};

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922..e7a74b8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
- .vread = vread_tsc,
+ .archdata = { .vread = vread_tsc },
#endif
};

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 10cd8ac..eb0d3ef 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);

/* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
+ vsyscall_gtod_data.clock.vread = clock->archdata.vread;
vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = mult;
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 051474c..0557651 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -931,7 +931,7 @@ int hpet_alloc(struct hpet_data *hdp)
#ifdef CONFIG_IA64
if (!hpet_clocksource) {
hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
- CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr);
+ clocksource_hpet.archdata.fsys_mmio = hpet_mctr;
clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
hpetp->hp_clocksource = &clocksource_hpet;
hpet_clocksource = &clocksource_hpet;
diff --git a/include/asm-generic/clocksource.h b/include/asm-generic/clocksource.h
new file mode 100644
index 0000000..0a462d3
--- /dev/null
+++ b/include/asm-generic/clocksource.h
@@ -0,0 +1,4 @@
+/*
+ * Architectures should override this file to add private userspace
+ * clock magic if needed.
+ */
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d4646b4..6bb6970 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -22,6 +22,8 @@
typedef u64 cycle_t;
struct clocksource;

+#include <asm/clocksource.h>
+
/**
* struct cyclecounter - hardware abstraction for a free running counter
* Provides completely state-free accessors to the underlying hardware.
@@ -153,7 +155,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
* @shift: cycle to nanosecond divisor (power of two)
* @max_idle_ns: max idle time permitted by the clocksource (nsecs)
* @flags: flags describing special properties
- * @vread: vsyscall based read
+ * @archdata: arch-specific data
* @suspend: suspend function for the clocksource, if necessary
* @resume: resume function for the clocksource, if necessary
*/
@@ -169,16 +171,13 @@ struct clocksource {
u32 shift;
u64 max_idle_ns;

-#ifdef CONFIG_IA64
- void *fsys_mmio; /* used by fsyscall asm code */
-#define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr))
-#else
-#define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0)
+#ifdef __ARCH_HAS_CLOCKSOURCE_DATA
+ struct arch_clocksource_data archdata;
#endif
+
const char *name;
struct list_head list;
int rating;
- cycle_t (*vread)(void);
int (*enable)(struct clocksource *cs);
void (*disable)(struct clocksource *cs);
unsigned long flags;
--
1.7.5.2

2011-06-07 19:33:41

by Andrew Lutomirski

Subject: [PATCH 5/5] x86-64: Move vread_tsc and vread_hpet into the vDSO

The vsyscall page now consists entirely of trap instructions.
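
In outline, the vDSO-side dispatch that replaces the old function
pointer (this is the vgetns() change from the vclock_gettime.c hunk
below):

	if (gtod->clock.vclock_mode == VCLOCK_TSC)
		cycles = vread_tsc();
	else
		cycles = vread_hpet();

with the entry points falling back to the real syscall whenever
vclock_mode is VCLOCK_NONE.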

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/clocksource.h | 6 +++-
arch/x86/include/asm/tsc.h | 4 ---
arch/x86/include/asm/vgtod.h | 2 +-
arch/x86/include/asm/vsyscall.h | 4 ---
arch/x86/kernel/Makefile | 7 +----
arch/x86/kernel/alternative.c | 8 -----
arch/x86/kernel/hpet.c | 9 +-----
arch/x86/kernel/tsc.c | 2 +-
arch/x86/kernel/vmlinux.lds.S | 3 --
arch/x86/kernel/vread_tsc_64.c | 36 -------------------------
arch/x86/kernel/vsyscall_64.c | 2 +-
arch/x86/vdso/vclock_gettime.c | 52 +++++++++++++++++++++++++++++++----
12 files changed, 56 insertions(+), 79 deletions(-)
delete mode 100644 arch/x86/kernel/vread_tsc_64.c

diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index a5df33f..3882c65 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -7,8 +7,12 @@

#define __ARCH_HAS_CLOCKSOURCE_DATA

+#define VCLOCK_NONE 0 /* No vDSO clock available. */
+#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
+#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
+
struct arch_clocksource_data {
- cycle_t (*vread)(void);
+ int vclock_mode;
};

#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9db5583..83e2efd 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);

-#ifdef CONFIG_X86_64
-extern cycles_t vread_tsc(void);
-#endif
-
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index aa5add8..815285b 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -13,7 +13,7 @@ struct vsyscall_gtod_data {

struct timezone sys_tz;
struct { /* extract of a clocksource struct */
- cycle_t (*vread)(void);
+ int vclock_mode;
cycle_t cycle_last;
cycle_t mask;
u32 mult;
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index bb710cb..fa904ab 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,10 +16,6 @@ enum vsyscall_num {
#ifdef __KERNEL__
#include <linux/seqlock.h>

-/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __vsyscall_fn \
- __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
-
#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cc0469a..2deef3d 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,17 +24,12 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
CFLAGS_hpet.o := $(nostackp)
-CFLAGS_vread_tsc_64.o := $(nostackp)
CFLAGS_paravirt.o := $(nostackp)
GCOV_PROFILE_vsyscall_64.o := n
GCOV_PROFILE_hpet.o := n
GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_vread_tsc_64.o := n
GCOV_PROFILE_paravirt.o := n

-# vread_tsc_64 is hot and should be fully optimized:
-CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
-
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +38,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ddb207b..c638228 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
-#include <asm/vsyscall.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-extern char __vsyscall_0;
void *text_poke_early(void *addr, const void *opcode, size_t len);

/* Replace instructions with better alternatives for this CPU type.
@@ -294,12 +292,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);

-#ifdef CONFIG_X86_64
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- }
-#endif
text_poke_early(instr, insnbuf, a->instrlen);
}
}
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0e07257..d10cc00 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -738,13 +738,6 @@ static cycle_t read_hpet(struct clocksource *cs)
return (cycle_t)hpet_readl(HPET_COUNTER);
}

-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-#endif
-
static struct clocksource clocksource_hpet = {
.name = "hpet",
.rating = 250,
@@ -753,7 +746,7 @@ static struct clocksource clocksource_hpet = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
- .archdata = { .vread = vread_hpet },
+ .archdata = { .vclock_mode = VCLOCK_HPET },
#endif
};

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e7a74b8..56c633a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
- .archdata = { .vread = vread_tsc },
+ .archdata = { .vclock_mode = VCLOCK_TSC },
#endif
};

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8017471..4aa9c54 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -169,9 +169,6 @@ SECTIONS
.vsyscall : AT(VLOAD(.vsyscall)) {
*(.vsyscall_0)

- . = ALIGN(L1_CACHE_BYTES);
- *(.vsyscall_fn)
-
. = 1024;
*(.vsyscall_1)

diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e..0000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* This code runs in userspace. */
-
-#define DISABLE_BRANCH_PROFILING
-#include <asm/vgtod.h>
-
-notrace cycle_t __vsyscall_fn vread_tsc(void)
-{
- cycle_t ret;
- u64 last;
-
- /*
- * Empirically, a fence (of type that depends on the CPU)
- * before rdtsc is enough to ensure that rdtsc is ordered
- * with respect to loads. The various CPU manuals are unclear
- * as to whether rdtsc can be reordered with later loads,
- * but no one has ever seen it happen.
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
-
- last = VVAR(vsyscall_gtod_data).clock.cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a funciton of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index eb0d3ef..584431f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);

/* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->archdata.vread;
+ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = mult;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index cf54813..9869bac 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -25,6 +25,43 @@

#define gtod (&VVAR(vsyscall_gtod_data))

+notrace static cycle_t vread_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static notrace cycle_t vread_hpet(void)
+{
+ return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
+
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
@@ -36,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
notrace static inline long vgetns(void)
{
long v;
- cycles_t (*vread)(void);
- vread = gtod->clock.vread;
- v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
+ cycles_t cycles;
+ if (gtod->clock.vclock_mode == VCLOCK_TSC)
+ cycles = vread_tsc();
+ else
+ cycles = vread_hpet();
+ v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
return (v * gtod->clock.mult) >> gtod->clock.shift;
}

@@ -118,11 +158,11 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
switch (clock) {
case CLOCK_REALTIME:
- if (likely(gtod->clock.vread))
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
return do_realtime(ts);
break;
case CLOCK_MONOTONIC:
- if (likely(gtod->clock.vread))
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
return do_monotonic(ts);
break;
case CLOCK_REALTIME_COARSE:
@@ -139,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
long ret;
- if (likely(gtod->clock.vread)) {
+ if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
if (likely(tv != NULL)) {
BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
offsetof(struct timespec, tv_nsec) ||
--
1.7.5.2

2011-06-07 20:28:59

by john stultz

Subject: Re: [PATCH 4/5] clocksource: Replace vread and fsys_mmio with generic arch data

On Tue, 2011-06-07 at 15:32 -0400, Andy Lutomirski wrote:
> The vread field was bloating struct clocksource everywhere except
> x86_64, and I want to change the way this works on x86_64, so let's
> split it out into per-arch data.
[snip]
> diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
> new file mode 100644
> index 0000000..453f363
> --- /dev/null
> +++ b/arch/ia64/include/asm/clocksource.h
> @@ -0,0 +1,16 @@
> +/* x86-specific clocksource additions */
> +
> +#ifndef _ASM_X86_CLOCKSOURCE_H
> +#define _ASM_X86_CLOCKSOURCE_H
> +
> +#ifdef CONFIG_X86_64

Why do we want X86_64 ifdefs in the ia64 clocksource.h?


> +#define __ARCH_HAS_CLOCKSOURCE_DATA
> +
> +struct arch_clocksource_data {
> + void *fsys_mmio; /* used by fsyscall asm code */
> +};
> +
> +#endif /* CONFIG_X86_64 */
> +
> +#endif /* _ASM_X86_CLOCKSOURCE_H */
> diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
> index f64097b..4826ff9 100644
> --- a/arch/ia64/kernel/cyclone.c
> +++ b/arch/ia64/kernel/cyclone.c
> @@ -115,7 +115,7 @@ int __init init_cyclone_clock(void)
> }
> /* initialize last tick */
> cyclone_mc = cyclone_timer;
> - clocksource_cyclone.fsys_mmio = cyclone_timer;
> + clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
> clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
>
> return 0;
> diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
> index 85118df..43920de 100644
> --- a/arch/ia64/kernel/time.c
> +++ b/arch/ia64/kernel/time.c
> @@ -468,7 +468,7 @@ void update_vsyscall(struct timespec *wall, struct timespec *wtm,
> fsyscall_gtod_data.clk_mask = c->mask;
> fsyscall_gtod_data.clk_mult = mult;
> fsyscall_gtod_data.clk_shift = c->shift;
> - fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
> + fsyscall_gtod_data.clk_fsys_mmio = c->archdata.fsys_mmio;
> fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
>
> /* copy kernel time structures */

Overall this sort of feels a little messy to me.

While having the ifdefs in the clocksource structure wasn't great, I'm
not super excited about pushing all of this back into arch-specific
code. The hope was that folks like ppc and ia64 would convert over from
their own implementations to using more generic vread() implementations,
or at least new arches with vdso implementations would make use of it
(possibly even allowing for a generic vdso gettime implementation).

Are there at least some hard numbers that help justify this? Or maybe
could you provide some thoughts on your future plans?

thanks
-john

2011-06-07 20:36:15

by Andrew Lutomirski

Subject: Re: [PATCH 4/5] clocksource: Replace vread and fsys_mmio with generic arch data

On Tue, Jun 7, 2011 at 4:28 PM, john stultz <[email protected]> wrote:
> On Tue, 2011-06-07 at 15:32 -0400, Andy Lutomirski wrote:
>> The vread field was bloating struct clocksource everywhere except
>> x86_64, and I want to change the way this works on x86_64, so let's
>> split it out into per-arch data.
> [snip]
>> diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
>> new file mode 100644
>> index 0000000..453f363
>> --- /dev/null
>> +++ b/arch/ia64/include/asm/clocksource.h
>> @@ -0,0 +1,16 @@
>> +/* x86-specific clocksource additions */
>> +
>> +#ifndef _ASM_X86_CLOCKSOURCE_H
>> +#define _ASM_X86_CLOCKSOURCE_H
>> +
>> +#ifdef CONFIG_X86_64
>
> Why do we want X86_64 ifdefs in the ia64 clocksource.h?
>

We don't. That was a copy-and-paste-o.

>
> Overall this sort of feels a little messy to me.
>
> While having the ifdefs in the clocksource structure wasn't great, I'm
> not super excited about pushing all of this back into arch-specific
> code. The hope was that folks like ppc and ia64 would convert over from
> their own implementations to using more generic vread() implementations,
> or at least new arches with vdso implementations would make use of it
> (possibly even allowing for a generic vdso gettime implementation).
>
> Are there at least some hard numbers that help justify this? Or maybe
> could you provide some thoughts on your future plans?

No numbers because there's no speedup (yet). Although I do want to
inline at least the TSC implementation eventually.

The real reason is security. Having vread be a function pointer
implies that userspace code can find that function at a fixed address,
which is a bad idea from a security POV (and I hope it's not even true
on any architecture except x86-64). The x86-64 vsyscall is finally
cleaned up to the point that the vread functions are the only pieces of
user-executable code left at a fixed address, and I want to get rid of
them as well.

The followup change (patch 5) changes vread on x86-64 to specify TSC,
HPET, or fallback to syscall, and the vDSO reads it and acts
accordingly. This is unfortunate in that it hardcodes assumptions
about the clocksources.

The only other way I can think of to do it is to make the pointer
point into the vDSO. That would involve making it relative to
something in the vDSO, which would IMO be messier both in terms of
computing the pointer and in terms of calling whatever it points to.


Note that the vsyscall_fn hack is rather x86-64-specific as is.


--Andy

2011-06-07 21:06:54

by john stultz

Subject: Re: [PATCH 4/5] clocksource: Replace vread and fsys_mmio with generic arch data

On Tue, 2011-06-07 at 16:35 -0400, Andrew Lutomirski wrote:
> On Tue, Jun 7, 2011 at 4:28 PM, john stultz <[email protected]> wrote:
> >
> > While having the ifdefs in the clocksource structure wasn't great, I'm
> > not super excited about pushing all of this back into arch-specific
> > code. The hope was that folks like ppc and ia64 would convert over from
> > their own implementations to using more generic vread() implementations,
> > or atleast new arches with vdso implementations would make use of it
> > (possibly even allowing for a generic vdso gettime implementation).
> >
> > Are there at least some hard numbers that help justify this? Or maybe
> > could you provide some thoughts on your future plans?
>
> No numbers because there's no speedup (yet). Although I do want to
> inline at least the TSC implementation eventually.
>
> The real reason is security. Having vread be a function pointer
> implies that userspace code can find that function at a fixed address,
> which is a bad idea from a security POV (and I hope it's not even true
> on any architecture except x86-64).

Something to this effect should go into the change-log then.

> The x86-64 vsyscall is finally
> cleaned up to the point that the vread functions are the only pieces of
> user-executable code left at a fixed address, and I want to get rid of
> them as well.
>
> The followup change (patch 5) changes vread on x86-64 to specify TSC,

Oh, sorry, I didn't see the rest of the patchset. Apologies. :)


> HPET, or fallback to syscall, and the vDSO reads it and acts
> accordingly. This is unfortunate in that it hardcodes assumptions
> about the clocksources.

Yea, that is unfortunate. Hmm.

> The only other way I can think of to do it is to make the pointer
> point into the vDSO. That would involve making it relative to
> something in the vDSO, which would IMO be messier both in terms of
> computing the pointer and in terms of calling whatever it points to.

Hrm. I'm not super familiar with how the vDSO randomization works, so I
can't really provide any informed insight here.

But I'd def like to someday get the vDSO stuff to be as generic as
possible. I already have some timekeeping changes I'd like to do which
affect the update_vsyscall path. And that is a total pain to do, since I
have to sync changes with x86, powerpc and ia64 (and the ia64 asm isn't
something I'm likely to touch :) and get those changes in all at once,
or introduce some redundant call or have lots of ifdef magic to keep
compatibility while each arch adapts to the changes.

So yea, I guess the fixed pointer removal is a priority, but I suspect
these changes will cause some maintenance pain in the future.

thanks
-john

2011-06-07 21:18:39

by Andrew Lutomirski

Subject: Re: [PATCH 4/5] clocksource: Replace vread and fsys_mmio with generic arch data

[cc: linux-arch]

On Tue, Jun 7, 2011 at 5:06 PM, john stultz <[email protected]> wrote:
> On Tue, 2011-06-07 at 16:35 -0400, Andrew Lutomirski wrote:
>> On Tue, Jun 7, 2011 at 4:28 PM, john stultz <[email protected]> wrote:
>> >
>> > While having the ifdefs in the clocksource structure wasn't great, I'm
>> > not super excited about pushing all of this back into arch-specific
>> > code. The hope was that folks like ppc and ia64 would convert over from
>> > their own implementations to using more generic vread() implementations,
>> > or at least new arches with vdso implementations would make use of it
>> > (possibly even allowing for a generic vdso gettime implementation).
>> >
>> > Are there at least some hard numbers that help justify this? Or maybe
>> > could you provide some thoughts on your future plans?
>>
>> No numbers because there's no speedup (yet). Although I do want to
>> inline at least the TSC implementation eventually.
>>
>> The real reason is security. Having vread be a function pointer
>> implies that userspace code can find that function at a fixed address,
>> which is a bad idea from a security POV (and I hope it's not even true
>> on any architecture except x86-64).
>
> Something to this effect should go into the change-log then.

Will do.

>> The followup change (patch 5) changes vread on x86-64 to specify TSC,
>
> Oh, sorry, I didn't see the rest of the patchset. Apologies. :)

No problem. I guess I should have cc'd you on at least the interesting one.

>
>
>> HPET, or fallback to syscall, and the vDSO reads it and acts
>> accordingly. This is unfortunate in that it hardcodes assumptions
>> about the clocksources.
>
> Yea, that is unfortunate. Hmm.
>
>> The only other way I can think of to do it is to make the pointer
>> point into the vDSO. That would involve making it relative to
>> something in the vDSO, which would IMO be messier both in terms of
>> computing the pointer and in terms of calling whatever it points to.
>
> Hrm. I'm not super familiar with how the vDSO randomization works, so I
> can't really provide any informed insight here.
>
> But I'd def like to someday get the vDSO stuff to be as generic as
> possible. I already have some timekeeping changes I'd like to do which
> affect the update_vsyscall path. And that is a total pain to do, since I
> have to sync changes with x86, powerpc and ia64 (and the ia64 asm isn't
> something I'm likely to touch :) and get those changes in all at once,
> or introduce some redundant call or have lots of ifdef magic to keep
> compatibility while each arch adapts to the changes.

The vDSO is just like a regular DSO. It's a bunch of code that's the
same in every process, but it's mapped in a different place for each
process. The kernel knows how to find it if needed.

One idea: Add a directory linux/vDSO (i.e. a vDSO directory in the
root of the tree) that contains files that get compiled into the
vDSO. Then generic timekeeping code can go in there. That could
replace the duplicate clock_gettime implementation that currently
lives in the x86_64 vDSO, for example.

There could be a generic (or at least common) mechanism to compute a
"pointer" (i.e. some pointer-sized blob) to a vDSO function from the
kernel and another mechanism to resolve the pointer from the vDSO.
That pointer could go into vread.
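
Purely as a hypothetical sketch (none of these names exist; it just
shows the shape of the idea): the kernel would publish an offset from
the vDSO load base rather than an address, and the vDSO would resolve
it against its own mapping:

	/* kernel side: offset of the function within the vDSO image */
	long vread_off = (char *)vdso_sym_vread - (char *)vdso_image_start;

	/* vDSO side: resolve against the per-process load address */
	cycle_t (*vread)(void) =
		(void *)((char *)vdso_load_base + vread_off);

which works, but both sides have to agree on exactly what the base is.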


Simpler idea: Make the clock_gettime implementation common (stick it
in a header, for example), but require the arch to supply functions to
check whether vread is available and to actually call vread.
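
Roughly like this, where arch_vread_available() and arch_vread() are
hypothetical names for the per-arch hooks and the body is just the
vgetns() from patch 5:

	/* common header */
	notrace static inline long vgetns(void)
	{
		cycles_t cycles = arch_vread();	/* arch-supplied */
		long v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
		return (v * gtod->clock.mult) >> gtod->clock.shift;
	}

and the entry points would test arch_vread_available() where the
x86-64 code now tests vclock_mode != VCLOCK_NONE.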

--Andy

2011-06-08 12:33:32

by Borislav Petkov

Subject: Re: [PATCH 2/5] x86-64: Allow alternative patching in the vDSO

On Tue, Jun 07, 2011 at 03:32:39PM -0400, Andy Lutomirski wrote:
> This code is short enough and different enough from the module
> loader that it's not worth trying to share anything.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/vdso/vma.c | 32 ++++++++++++++++++++++++++++++++
> 1 files changed, 32 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 7abd2be..b8b074c1 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -23,11 +23,43 @@ extern unsigned short vdso_sync_cpuid;
> static struct page **vdso_pages;
> static unsigned vdso_size;
>
> +static void patch_vdso(void *vdso, size_t len)
> +{
> + Elf64_Ehdr *hdr = vdso;
> + Elf64_Shdr *sechdrs, *alt_sec = 0;
> + char *secstrings;
> + void *alt_data;
> + int i;
> +
> + BUG_ON(len < sizeof(Elf64_Ehdr));
> + BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
> +
> + sechdrs = (void *)hdr + hdr->e_shoff;
> + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
> +
> + for (i = 1; i < hdr->e_shnum; i++) {
> + Elf64_Shdr *shdr = &sechdrs[i];
> + if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
> + alt_sec = shdr;
> + break;
> + }
> + }
> +
> + if (!alt_sec)
> + return; /* nothing to patch */

Just a minor nitpick:

Maybe do

	for (...) {
		if (!strcmp(...)) {
			alt_sec = shdr;
			goto apply;
		}
	}
	return;

apply:

to save yourself the if (!alt_sec) test and the alt_sec = 0 aka NULL
assignment above.

> +
> + alt_data = (void *)hdr + alt_sec->sh_offset;
> +
> + apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
> +}
> +
> static int __init init_vdso_vars(void)
> {
> int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
> int i;
>
> + patch_vdso(vdso_start, vdso_end - vdso_start);
> +
> vdso_size = npages << PAGE_SHIFT;
> vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
> if (!vdso_pages)
> --
> 1.7.5.2

Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
General Managers: Alberto Bozzo, Andrew Bowd
Registration: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632