2014-12-17 23:12:36

by Shaohua Li

[permalink] [raw]
Subject: [PATCH v2 1/3] X86: make VDSO data support multiple pages

Currently the vdso data area is a single page. Subsequent patches will add
per-cpu data to the vdso, which requires several pages when the CPU count
is large (for example, with 64-byte cache lines and CONFIG_NR_CPUS=512, the
cacheline-aligned per-cpu entries alone take 32KB, i.e. 8 pages). This
patch makes the VDSO data area support multiple pages.

Cc: Andy Lutomirski <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Shaohua Li <[email protected]>
---
arch/x86/include/asm/vdso.h | 2 +-
arch/x86/include/asm/vvar.h | 8 ++++++--
arch/x86/kernel/asm-offsets.c | 5 +++++
arch/x86/kernel/vmlinux.lds.S | 6 +++---
arch/x86/tools/relocs.c | 2 +-
arch/x86/vdso/vdso-layout.lds.S | 9 +++++----
arch/x86/vdso/vdso2c.c | 6 +++---
arch/x86/vdso/vma.c | 9 +++++----
8 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 8021bd2..35ca749 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -20,7 +20,7 @@ struct vdso_image {

long sym_vvar_start; /* Negative offset to the vvar area */

- long sym_vvar_page;
+ long sym_vvar_pages;
long sym_hpet_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 3f32dfc..62bc6f8 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -29,7 +29,7 @@

#else

-extern char __vvar_page;
+extern char __vvar_pages;

#define DECLARE_VVAR(offset, type, name) \
extern type vvar_ ## name __attribute__((visibility("hidden")));
@@ -45,7 +45,11 @@ extern char __vvar_page;
/* DECLARE_VVAR(offset, type, name) */

DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
-
+/*
+ * you must update VVAR_TOTAL_SIZE to reflect all of the variables we're
+ * stuffing into the vvar area. Don't change any of the above without
+ * also updating the VVAR_TOTAL_SIZE calculation in asm-offsets.c
+ */
#undef DECLARE_VVAR

#endif
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 9f6b934..0ab31a9 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -16,6 +16,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
+#include <asm/vgtod.h>

#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -71,4 +72,8 @@ void common(void) {

BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ BLANK();
+ DEFINE(VVAR_TOTAL_SIZE,
+ ALIGN(128 + sizeof(struct vsyscall_gtod_data), PAGE_SIZE));
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300..2efeee9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -149,7 +149,7 @@ SECTIONS


. = ALIGN(PAGE_SIZE);
- __vvar_page = .;
+ __vvar_pages = .;

.vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
/* work around gold bug 13023 */
@@ -168,10 +168,10 @@ SECTIONS
* Pad the rest of the page with zeros. Otherwise the loader
* can leave garbage here.
*/
- . = __vvar_beginning_hack + PAGE_SIZE;
+ . = __vvar_beginning_hack + VVAR_TOTAL_SIZE;
} :data

- . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+ . = ALIGN(__vvar_pages + VVAR_TOTAL_SIZE, PAGE_SIZE);

/* Init code and data - will be freed after init */
. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..ea01c48 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -73,7 +73,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"init_per_cpu__.*|"
"__end_rodata_hpage_align|"
#endif
- "__vvar_page|"
+ "__vvar_pages|"
"_end)$"
};

diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index de2c921..413c739 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -1,4 +1,5 @@
#include <asm/vdso.h>
+#include <asm/asm-offsets.h>

/*
* Linker script for vDSO. This is an ELF shared object prelinked to
@@ -25,17 +26,17 @@ SECTIONS
* segment.
*/

- vvar_start = . - 2 * PAGE_SIZE;
- vvar_page = vvar_start;
+ vvar_start = . - (VVAR_TOTAL_SIZE + PAGE_SIZE);
+ vvar_pages = vvar_start;

/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_pages + offset;
#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR

- hpet_page = vvar_start + PAGE_SIZE;
+ hpet_page = vvar_start + VVAR_TOTAL_SIZE;

. = SIZEOF_HEADERS;

diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 8627db2..95eda74 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -71,14 +71,14 @@ const char *outfilename;
/* Symbols that we need in vdso2c. */
enum {
sym_vvar_start,
- sym_vvar_page,
+ sym_vvar_pages,
sym_hpet_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};

const int special_pages[] = {
- sym_vvar_page,
+ sym_vvar_pages,
sym_hpet_page,
};

@@ -89,7 +89,7 @@ struct vdso_sym {

struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
- [sym_vvar_page] = {"vvar_page", true},
+ [sym_vvar_pages] = {"vvar_pages", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 009495b..6496c65 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -19,6 +19,7 @@
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>
+#include <asm/asm-offsets.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
@@ -133,11 +134,11 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
goto up_fail;
}

- if (image->sym_vvar_page)
+ if (image->sym_vvar_pages)
ret = remap_pfn_range(vma,
- text_start + image->sym_vvar_page,
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
- PAGE_SIZE,
+ text_start + image->sym_vvar_pages,
+ __pa_symbol(&__vvar_pages) >> PAGE_SHIFT,
+ VVAR_TOTAL_SIZE,
PAGE_READONLY);

if (ret)
--
1.8.1


2014-12-17 23:12:32

by Shaohua Li

[permalink] [raw]
Subject: [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch

vdso code can't disable preemption, so it can be preempted at any time.
This makes it challenging to implement certain features. This patch adds
a generic API to let vdso code detect a context switch.

A context switch count could be used for the detection: a change in the
count over a given interval tells us that a context switch occurred.
Andy suggested using a timestamp instead, which lets the next patch save
some instructions; the principle is the same. This patch uses the
timestamp approach.
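
For illustration (not part of the patch itself), a vdso-side consumer of
this API would sample the per-cpu timestamp, do its per-cpu reads, and
retry if it may have been preempted in between. A minimal sketch: VVAR(),
__getcpu() and VGETCPU_CPU_MASK are the existing vdso helpers (the real
consumer is added in the next patch), while read_percpu_value() is a
made-up placeholder:

notrace static u64 read_without_preemption(void)
{
	u64 ts, val;
	int cpu;

	do {
		cpu = __getcpu() & VGETCPU_CPU_MASK;
		ts = VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp;

		val = read_percpu_value(cpu);	/* placeholder per-cpu read */

		/*
		 * If this cpu's context switch timestamp changed, or we are
		 * suddenly running on another cpu, we may have been preempted
		 * in the middle of the read; try again.
		 */
	} while (cpu != (__getcpu() & VGETCPU_CPU_MASK) ||
		 ts != VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp);

	return val;
}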

Cc: Andy Lutomirski <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Shaohua Li <[email protected]>
---
arch/x86/Kconfig | 4 ++++
arch/x86/include/asm/vdso.h | 17 +++++++++++++++++
arch/x86/include/asm/vvar.h | 6 ++++++
arch/x86/kernel/asm-offsets.c | 6 ++++++
arch/x86/vdso/vma.c | 1 +
kernel/sched/core.c | 5 +++++
6 files changed, 39 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d69f1cd..e384147 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1943,6 +1943,10 @@ config COMPAT_VDSO
If unsure, say N: if you are compiling your own kernel, you
are unlikely to be using a buggy version of glibc.

+config VDSO_CS_DETECT
+ def_bool y
+ depends on X86_64
+
config CMDLINE_BOOL
bool "Built-in kernel command line"
---help---
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 35ca749..d4556a3 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -49,6 +49,23 @@ extern const struct vdso_image *selected_vdso32;

extern void __init init_vdso_image(const struct vdso_image *image);

+#ifdef CONFIG_VDSO_CS_DETECT
+struct vdso_percpu_data {
+ u64 last_cs_timestamp;
+} ____cacheline_aligned;
+
+struct vdso_data {
+ int dummy;
+ struct vdso_percpu_data vpercpu[0];
+};
+extern struct vdso_data vdso_data;
+
+static inline void vdso_set_cpu_cs_timestamp(int cpu)
+{
+ rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
+}
+#endif
+
#endif /* __ASSEMBLER__ */

#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 62bc6f8..19ac55c 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -45,6 +45,12 @@ extern char __vvar_pages;
/* DECLARE_VVAR(offset, type, name) */

DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+/*
+ * this one needs to be last because it ends with a per-cpu array.
+ */
+DECLARE_VVAR(320, struct vdso_data, vdso_data)
+#endif
/*
* you must update VVAR_TOTAL_SIZE to reflect all of the variables we're
* stuffing into the vvar area. Don't change any of the above without
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 0ab31a9..7321cdc 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,6 +17,7 @@
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/vgtod.h>
+#include <asm/vdso.h>

#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -74,6 +75,11 @@ void common(void) {
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

BLANK();
+#ifdef CONFIG_VDSO_CS_DETECT
+ DEFINE(VVAR_TOTAL_SIZE, ALIGN(320 + sizeof(struct vdso_data)
+ + sizeof(struct vdso_percpu_data) * CONFIG_NR_CPUS, PAGE_SIZE));
+#else
DEFINE(VVAR_TOTAL_SIZE,
ALIGN(128 + sizeof(struct vsyscall_gtod_data), PAGE_SIZE));
+#endif
}
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 6496c65..22b1a69 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -23,6 +23,7 @@

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
+DEFINE_VVAR(struct vdso_data, vdso_data);
#endif

void __init init_vdso_image(const struct vdso_image *image)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b7..d8e882d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2232,6 +2232,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+#ifdef CONFIG_VDSO_CS_DETECT
+ int cpu = smp_processor_id();
+
+ vdso_set_cpu_cs_timestamp(cpu);
+#endif

rq->prev_mm = NULL;

--
1.8.1

2014-12-17 23:12:57

by Shaohua Li

[permalink] [raw]
Subject: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
use the following method to compute the thread cpu time:

t0 = process start
t1 = most recent context switch time
t2 = time at which the vsyscall is invoked

thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
= current->se.sum_exec_runtime + now - sched_clock()

At context switch time we stash away

adj_sched_time = sum_exec_runtime - sched_clock()

in a per-cpu struct in the VVAR page and then compute

thread_cpu_time = adj_sched_time + now

All computations are done in nanosecs on systems where TSC is stable. If
TSC is unstable, we fall back to a regular syscall.

Benchmark data:

for (i = 0; i < 100000000; i++) {
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
}

Baseline:
real 1m3.428s
user 0m5.190s
sys 0m58.220s

patched:
real 0m4.912s
user 0m4.910s
sys 0m0.000s

This should speed up profilers that need to query thread cpu time
frequently to take fine-grained timestamps.

No statistically significant regression was detected in the x86_64 context
switch code. Most architectures that don't support vsyscalls will have this
code disabled via jump labels.

Cc: Andy Lutomirski <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Kumar Sundararajan <[email protected]>
Signed-off-by: Arun Sharma <[email protected]>
Signed-off-by: Chris Mason <[email protected]>
Signed-off-by: Shaohua Li <[email protected]>
---
arch/x86/include/asm/vdso.h | 9 +++++-
arch/x86/kernel/tsc.c | 14 +++++++++
arch/x86/vdso/vclock_gettime.c | 69 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/vdso/vma.c | 10 +++++-
kernel/sched/core.c | 3 ++
5 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index d4556a3..fcdf611 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -52,14 +52,21 @@ extern void __init init_vdso_image(const struct vdso_image *image);
#ifdef CONFIG_VDSO_CS_DETECT
struct vdso_percpu_data {
u64 last_cs_timestamp;
+
+ u64 adj_sched_time;
+ u64 cyc2ns_offset;
+ u32 cyc2ns_mul;
} ____cacheline_aligned;

struct vdso_data {
- int dummy;
+ unsigned int thread_cputime_disabled;
struct vdso_percpu_data vpercpu[0];
};
extern struct vdso_data vdso_data;

+struct static_key;
+extern struct static_key vcpu_thread_cputime_enabled;
+
static inline void vdso_set_cpu_cs_timestamp(int cpu)
{
rdtscll(vdso_data.vpercpu[cpu].last_cs_timestamp);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b7e50bb..dd3b281 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -21,6 +21,7 @@
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/vdso.h>

unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -263,6 +264,11 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);

+#ifdef CONFIG_VDSO_CS_DETECT
+ vdso_data.vpercpu[cpu].cyc2ns_mul = data->cyc2ns_mul;
+ vdso_data.vpercpu[cpu].cyc2ns_offset = data->cyc2ns_offset;
+#endif
+
cyc2ns_write_end(cpu, data);

done:
@@ -989,6 +995,10 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
clear_sched_clock_stable();
disable_sched_clock_irqtime();
+#ifdef CONFIG_VDSO_CS_DETECT
+ vdso_data.thread_cputime_disabled = 1;
+ static_key_slow_dec(&vcpu_thread_cputime_enabled);
+#endif
pr_info("Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
@@ -1202,6 +1212,10 @@ void __init tsc_init(void)

tsc_disabled = 0;
static_key_slow_inc(&__use_tsc);
+#ifdef CONFIG_VDSO_CS_DETECT
+ vdso_data.thread_cputime_disabled = !cpu_has(&boot_cpu_data,
+ X86_FEATURE_RDTSCP);
+#endif

if (!no_sched_irq_time)
enable_sched_clock_irqtime();
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322..0aa39b1 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -17,6 +17,7 @@
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
+#include <asm/vdso.h>
#include <linux/math64.h>
#include <linux/time.h>

@@ -289,6 +290,62 @@ notrace static void do_monotonic_coarse(struct timespec *ts)
} while (unlikely(gtod_read_retry(gtod, seq)));
}

+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+notrace static inline u64 __cycles_2_ns(u64 cyc, u64 scale, u64 offset)
+{
+ u64 ns = offset;
+ ns += mul_u64_u32_shr(cyc, scale, CYC2NS_SCALE_FACTOR);
+ return ns;
+}
+
+notrace static inline u64 vdso_get_cpu_cs_timestamp(int cpu)
+{
+ return VVAR(vdso_data).vpercpu[cpu].last_cs_timestamp;
+}
+
+#define THREAD_CPU_TIME_MAX_LOOP 20
+notrace static u64 do_thread_cpu_time(void)
+{
+ unsigned int p;
+ u64 tscval;
+ u64 adj_sched_time;
+ u64 scale;
+ u64 offset;
+ const struct vdso_data *vp = &VVAR(vdso_data);
+ int cpuo, cpun;
+ int loop = 0;
+
+ do {
+ loop++;
+ if (loop > THREAD_CPU_TIME_MAX_LOOP)
+ return -1LL;
+
+ rdtscpll(tscval, p);
+ cpuo = p & VGETCPU_CPU_MASK;
+ adj_sched_time = vp->vpercpu[cpuo].adj_sched_time;
+ scale = vp->vpercpu[cpuo].cyc2ns_mul;
+ offset = vp->vpercpu[cpuo].cyc2ns_offset;
+
+ cpun = __getcpu() & VGETCPU_CPU_MASK;
+ /*
+ * The CPU check guarantees this runs on the same cpu, so no
+ * barrier is required
+ */
+ } while (unlikely(cpuo != cpun ||
+ tscval <= vdso_get_cpu_cs_timestamp(cpun)));
+
+ return __cycles_2_ns(tscval, scale, offset) + adj_sched_time;
+}
+
+notrace static inline void ns_to_ts(u64 ns, struct timespec *ts)
+{
+ u32 rem;
+ ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &rem);
+ ts->tv_nsec = rem;
+}
+#endif
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
switch (clock) {
@@ -306,6 +363,18 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
case CLOCK_MONOTONIC_COARSE:
do_monotonic_coarse(ts);
break;
+#if defined(CONFIG_VDSO_CS_DETECT) && defined(CONFIG_X86_64)
+ case CLOCK_THREAD_CPUTIME_ID: {
+ u64 time;
+ if (VVAR(vdso_data).thread_cputime_disabled)
+ goto fallback;
+ time = do_thread_cpu_time();
+ if (time == -1LL)
+ goto fallback;
+ ns_to_ts(time, ts);
+ break;
+ }
+#endif
default:
goto fallback;
}
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 22b1a69..237b3af 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -12,6 +12,8 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <linux/jump_label.h>
+#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
@@ -23,7 +25,11 @@

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
-DEFINE_VVAR(struct vdso_data, vdso_data);
+
+DEFINE_VVAR(struct vdso_data, vdso_data) = {
+ .thread_cputime_disabled = 1,
+};
+struct static_key vcpu_thread_cputime_enabled;
#endif

void __init init_vdso_image(const struct vdso_image *image)
@@ -275,6 +281,8 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
#endif

+ if (!vdso_data.thread_cputime_disabled)
+ static_key_slow_inc(&vcpu_thread_cputime_enabled);
cpu_notifier_register_begin();

on_each_cpu(vgetcpu_cpu_init, NULL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8e882d..03e6175 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2236,6 +2236,9 @@ static struct rq *finish_task_switch(struct task_struct *prev)
int cpu = smp_processor_id();

vdso_set_cpu_cs_timestamp(cpu);
+ if (static_key_false(&vcpu_thread_cputime_enabled))
+ vdso_data.vpercpu[cpu].adj_sched_time =
+ current->se.sum_exec_runtime - sched_clock();
#endif

rq->prev_mm = NULL;
--
1.8.1

2014-12-18 23:30:33

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <[email protected]> wrote:
> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
> use the following method to compute the thread cpu time:
>
> t0 = process start
> t1 = most recent context switch time
> t2 = time at which the vsyscall is invoked
>
> thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
> = current->se.sum_exec_runtime + now - sched_clock()
>
> At context switch time we stash away
>
> adj_sched_time = sum_exec_runtime - sched_clock()
>
> in a per-cpu struct in the VVAR page and then compute
>
> thread_cpu_time = adj_sched_time + now
>
> All computations are done in nanosecs on systems where TSC is stable. If
> TSC is unstable, we fall back to a regular syscall.
> Benchmark data:
>
> for (i = 0; i < 100000000; i++) {
> clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
> sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
> }

A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
is spent taking various locks, and I think it could be worth adding a
fast path for the read-my-own-clock case in which we just disable
preemption and read the thing without any locks.

If we're actually going to go the vdso route, I'd like to make the
scheduler hooks clean. Peterz and/or John, what's the right way to
get an arch-specific callback with sum_exec_runtime and an up to date
sched_clock value during a context switch? I'd much rather not add
yet another rdtsc instruction to the scheduler.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 00:23:22

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <[email protected]> wrote:
>> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
>> use the following method to compute the thread cpu time:
>>
>> t0 = process start
>> t1 = most recent context switch time
>> t2 = time at which the vsyscall is invoked
>>
>> thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
>> = current->se.sum_exec_runtime + now - sched_clock()
>>
>> At context switch time we stash away
>>
>> adj_sched_time = sum_exec_runtime - sched_clock()
>>
>> in a per-cpu struct in the VVAR page and then compute
>>
>> thread_cpu_time = adj_sched_time + now
>>
>> All computations are done in nanosecs on systems where TSC is stable. If
>> TSC is unstable, we fall back to a regular syscall.
>> Benchmark data:
>>
>> for (i = 0; i < 100000000; i++) {
>> clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>> sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
>> }
>
> A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
> is spent taking various locks, and I think it could be worth adding a
> fast path for the read-my-own-clock case in which we just disable
> preemption and read the thing without any locks.
>
> If we're actually going to go the vdso route, I'd like to make the
> scheduler hooks clean. Peterz and/or John, what's the right way to
> get an arch-specific callback with sum_exec_runtime and an up to date
> sched_clock value during a context switch? I'd much rather not add
> yet another rdtsc instruction to the scheduler.

Bad news: this patch is incorrect, I think. Take a look at
update_rq_clock -- it does fancy things involving irq time and
paravirt steal time. So this patch could result in extremely
non-monotonic results.

There's CPUCLOCK_VIRT (which, oddly, I see no well-defined API for)
that *might* be a decent approximation.

Thoughts?

--Andy


--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 00:31:26

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <[email protected]> wrote:
> > On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <[email protected]> wrote:
> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
> >> use the following method to compute the thread cpu time:
> >>
> >> t0 = process start
> >> t1 = most recent context switch time
> >> t2 = time at which the vsyscall is invoked
> >>
> >> thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
> >> = current->se.sum_exec_runtime + now - sched_clock()
> >>
> >> At context switch time we stash away
> >>
> >> adj_sched_time = sum_exec_runtime - sched_clock()
> >>
> >> in a per-cpu struct in the VVAR page and then compute
> >>
> >> thread_cpu_time = adj_sched_time + now
> >>
> >> All computations are done in nanosecs on systems where TSC is stable. If
> >> TSC is unstable, we fall back to a regular syscall.
> >> Benchmark data:
> >>
> >> for (i = 0; i < 100000000; i++) {
> >> clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
> >> sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
> >> }
> >
> > A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
> > is spent taking various locks, and I think it could be worth adding a
> > fast path for the read-my-own-clock case in which we just disable
> > preemption and read the thing without any locks.
> >
> > If we're actually going to go the vdso route, I'd like to make the
> > scheduler hooks clean. Peterz and/or John, what's the right way to
> > get an arch-specific callback with sum_exec_runtime and an up to date
> > sched_clock value during a context switch? I'd much rather not add
> > yet another rdtsc instruction to the scheduler.
>
> Bad news: this patch is incorrect, I think. Take a look at
> update_rq_clock -- it does fancy things involving irq time and
> paravirt steal time. So this patch could result in extremely
> non-monotonic results.

Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
optional feature, and it was only added fairly recently. I thought it's
acceptable that the time isn't precise, just like what we had before the
feature was added.

Thanks,
Shaohua

2014-12-19 00:33:19

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Thu, Dec 18, 2014 at 4:30 PM, Shaohua Li <[email protected]> wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> On Thu, Dec 18, 2014 at 3:30 PM, Andy Lutomirski <[email protected]> wrote:
>> > On Wed, Dec 17, 2014 at 3:12 PM, Shaohua Li <[email protected]> wrote:
>> >> This primarily speeds up clock_gettime(CLOCK_THREAD_CPUTIME_ID, ..). We
>> >> use the following method to compute the thread cpu time:
>> >>
>> >> t0 = process start
>> >> t1 = most recent context switch time
>> >> t2 = time at which the vsyscall is invoked
>> >>
>> >> thread_cpu_time = sum(time slices between t0 and t1) + (t2 - t1)
>> >> = current->se.sum_exec_runtime + now - sched_clock()
>> >>
>> >> At context switch time we stash away
>> >>
>> >> adj_sched_time = sum_exec_runtime - sched_clock()
>> >>
>> >> in a per-cpu struct in the VVAR page and then compute
>> >>
>> >> thread_cpu_time = adj_sched_time + now
>> >>
>> >> All computations are done in nanosecs on systems where TSC is stable. If
>> >> TSC is unstable, we fall back to a regular syscall.
>> >> Benchmark data:
>> >>
>> >> for (i = 0; i < 100000000; i++) {
>> >> clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
>> >> sum += ts.tv_sec * NSECS_PER_SEC + ts.tv_nsec;
>> >> }
>> >
>> > A bunch of the time spent processing a CLOCK_THREAD_CPUTIME_ID syscall
>> > is spent taking various locks, and I think it could be worth adding a
>> > fast path for the read-my-own-clock case in which we just disable
>> > preemption and read the thing without any locks.
>> >
>> > If we're actually going to go the vdso route, I'd like to make the
>> > scheduler hooks clean. Peterz and/or John, what's the right way to
>> > get an arch-specific callback with sum_exec_runtime and an up to date
>> > sched_clock value during a context switch? I'd much rather not add
>> > yet another rdtsc instruction to the scheduler.
>>
>> Bad news: this patch is incorrect, I think. Take a look at
>> update_rq_clock -- it does fancy things involving irq time and
>> paravirt steal time. So this patch could result in extremely
>> non-monotonic results.
>
> Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
> optional feature, and it was only added fairly recently. I thought it's
> acceptable that the time isn't precise, just like what we had before the
> feature was added.
>

Nonetheless, I think that the vdso accelerated functions should be
careful to remain interchangeable with the syscall equivalents. If
that means that some kconfig magic needs to be added to prevent this
code from being enabled when it won't work, then so be it. But it
might be better to use a different clock id entirely, and I don't
really understand the logic behind all the clock ids.

John?

--Andy

> Thanks,
> Shaohua



--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 00:35:07

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Thu, 18 Dec 2014, Shaohua Li wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> > Bad news: this patch is incorrect, I think. Take a look at
> > update_rq_clock -- it does fancy things involving irq time and
> > paravirt steal time. So this patch could result in extremely
> > non-monotonic results.
>
> Yes, it's not precise. But bear in mind, CONFIG_IRQ_TIME_ACCOUNTING is an
> optional feature, and it was only added fairly recently. I thought it's
> acceptable that the time isn't precise, just like what we had before the
> feature was added.

Wrong thought, really.

If the vdso can give you an observably inconsistent representation
vs. real syscalls then it does not matter at all how long ago the
feature was added. It's simply wrong no matter what.

Thanks,

tglx

2014-12-19 01:06:04

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v2 2/3] X86: add a generic API to let vdso code detect context switch

On Wed, 17 Dec 2014, Shaohua Li wrote:
> vdso code can't disable preemption, so it can be preempted at any time.
> This makes it challenging to implement certain features. This patch adds
> a generic API to let vdso code detect a context switch.

Please change this to:

This adds a complete trainwreck into the scheduler hotpath.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index b5797b7..d8e882d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2232,6 +2232,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
> struct rq *rq = this_rq();
> struct mm_struct *mm = rq->prev_mm;
> long prev_state;
> +#ifdef CONFIG_VDSO_CS_DETECT

Ever heard about header files and CONFIG-dependent inline functions?

> + int cpu = smp_processor_id();

If you think hard enough you might find an even more expensive way
to figure out which cpu you are running on.

> + vdso_set_cpu_cs_timestamp(cpu);

So we hand in the cpu we are running on to update per-cpu data in
non-preemptible context instead of handing in the really relevant data,
i.e. the timestamp, which is readily available right here for free. I
bet the completely misnamed function will go through loops and hoops
to figure that out.

Thanks,

tglx


2014-12-19 11:24:08

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> Bad news: this patch is incorrect, I think. Take a look at
> update_rq_clock -- it does fancy things involving irq time and
> paravirt steal time. So this patch could result in extremely
> non-monotonic results.

Yeah, I'm not sure how (and if) we could make all that work :/

2014-12-19 16:48:32

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <[email protected]> wrote:
> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> Bad news: this patch is incorrect, I think. Take a look at
>> update_rq_clock -- it does fancy things involving irq time and
>> paravirt steal time. So this patch could result in extremely
>> non-monotonic results.
>
> Yeah, I'm not sure how (and if) we could make all that work :/

I obviously can't comment on what Facebook needs, but if I were
rigging something up to profile my own code*, I'd want a count of
elapsed time, including user, system, and probably interrupt as well.
I would probably not want to count time during which I'm not
scheduled, and I would also probably not want to count steal time.
The latter makes any implementation kind of nasty.

The API presumably doesn't need to be any particular clock id for
clock_gettime, and it may not even need to be clock_gettime at all.

Is perf self-monitoring good enough for this? If not, can we make it
good enough?

* I do this today using CLOCK_MONOTONIC

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 17:03:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <[email protected]> wrote:
> > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> >> Bad news: this patch is incorrect, I think. Take a look at
> >> update_rq_clock -- it does fancy things involving irq time and
> >> paravirt steal time. So this patch could result in extremely
> >> non-monotonic results.
> >
> > Yeah, I'm not sure how (and if) we could make all that work :/
>
> I obviously can't comment on what Facebook needs, but if I were
> rigging something up to profile my own code*, I'd want a count of
> elapsed time, including user, system, and probably interrupt as well.
> I would probably not want to count time during which I'm not
> scheduled, and I would also probably not want to count steal time.
> The latter makes any implementation kind of nasty.
>
> The API presumably doesn't need to be any particular clock id for
> clock_gettime, and it may not even need to be clock_gettime at all.
>
> Is perf self-monitoring good enough for this? If not, can we make it
> good enough?

Yeah, I think you should be able to use that. You could count a NOP
event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
such purposes iirc.

The advantage of using perf self profiling is that it (obviously)
extends to more than just walltime.

2014-12-19 17:08:13

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 9:03 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 19, 2014 at 08:48:07AM -0800, Andy Lutomirski wrote:
>> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <[email protected]> wrote:
>> > On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>> >> Bad news: this patch is incorrect, I think. Take a look at
>> >> update_rq_clock -- it does fancy things involving irq time and
>> >> paravirt steal time. So this patch could result in extremely
>> >> non-monotonic results.
>> >
>> > Yeah, I'm not sure how (and if) we could make all that work :/
>>
>> I obviously can't comment on what Facebook needs, but if I were
>> rigging something up to profile my own code*, I'd want a count of
>> elapsed time, including user, system, and probably interrupt as well.
>> I would probably not want to count time during which I'm not
>> scheduled, and I would also probably not want to count steal time.
>> The latter makes any implementation kind of nasty.
>>
>> The API presumably doesn't need to be any particular clock id for
>> clock_gettime, and it may not even need to be clock_gettime at all.
>>
>> Is perf self-monitoring good enough for this? If not, can we make it
>> good enough?
>
> Yeah, I think you should be able to use that. You could count a NOP
> event and simply use its activated time. We have PERF_COUNT_SW_DUMMY for
> such purposes iirc.
>
> The advantage of using perf self profiling is that it (obviously)
> extends to more than just walltime.

Re-asking my old question: would it make sense to add a vdso helper
for the magic self-monitoring interface? Or, at the very least, we
could try to tidy up the docs a bit.

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 17:27:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 09:07:49AM -0800, Andy Lutomirski wrote:
> Re-asking my old question: would it make sense to add a vdso helper
> for the magic self-monitoring interface? Or, at the very least, we
> could try to tidy up the docs a bit.

I find it really helps (performance wise) to strip down that magic to
the bare minimum required.

A VDSO helper would always have to do everything. I suppose we could
provide a generic helper; one can always hand-code the stuff anyhow.

Then again, what is the benefit of having it in the VDSO as opposed to a
regular DSO?

Updating the docs is always good.

2014-12-19 17:42:59

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 9:27 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, Dec 19, 2014 at 09:07:49AM -0800, Andy Lutomirski wrote:
>> Re-asking my old question: would it make sense to add a vdso helper
>> for the magic self-monitoring interface? Or, at the very least, we
>> could try to tidy up the docs a bit.
>
> I find it really helps (performance wise) to strip down that magic to
> the bare minimum required.
>
> A VDSO helper would always have to do everything. I suppose we could
> provide a generic helper, one can always hand code the stuff anyhow.
>
> Then again, what is the benefit of having it in the VDSO as opposed to a
> regular DSO?

The benefit of using the VDSO is that we can change the data structure
whenever we want. The __vdso_clock_gettime data structure changes on a
somewhat regular basis.

Other than that, a regular DSO is somewhat easier.

>
> Updating the docs is always good.

:)

--Andy

--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 17:43:28

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO



On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <[email protected]>
wrote:
> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra
> <[email protected]> wrote:
>> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>>> Bad news: this patch is incorrect, I think. Take a look at
>>> update_rq_clock -- it does fancy things involving irq time and
>>> paravirt steal time. So this patch could result in extremely
>>> non-monotonic results.
>>
>> Yeah, I'm not sure how (and if) we could make all that work :/
>
> I obviously can't comment on what Facebook needs, but if I were
> rigging something up to profile my own code*, I'd want a count of
> elapsed time, including user, system, and probably interrupt as well.
> I would probably not want to count time during which I'm not
> scheduled, and I would also probably not want to count steal time.
> The latter makes any implementation kind of nasty.
>
> The API presumably doesn't need to be any particular clock id for
> clock_gettime, and it may not even need to be clock_gettime at all.
>
> Is perf self-monitoring good enough for this? If not, can we make it
> good enough?
>
> * I do this today using CLOCK_MONOTONIC

The clock_gettime calls are used for a wide variety of things, but
usually they are trying to instrument how much CPU the application is
using. So for example with the HHVM interpreter they have a ratio of
the number of hhvm instructions they were able to execute in N seconds
of cputime. This gets used to optimize the HHVM implementation and can
be used as a push blocking counter (code can't go in if it makes it
slower).
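
For what it's worth, the measurement pattern is roughly the following;
this is just a minimal sketch (not our actual instrumentation), with
do_work() standing in for executing interpreter instructions:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

/* Placeholder unit of work standing in for the real interpreter loop. */
static uint64_t do_work(void)
{
	static volatile uint64_t x;

	x++;
	return 1;
}

static uint64_t thread_cpu_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t units = 0, start, cpu_ns;
	int i;

	start = thread_cpu_ns();
	for (i = 0; i < 10000000; i++)
		units += do_work();
	cpu_ns = thread_cpu_ns() - start;

	/* Work per second of thread cpu time, independent of wall time. */
	printf("%.0f units/cpu-second\n", units * 1e9 / cpu_ns);
	return 0;
}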

Wall time isn't a great representation of this because it includes
factors that might be outside a given HHVM patch, but it sounds like
we're saying almost the same thing.

I'm not familiar with the perf self monitoring?

-chris



2014-12-19 17:53:48

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 9:42 AM, Chris Mason <[email protected]> wrote:
>
>
> On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <[email protected]>
> wrote:
>>
>> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <[email protected]>
>> wrote:
>>>
>>> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
>>>>
>>>> Bad news: this patch is incorrect, I think. Take a look at
>>>> update_rq_clock -- it does fancy things involving irq time and
>>>> paravirt steal time. So this patch could result in extremely
>>>> non-monotonic results.
>>>
>>>
>>> Yeah, I'm not sure how (and if) we could make all that work :/
>>
>>
>> I obviously can't comment on what Facebook needs, but if I were
>> rigging something up to profile my own code*, I'd want a count of
>> elapsed time, including user, system, and probably interrupt as well.
>> I would probably not want to count time during which I'm not
>> scheduled, and I would also probably not want to count steal time.
>> The latter makes any implementation kind of nasty.
>>
>> The API presumably doesn't need to be any particular clock id for
>> clock_gettime, and it may not even need to be clock_gettime at all.
>>
>> Is perf self-monitoring good enough for this? If not, can we make it
>> good enough?
>>
>> * I do this today using CLOCK_MONOTONIC
>
>
> The clock_gettime calls are used for a wide variety of things, but usually
> they are trying to instrument how much CPU the application is using. So for
> example with the HHVM interpreter they have a ratio of the number of hhvm
> instructions they were able to execute in N seconds of cputime. This gets
> used to optimize the HHVM implementation and can be used as a push blocking
> counter (code can't go in if it makes it slower).
>
> Wall time isn't a great representation of this because it includes factors
> that might be outside a given HHVM patch, but it sounds like we're saying
> almost the same thing.
>
> I'm not familiar with the perf self monitoring?

You can call perf_event_open and mmap the result. Then you can read
the docs^Wheader file.

On the good side, it's an explicit mmap, so all the nasty preemption
issues are entirely moot. And you can count cache misses and such if
you want to be fancy.

On the bad side, the docs are a bit weak, and the added context switch
overhead might be higher.
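
To make that concrete, here is a minimal self-monitoring sketch along
those lines. It uses read() on the event fd rather than the mmap page to
keep it short, and it relies on a per-task event's enabled/running times
only advancing while the task is scheduled in (my reading of the perf
code), which is what makes a dummy event usable as a rough thread cpu
clock. Treat it as an illustration, not a reference:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

struct read_fmt {
	uint64_t value;
	uint64_t time_enabled;
	uint64_t time_running;
};

int main(void)
{
	struct perf_event_attr attr;
	struct read_fmt rf;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;	/* the "NOP" event */
	attr.exclude_kernel = 1;		/* timing unaffected; avoids paranoid checks */
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;

	/* pid 0, cpu -1: count for the calling thread, on any cpu */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* ... run the code being measured ... */

	if (read(fd, &rf, sizeof(rf)) != sizeof(rf)) {
		perror("read");
		close(fd);
		return 1;
	}
	printf("approx on-cpu time: %llu ns\n",
	       (unsigned long long)rf.time_running);
	close(fd);
	return 0;
}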

--Andy

>
> -chris
>
>
>
>



--
Andy Lutomirski
AMA Capital Management, LLC

2014-12-19 18:17:00

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH v2 3/3] X86: Add a thread cpu time implementation to vDSO

On Fri, Dec 19, 2014 at 09:53:24AM -0800, Andy Lutomirski wrote:
> On Fri, Dec 19, 2014 at 9:42 AM, Chris Mason <[email protected]> wrote:
> >
> >
> > On Fri, Dec 19, 2014 at 11:48 AM, Andy Lutomirski <[email protected]>
> > wrote:
> >>
> >> On Fri, Dec 19, 2014 at 3:23 AM, Peter Zijlstra <[email protected]>
> >> wrote:
> >>>
> >>> On Thu, Dec 18, 2014 at 04:22:59PM -0800, Andy Lutomirski wrote:
> >>>>
> >>>> Bad news: this patch is incorrect, I think. Take a look at
> >>>> update_rq_clock -- it does fancy things involving irq time and
> >>>> paravirt steal time. So this patch could result in extremely
> >>>> non-monotonic results.
> >>>
> >>>
> >>> Yeah, I'm not sure how (and if) we could make all that work :/
> >>
> >>
> >> I obviously can't comment on what Facebook needs, but if I were
> >> rigging something up to profile my own code*, I'd want a count of
> >> elapsed time, including user, system, and probably interrupt as well.
> >> I would probably not want to count time during which I'm not
> >> scheduled, and I would also probably not want to count steal time.
> >> The latter makes any implementation kind of nasty.
> >>
> >> The API presumably doesn't need to be any particular clock id for
> >> clock_gettime, and it may not even need to be clock_gettime at all.
> >>
> >> Is perf self-monitoring good enough for this? If not, can we make it
> >> good enough?
> >>
> >> * I do this today using CLOCK_MONOTONIC
> >
> >
> > The clock_gettime calls are used for a wide variety of things, but usually
> > they are trying to instrument how much CPU the application is using. So for
> > example with the HHVM interpreter they have a ratio of the number of hhvm
> > instructions they were able to execute in N seconds of cputime. This gets
> > used to optimize the HHVM implementation and can be used as a push blocking
> > counter (code can't go in if it makes it slower).
> >
> > Wall time isn't a great representation of this because it includes factors
> > that might be outside a given HHVM patch, but it sounds like we're saying
> > almost the same thing.
> >
> > I'm not familiar with the perf self monitoring?
>
> You can call perf_event_open and mmap the result. Then you can read
> the docs^Wheader file.
>
> On the good side, it's an explicit mmap, so all the nasty preemption
> issues are entirely moot. And you can count cache misses and such if
> you want to be fancy.
>
> On the bad side, the docs are a bit weak, and the added context switch
> overhead might be higher.

I'll measure the overhead for sure. If the overhead isn't high, the perf
approach is very interesting. On the other hand, is it acceptable that
clock_gettime falls back to the slow path if irq time accounting is enabled
(its overhead is high; we don't actually enable it)?

Thanks,
Shaohua