From: Alexander Sverdlin <[email protected]>
Add CLOCK_MONOTONIC_RAW to the existing clock_gettime() vDSO
implementation. This is based on the ideas of Jason Vas Dias and comments
of Thomas Gleixner.
---- Test code ----
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#define CLOCK_TYPE CLOCK_MONOTONIC_RAW
#define DURATION_SEC 10
int main(int argc, char **argv)
{
        struct timespec t, end;
        unsigned long long cnt = 0;

        clock_gettime(CLOCK_TYPE, &end);
        end.tv_sec += DURATION_SEC;

        do {
                clock_gettime(CLOCK_TYPE, &t);
                ++cnt;
        } while (t.tv_sec < end.tv_sec || t.tv_nsec < end.tv_nsec);

        dprintf(STDOUT_FILENO, "%llu", cnt);

        return EXIT_SUCCESS;
}
-------------------
The results from the above test program:
Clock                   Before      After  Diff
-----                   ------      -----  ----
CLOCK_MONOTONIC      355531720  338039373   -5%
CLOCK_MONOTONIC_RAW   44831738  336064912  +650%
CLOCK_REALTIME       347665133  338102907   -3%
Link: https://lore.kernel.org/patchwork/patch/933583/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198961
Cc: Thomas Gleixner <[email protected]>
Cc: Jason Vas Dias <[email protected]>
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 11 +++++++++--
arch/x86/entry/vsyscall/vsyscall_gtod.c | 6 ++++++
arch/x86/include/asm/vgtod.h | 5 ++++-
3 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 98c7d12..7c9db26 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -144,6 +144,13 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
struct vgtod_ts *base = &gtod->basetime[clk];
u64 cycles, last, sec, ns;
unsigned int seq;
+ u32 mult = gtod->mult;
+ u32 shift = gtod->shift;
+
+ if (clk == CLOCK_MONOTONIC_RAW) {
+ mult = gtod->raw_mult;
+ shift = gtod->raw_shift;
+ }
do {
seq = gtod_read_begin(gtod);
@@ -153,8 +160,8 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
if (unlikely((s64)cycles < 0))
return vdso_fallback_gettime(clk, ts);
if (cycles > last)
- ns += (cycles - last) * gtod->mult;
- ns >>= gtod->shift;
+ ns += (cycles - last) * mult;
+ ns >>= shift;
sec = base->sec;
} while (unlikely(gtod_read_retry(gtod, seq)));
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index cfcdba0..a967f02 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -46,11 +46,17 @@ void update_vsyscall(struct timekeeper *tk)
vdata->mask = tk->tkr_mono.mask;
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
+ vdata->raw_mult = tk->tkr_raw.mult;
+ vdata->raw_shift = tk->tkr_raw.shift;
base = &vdata->basetime[CLOCK_REALTIME];
base->sec = tk->xtime_sec;
base->nsec = tk->tkr_mono.xtime_nsec;
+ base = &vdata->basetime[CLOCK_MONOTONIC_RAW];
+ base->sec = tk->raw_sec;
+ base->nsec = tk->tkr_raw.xtime_nsec;
+
base = &vdata->basetime[CLOCK_TAI];
base->sec = tk->xtime_sec + (s64)tk->tai_offset;
base->nsec = tk->tkr_mono.xtime_nsec;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 913a133..f65c42b 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -28,7 +28,8 @@ struct vgtod_ts {
};
#define VGTOD_BASES (CLOCK_TAI + 1)
-#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI))
+#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | \
+ BIT(CLOCK_MONOTONIC_RAW) | BIT(CLOCK_TAI))
#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE))
/*
@@ -43,6 +44,8 @@ struct vsyscall_gtod_data {
u64 mask;
u32 mult;
u32 shift;
+ u32 raw_mult;
+ u32 raw_shift;
struct vgtod_ts basetime[VGTOD_BASES];
--
2.4.6
Alexander,
On Tue, 28 May 2019, Sverdlin, Alexander (Nokia - DE/Ulm) wrote:
> Add CLOCK_MONOTONIC_RAW to the existing clock_gettime() vDSO
> implementation. This is based on the ideas of Jason Vas Dias and comments
> of Thomas Gleixner.
Well to some extent, but
> The results from the above test program:
>
> Clock                   Before      After  Diff
> -----                   ------      -----  ----
> CLOCK_MONOTONIC      355531720  338039373   -5%
> CLOCK_MONOTONIC_RAW   44831738  336064912  +650%
> CLOCK_REALTIME       347665133  338102907   -3%
the performance regressions for CLOCK_MONOTONIC and CLOCK_REALTIME are just
not acceptable.
> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
> index 98c7d12..7c9db26 100644
> --- a/arch/x86/entry/vdso/vclock_gettime.c
> +++ b/arch/x86/entry/vdso/vclock_gettime.c
> @@ -144,6 +144,13 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
> struct vgtod_ts *base = &gtod->basetime[clk];
> u64 cycles, last, sec, ns;
> unsigned int seq;
> + u32 mult = gtod->mult;
> + u32 shift = gtod->shift;
> +
> + if (clk == CLOCK_MONOTONIC_RAW) {
> + mult = gtod->raw_mult;
> + shift = gtod->raw_shift;
> + }
They stem from this extra conditional.
>
> do {
> seq = gtod_read_begin(gtod);
> @@ -153,8 +160,8 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
> if (unlikely((s64)cycles < 0))
> return vdso_fallback_gettime(clk, ts);
> if (cycles > last)
> - ns += (cycles - last) * gtod->mult;
> - ns >>= gtod->shift;
> + ns += (cycles - last) * mult;
And this is completely broken because you read the mult/shift values
outside of the sequence count protected region, so a concurrent update of
the timekeeping values will lead to inconsistent state.
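A minimal sketch of a seqcount-safe variant, keeping the v1 field names:
every per-iteration read, including mult/shift, has to sit between
gtod_read_begin() and gtod_read_retry(), so that a concurrent
update_vsyscall() forces a retry instead of letting the reader mix old
and new scaling factors:

        u32 mult, shift;

        do {
                seq = gtod_read_begin(gtod);
                cycles = vgetcyc(gtod->vclock_mode);
                ns = base->nsec;
                last = gtod->cycle_last;
                /* Read the scaling factors inside the protected region. */
                mult = (clk == CLOCK_MONOTONIC_RAW) ? gtod->raw_mult : gtod->mult;
                shift = (clk == CLOCK_MONOTONIC_RAW) ? gtod->raw_shift : gtod->shift;
                if (unlikely((s64)cycles < 0))
                        return vdso_fallback_gettime(clk, ts);
                if (cycles > last)
                        ns += (cycles - last) * mult;
                ns >>= shift;
                sec = base->sec;
        } while (unlikely(gtod_read_retry(gtod, seq)));

Note that this only repairs the consistency problem; the conditional
responsible for the CLOCK_MONOTONIC/CLOCK_REALTIME regression is still in
the hot loop, which is why v2 below duplicates the function instead.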
Thanks,
tglx
From: Alexander Sverdlin <[email protected]>
Add CLOCK_MONOTONIC_RAW to the existing clock_gettime() vDSO
implementation. This is based on the ideas of Jason Vas Dias and comments
of Thomas Gleixner.
---- Test code ----
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#define CLOCK_TYPE CLOCK_MONOTONIC_RAW
#define DURATION_SEC 10
int main(int argc, char **argv)
{
        struct timespec t, end;
        unsigned long long cnt = 0;

        clock_gettime(CLOCK_TYPE, &end);
        end.tv_sec += DURATION_SEC;

        do {
                clock_gettime(CLOCK_TYPE, &t);
                ++cnt;
        } while (t.tv_sec < end.tv_sec || t.tv_nsec < end.tv_nsec);

        dprintf(STDOUT_FILENO, "%llu", cnt);

        return EXIT_SUCCESS;
}
-------------------
The results from the above test program:
Clock                Before   After   Diff
-----                ------   -----   ----
CLOCK_MONOTONIC      355.5M   355.5M
CLOCK_MONOTONIC_RAW   44.9M   371.2M  +726%
CLOCK_REALTIME       355.5M   355.5M
Link: https://lore.kernel.org/patchwork/patch/933583/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198961
Cc: Thomas Gleixner <[email protected]>
Cc: Jason Vas Dias <[email protected]>
Signed-off-by: Alexander Sverdlin <[email protected]>
---
Changelog:
v2: copy do_hres() into do_monotonic_raw()
arch/x86/entry/vdso/vclock_gettime.c | 35 +++++++++++++++++++++++++++++++++
arch/x86/entry/vsyscall/vsyscall_gtod.c | 6 ++++++
arch/x86/include/asm/vgtod.h | 2 ++
3 files changed, 43 insertions(+)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 0f82a70..64736a4 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -168,6 +168,39 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
return 0;
}
+/*
+ * Attempts to merge the below copy with the above routine led to 5% performance
+ * drop (CLOCK_MONOTONIC, CLOCK_REALTIME) up to now. Test before making changes.
+ */
+notrace static int do_monotonic_raw(struct timespec *ts)
+{
+ struct vgtod_ts *base = &gtod->basetime[CLOCK_MONOTONIC_RAW];
+ u64 cycles, last, sec, ns;
+ unsigned int seq;
+
+ do {
+ seq = gtod_read_begin(gtod);
+ cycles = vgetcyc(gtod->vclock_mode);
+ ns = base->nsec;
+ last = gtod->cycle_last;
+ if (unlikely((s64)cycles < 0))
+ return vdso_fallback_gettime(CLOCK_MONOTONIC_RAW, ts);
+ if (cycles > last)
+ ns += (cycles - last) * gtod->raw_mult;
+ ns >>= gtod->raw_shift;
+ sec = base->sec;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return 0;
+}
+
notrace static void do_coarse(clockid_t clk, struct timespec *ts)
{
struct vgtod_ts *base = &gtod->basetime[clk];
@@ -199,6 +232,8 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
do_coarse(clock, ts);
return 0;
}
+ if (clock == CLOCK_MONOTONIC_RAW)
+ return do_monotonic_raw(ts);
return vdso_fallback_gettime(clock, ts);
}
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index cfcdba0..9f7744f3 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -46,6 +46,8 @@ void update_vsyscall(struct timekeeper *tk)
vdata->mask = tk->tkr_mono.mask;
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
+ vdata->raw_mult = tk->tkr_raw.mult;
+ vdata->raw_shift = tk->tkr_raw.shift;
base = &vdata->basetime[CLOCK_REALTIME];
base->sec = tk->xtime_sec;
@@ -65,6 +67,10 @@ void update_vsyscall(struct timekeeper *tk)
}
base->nsec = nsec;
+ base = &vdata->basetime[CLOCK_MONOTONIC_RAW];
+ base->sec = tk->raw_sec;
+ base->nsec = tk->tkr_raw.xtime_nsec;
+
base = &vdata->basetime[CLOCK_REALTIME_COARSE];
base->sec = tk->xtime_sec;
base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 913a133..65ac320 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -43,6 +43,8 @@ struct vsyscall_gtod_data {
u64 mask;
u32 mult;
u32 shift;
+ u32 raw_mult;
+ u32 raw_shift;
struct vgtod_ts basetime[VGTOD_BASES];
--
2.4.6
On Tue, Jun 4, 2019 at 9:42 AM Sverdlin, Alexander (Nokia - DE/Ulm)
<[email protected]> wrote:
>
> From: Alexander Sverdlin <[email protected]>
>
> Add CLOCK_MONOTONIC_RAW to the existing clock_gettime() vDSO
> implementation. This is based on the ideas of Jason Vas Dias and comments
> of Thomas Gleixner.
>
Can you benchmark this against pulling the mult and shift into the
per-clock base?
--Andy
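Andy's suggestion amounts to moving the scaling factors out of the shared
header of the structure and into each per-clock entry, roughly (a sketch;
the v3 series below implements exactly this):

        struct vgtod_ts {
                u64     sec;
                u64     nsec;
                u32     mult;   /* per-clock copy: tkr_mono or tkr_raw */
                u32     shift;
        };

Each basetime[clk] entry then carries its own mult/shift, so do_hres() no
longer needs a clock-dependent branch to pick them.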
From: Alexander Sverdlin <[email protected]>
A trivial vDSO implementation saves a syscall and brings a 700% performance
boost to the clock_gettime(CLOCK_MONOTONIC_RAW, ...) call.
Changelog:
v3: Move mult and shift into struct vgtod_ts
v2: copy do_hres() into do_monotonic_raw()
Alexander Sverdlin (2):
x86/vdso: Move mult and shift into struct vgtod_ts
x86/vdso: implement clock_gettime(CLOCK_MONOTONIC_RAW, ...)
arch/x86/entry/vdso/vclock_gettime.c | 4 ++--
arch/x86/entry/vsyscall/vsyscall_gtod.c | 14 ++++++++++++--
arch/x86/include/asm/vgtod.h | 7 ++++---
3 files changed, 18 insertions(+), 7 deletions(-)
--
2.4.6
From: Alexander Sverdlin <[email protected]>
Add CLOCK_MONOTONIC_RAW to the existing clock_gettime() vDSO
implementation. This is based on the ideas of Jason Vas Dias and comments
of Thomas Gleixner.
Test program from the previous patch shows the following improvement:
Clock                Before   After   Diff
-----                ------   -----   ----
CLOCK_MONOTONIC_RAW   44.9M   359.6M  +700%
Link: https://lore.kernel.org/patchwork/patch/933583/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=198961
Cc: Thomas Gleixner <[email protected]>
Cc: Jason Vas Dias <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/x86/entry/vsyscall/vsyscall_gtod.c | 6 ++++++
arch/x86/include/asm/vgtod.h | 3 ++-
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 64b1e7c..402c3b3 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -69,6 +69,12 @@ void update_vsyscall(struct timekeeper *tk)
base->mult = tk->tkr_mono.mult;
base->shift = tk->tkr_mono.shift;
+ base = &vdata->basetime[CLOCK_MONOTONIC_RAW];
+ base->sec = tk->raw_sec;
+ base->nsec = tk->tkr_raw.xtime_nsec;
+ base->mult = tk->tkr_raw.mult;
+ base->shift = tk->tkr_raw.shift;
+
base = &vdata->basetime[CLOCK_REALTIME_COARSE];
base->sec = tk->xtime_sec;
base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index b1f6df3..1fb7048 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -30,7 +30,8 @@ struct vgtod_ts {
};
#define VGTOD_BASES (CLOCK_TAI + 1)
-#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI))
+#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | \
+ BIT(CLOCK_MONOTONIC_RAW) | BIT(CLOCK_TAI))
#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE))
/*
--
2.4.6
From: Alexander Sverdlin <[email protected]>
This is a preparation for the CLOCK_MONOTONIC_RAW vDSO implementation.
Coincidentally, it also brings a slight performance improvement:
---- Test code ----
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#define CLOCK_TYPE CLOCK_MONOTONIC_RAW
#define DURATION_SEC 10
int main(int argc, char **argv)
{
        struct timespec t, end;
        unsigned long long cnt = 0;

        clock_gettime(CLOCK_TYPE, &end);
        end.tv_sec += DURATION_SEC;

        do {
                clock_gettime(CLOCK_TYPE, &t);
                ++cnt;
        } while (t.tv_sec < end.tv_sec || t.tv_nsec < end.tv_nsec);

        dprintf(STDOUT_FILENO, "%llu", cnt);

        return EXIT_SUCCESS;
}
-------------------
The results from the above test program:
Clock            Before   After   Diff
-----            ------   -----   ----
CLOCK_MONOTONIC  355.5M   359.6M  +1%
CLOCK_REALTIME   355.5M   359.6M  +1%
Cc: Thomas Gleixner <[email protected]>
Cc: Jason Vas Dias <[email protected]>
Suggested-by: Andy Lutomirski <[email protected]>
Signed-off-by: Alexander Sverdlin <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 4 ++--
arch/x86/entry/vsyscall/vsyscall_gtod.c | 8 ++++++--
arch/x86/include/asm/vgtod.h | 4 ++--
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 0f82a70..a99eaf5 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -153,8 +153,8 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
if (unlikely((s64)cycles < 0))
return vdso_fallback_gettime(clk, ts);
if (cycles > last)
- ns += (cycles - last) * gtod->mult;
- ns >>= gtod->shift;
+ ns += (cycles - last) * base->mult;
+ ns >>= base->shift;
sec = base->sec;
} while (unlikely(gtod_read_retry(gtod, seq)));
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index cfcdba0..64b1e7c 100644
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -44,16 +44,18 @@ void update_vsyscall(struct timekeeper *tk)
vdata->vclock_mode = vclock_mode;
vdata->cycle_last = tk->tkr_mono.cycle_last;
vdata->mask = tk->tkr_mono.mask;
- vdata->mult = tk->tkr_mono.mult;
- vdata->shift = tk->tkr_mono.shift;
base = &vdata->basetime[CLOCK_REALTIME];
base->sec = tk->xtime_sec;
base->nsec = tk->tkr_mono.xtime_nsec;
+ base->mult = tk->tkr_mono.mult;
+ base->shift = tk->tkr_mono.shift;
base = &vdata->basetime[CLOCK_TAI];
base->sec = tk->xtime_sec + (s64)tk->tai_offset;
base->nsec = tk->tkr_mono.xtime_nsec;
+ base->mult = tk->tkr_mono.mult;
+ base->shift = tk->tkr_mono.shift;
base = &vdata->basetime[CLOCK_MONOTONIC];
base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
@@ -64,6 +66,8 @@ void update_vsyscall(struct timekeeper *tk)
base->sec++;
}
base->nsec = nsec;
+ base->mult = tk->tkr_mono.mult;
+ base->shift = tk->tkr_mono.shift;
base = &vdata->basetime[CLOCK_REALTIME_COARSE];
base->sec = tk->xtime_sec;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 913a133..b1f6df3 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -25,6 +25,8 @@ typedef unsigned long gtod_long_t;
struct vgtod_ts {
u64 sec;
u64 nsec;
+ u32 mult;
+ u32 shift;
};
#define VGTOD_BASES (CLOCK_TAI + 1)
@@ -41,8 +43,6 @@ struct vsyscall_gtod_data {
int vclock_mode;
u64 cycle_last;
u64 mask;
- u32 mult;
- u32 shift;
struct vgtod_ts basetime[VGTOD_BASES];
--
2.4.6
Alexander,
On Wed, 5 Jun 2019, Sverdlin, Alexander (Nokia - DE/Ulm) wrote:
> From: Alexander Sverdlin <[email protected]>
>
> This is a preparation for the CLOCK_MONOTONIC_RAW vDSO implementation.
> Coincidentally, it also brings a slight performance improvement:
>
> ---- Test code ----
> #include <errno.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <time.h>
> #include <unistd.h>
>
> #define CLOCK_TYPE CLOCK_MONOTONIC_RAW
> #define DURATION_SEC 10
>
> int main(int argc, char **argv)
> {
>         struct timespec t, end;
>         unsigned long long cnt = 0;
>
>         clock_gettime(CLOCK_TYPE, &end);
>         end.tv_sec += DURATION_SEC;
>
>         do {
>                 clock_gettime(CLOCK_TYPE, &t);
>                 ++cnt;
>         } while (t.tv_sec < end.tv_sec || t.tv_nsec < end.tv_nsec);
>
>         dprintf(STDOUT_FILENO, "%llu", cnt);
>
>         return EXIT_SUCCESS;
> }
> -------------------
>
> The results from the above test program:
>
> Clock            Before   After   Diff
> -----            ------   -----   ----
> CLOCK_MONOTONIC  355.5M   359.6M  +1%
> CLOCK_REALTIME   355.5M   359.6M  +1%
Been there done that. But the results of the micro benchmark above have to
be taken with a grain of salt.
Let's look at the cache side effects of this change:
Before:
struct vsyscall_gtod_data {
unsigned int seq; /* 0 4 */
int vclock_mode; /* 4 4 */
u64 cycle_last; /* 8 8 */
u64 mask; /* 16 8 */
u32 mult; /* 24 4 */
u32 shift; /* 28 4 */
struct vgtod_ts basetime[12]; /* 32 192 */
/* --- cacheline 3 boundary (192 bytes) was 32 bytes ago --- */
int tz_minuteswest; /* 224 4 */
int tz_dsttime; /* 228 4 */
/* size: 232, cachelines: 4, members: 9 */
/* last cacheline: 40 bytes */
};
After:
struct vsyscall_gtod_data {
unsigned int seq; /* 0 4 */
int vclock_mode; /* 4 4 */
u64 cycle_last; /* 8 8 */
u64 mask; /* 16 8 */
struct vgtod_ts basetime[12]; /* 24 288 */
/* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */
int tz_minuteswest; /* 312 4 */
int tz_dsttime; /* 316 4 */
/* size: 320, cachelines: 5, members: 7 */
};
So the interesting change is here:
struct vgtod_ts basetime[12]; /* 32 192 */
vs.
struct vgtod_ts basetime[12]; /* 24 288 */
In the original version struct vgtod_ts occupies 16 bytes in the modified
version it has 24 bytes. As a consequence:
CLOCK                            Before    After
data->basetime[CLOCK_REALTIME]   32 .. 47  24 .. 47
data->basetime[CLOCK_MONOTONIC]  48 .. 63  48 .. 71
That means that the CLOCK_MONOTONIC storage now overlaps into a second
cache line. CLOCK_MONOTONIC is widely used.
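Spelled out as a compilable illustration (struct members reduced to their
stdint equivalents; the offsets match the pahole output above):

        #include <stdint.h>

        struct vgtod_ts_old { uint64_t sec; uint64_t nsec; };  /* 16 bytes */
        struct vgtod_ts_new { uint64_t sec; uint64_t nsec;
                              uint32_t mult; uint32_t shift; };        /* 24 bytes */

        _Static_assert(sizeof(struct vgtod_ts_old) == 16, "old entry");
        _Static_assert(sizeof(struct vgtod_ts_new) == 24, "new entry");

        /*
         * Old layout: the array starts at offset 32, so CLOCK_MONOTONIC
         * (clockid 1) occupies bytes 32 + 16 * 1 = 48 .. 63 and still fits
         * into the first 64-byte cache line. New layout: the array starts
         * at offset 24, the entry occupies 24 + 24 * 1 = 48 .. 71 and
         * straddles the cache-line boundary at byte 64.
         */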
Of course on a micro benchmark this does not matter at all, but in real
world use cases which care about cache pressure it does.
Another thing is that just adding some small computation to the benchmark
loop makes the advantage of the modified version go away and it becomes
slightly slower.
On a real-world application which is cache heavy and uses CLOCK_MONOTONIC
extensively I can observe a significant L1 miss increase (2%) with the
modified version.
Now there is something else which is interesting:
On a guest (pvclock) I can observe with your benchmark:
Before:
MONO cnt: 576.7
REAL cnt: 576.6
After:
MONO cnt: 577.1
REAL cnt: 577.0
But on a (different) host:
Before:
MONO cnt: 353.9
REAL cnt: 354.4
After:
MONO cnt: 347.9
REAL cnt: 347.1
Yuck!
So this is really sensitive and I'm inclined not to do that due to the
cache line side effect.
The alternative solution for this is what Vincenzo has in his unified VDSO
patch series:
https://lkml.kernel.org/r/[email protected]
It leaves the data struct unmodified and has a separate array for the raw
clock. That does not have the side effects at all.
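The trick in that series is to keep a complete instance of the data
structure per mode instead of growing the per-clock entries, roughly (a
simplified sketch of the generic datapage; field details may differ in the
final tree):

        /* include/vdso/datapage.h, simplified */
        struct vdso_timestamp {
                u64     sec;
                u64     nsec;
        };

        struct vdso_data {
                u32                     seq;
                s32                     clock_mode;
                u64                     cycle_last;
                u64                     mask;
                u32                     mult;
                u32                     shift;
                struct vdso_timestamp   basetime[VDSO_BASES];
                /* ... */
        };

        /*
         * Two instances: element 0 serves the hres and coarse clocks,
         * element 1 (CS_RAW) carries CLOCK_MONOTONIC_RAW with its own
         * mult/shift. The hot element keeps its original size and the
         * per-clock entries stay 16 bytes each.
         */
        struct vdso_data _vdso_data[CS_BASES];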
I'm in the process of merging that series and I actually adapted your
scheme to the new unified infrastructure where it has exactly the same
effects as with your original patches against the x86 version.
Thanks,
tglx
Hi Thomas,
On 23/06/2019 12:18, Thomas Gleixner wrote:
> The alternative solution for this is what Vincenzo has in his unified VDSO
> patch series:
>
> https://lkml.kernel.org/r/[email protected]
>
> It leaves the data struct unmodified and has a separate array for the raw
> clock. That does not have the side effects at all.
>
> I'm in the process of merging that series and I actually adapted your
> scheme to the new unified infrastructure where it has exactly the same
> effects as with your original patches against the x86 version.
please let me know if I need to rework [2/2] based on some not-yet-published
branch of yours.
--
Best regards,
Alexander Sverdlin.
Alexander,
On Mon, 24 Jun 2019, Sverdlin, Alexander (Nokia - DE/Ulm) wrote:
> On 23/06/2019 12:18, Thomas Gleixner wrote:
> > The alternative solution for this is what Vincenzo has in his unified VDSO
> > patch series:
> >
> > https://lkml.kernel.org/r/[email protected]
> >
> > It leaves the data struct unmodified and has a separate array for the raw
> > clock. That does not have the side effects at all.
> >
> > I'm in the process of merging that series and I actually adapted your
> > scheme to the new unified infrastructure where it has exactly the same
> > effects as with your original patches against the x86 version.
>
> please let me know if I need to rework [2/2] based on some not-yet-published
> branch of yours.
I've pushed it out now to
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/vdso
The generic VDSO library has the support for RAW already with that separate
array. Testing would be welcomed!
Feel free to experiment with moving mult/shift again. Maybe you find a way
to mitigate the effects and make it all magically faster. :)
Thanks,
tglx
Hello Thomas!
On 24/06/2019 11:40, Thomas Gleixner wrote:
>>> The alternative solution for this is what Vincenzo has in his unified VDSO
>>> patch series:
>>>
>>> https://lkml.kernel.org/r/[email protected]
>>>
>>> It leaves the data struct unmodified and has a separate array for the raw
>>> clock. That does not have the side effects at all.
>>>
>>> I'm in the process of merging that series and I actually adapted your
>>> scheme to the new unified infrastructure where it has exactly the same
>>> effects as with your original patches against the x86 version.
>> please let me know if I need to rework [2/2] based on some not-yet-published
>> branch of yours.
> I've pushed it out now to
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/vdso
>
> The generic VDSO library has the support for RAW already with that separate
> array. Testing would be welcomed!
Thanks for your and Vincenzo's efforts!
I've applied the series onto 5.2.0-rc6 and did a quick test on a bare x86_64 and
for me it looks good:
Number of clock_gettime() calls in 10 seconds:
               Before     After      Diff
MONOTONIC      152404300  200825950  +32%
MONOTONIC_RAW   38804788  198765053  +412%
REALTIME       151672619  201371468  +33%
--
Best regards,
Alexander Sverdlin.
Alexander,
On Thu, 27 Jun 2019, Sverdlin, Alexander (Nokia - DE/Ulm) wrote:
> On 24/06/2019 11:40, Thomas Gleixner wrote:
> >>> The alternative solution for this is what Vincenzo has in his unified VDSO
> >>> patch series:
> >>>
> >>> https://lkml.kernel.org/r/[email protected]
> >>>
> >>> It leaves the data struct unmodified and has a separate array for the raw
> >>> clock. That does not have the side effects at all.
> >>>
> >>> I'm in the process of merging that series and I actually adapted your
> >>> scheme to the new unified infrastructure where it has exactly the same
> >>> effects as with your original patches against the x86 version.
> >> please let me know if I need to rework [2/2] based on some not-yet-published
> >> branch of yours.
> > I've pushed it out now to
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/vdso
> >
> > The generic VDSO library has the support for RAW already with that separate
> > array. Testing would be welcomed!
>
> Thanks for your and Vincenzo's efforts!
> I've applied the series onto 5.2.0-rc6 and did a quick test on a bare x86_64 and
> for me it looks good:
Did you use the git tree? If not, it would be interesting to have a test
against that as well because that's the final version.
> Number of clock_gettime() calls in 10 seconds:
>
>                Before     After      Diff
> MONOTONIC      152404300  200825950  +32%
> MONOTONIC_RAW   38804788  198765053  +412%
> REALTIME       151672619  201371468  +33%
The increase for mono and realtime is impressive. Which CPU is that?
Thanks,
tglx
Hi!
On 27/06/2019 14:07, Thomas Gleixner wrote:
>>>>> I'm in the process of merging that series and I actually adapted your
>>>>> scheme to the new unified infrastructure where it has exactly the same
>>>>> effects as with your original patches against the x86 version.
>>>> please let me know if I need to rework [2/2] based on some not-yet-published
>>>> branch of yours.
>>> I've pushed it out now to
>>>
>>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/vdso
>>>
>>> The generic VDSO library has the support for RAW already with that separate
>>> array. Testing would be welcomed!
>>
>> Thanks for your and Vincenzo's efforts!
>> I've applied the series onto 5.2.0-rc6 and did a quick test on a bare x86_64 and
>> for me it looks good:
>
> Did you use the git tree? If not, it would be interesting to have a test
> against that as well because that's the final version.
I've applied the following list:
32e29396f00e hrtimer: Split out hrtimer defines into separate header
361f8aee9b09 vdso: Define standardized vdso_datapage
00b26474c2f1 lib/vdso: Provide generic VDSO implementation
629fdf77ac45 lib/vdso: Add compat support
44f57d788e7d timekeeping: Provide a generic update_vsyscall() implementation
28b1a824a4f4 arm64: vdso: Substitute gettimeofday() with C implementation
98cd3c3f83fb arm64: vdso: Build vDSO with -ffixed-x18
53c489e1dfeb arm64: compat: Add missing syscall numbers
206c0dfa3c55 arm64: compat: Expose signal related structures
f14d8025d263 arm64: compat: Generate asm offsets for signals
a7f71a2c8903 arm64: compat: Add vDSO
c7aa2d71020d arm64: vdso: Refactor vDSO code
7c1deeeb0130 arm64: compat: VDSO setup for compat layer
1e3f17f55aec arm64: elf: VDSO code page discovery
f01703b3d2e6 arm64: compat: Get sigreturn trampolines from vDSO
bfe801ebe84f arm64: vdso: Enable vDSO compat support
7ac870747988 x86/vdso: Switch to generic vDSO implementation
f66501dc53e7 x86/vdso: Add clock_getres() entry point
22ca962288c0 (tip/WIP.vdso) x86/vdso: Add clock_gettime64() entry point
ecf9db3d1f1a x86/vdso: Give the [ph]vclock_page declarations real types
ed75e8f60bb1 vdso: Remove superfluous #ifdef __KERNEL__ in vdso/datapage.h
94fee4d43752 arm64: vdso: Remove unnecessary asm-offsets.c definitions
6a5b78b32d10 arm64: compat: No need for pre-ARMv7 barriers on an ARMv8 system
e70980312a94 MAINTAINERS: Add entry for the generic VDSO library
9d90b93bf325 lib/vdso: Make delta calculation work correctly
27e11a9fe2e2 arm64: Fix __arch_get_hw_counter() implementation
6241c4dc6ec5 arm64: compat: Fix __arch_get_hw_counter() implementation
3acf4be23528 (tip/timers/vdso) arm64: vdso: Fix compilation with clang older than 8
If you expect a difference, I can re-test using your tree as-is.
>> Number of clock_gettime() calls in 10 seconds:
>>
>>                Before     After      Diff
>> MONOTONIC      152404300  200825950  +32%
>> MONOTONIC_RAW   38804788  198765053  +412%
>> REALTIME       151672619  201371468  +33%
>
> The increase for mono and realtime is impressive. Which CPU is that?
This time it was
processor : 3
vendor_id : AuthenticAMD
cpu family : 21
model : 96
model name : AMD PRO A10-8700B R6, 10 Compute Cores 4C+6G
stepping : 1
microcode : 0x600611a
cpu MHz : 2622.775
cache size : 1024 KB
physical id : 0
siblings : 4
core id : 3
cpu cores : 2
apicid : 19
initial apicid : 3
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good acc_power nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs xop skinit wdt lwp fma4 tce nodeid_msr tbm topoext perfctr_core perfctr_nb bpext ptsc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 xsaveopt arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic vgif overflow_recov
bugs : fxsave_leak sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass
bogomips : 3594.00
TLB size : 1536 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 48 bits physical, 48 bits virtual
power management: ts ttp tm 100mhzsteps hwpstate cpb eff_freq_ro acc_power [13]
(it's different from the one I've used for my patches)
--
Best regards,
Alexander Sverdlin.
Alexander,
On Thu, 27 Jun 2019, Sverdlin, Alexander (Nokia - DE/Ulm) wrote:
> On 27/06/2019 14:07, Thomas Gleixner wrote:
> >
> > Did you use the git tree? If not, it would be interesting to have a test
> > against that as well because that's the final version.
>
> I've applied the following list:
>
> 3acf4be23528 (tip/timers/vdso) arm64: vdso: Fix compilation with clang older than 8
> If you expect a difference, I can re-test using your tree as-is.
That's the tip of tree. All good.
> > The increase for mono and realtime is impressive. Which CPU is that?
>
> This time it was
>
> processor : 3
> vendor_id : AuthenticAMD
> cpu family : 21
> model : 96
> model name : AMD PRO A10-8700B R6, 10 Compute Cores 4C+6G
Interesting. Need to find something similar in my fleet and figure out what
changed that so much.
Thanks!
tglx