Hi tglx,
This little series applies on top of your patches and, at least on
my laptop, gets back all the performance you lost and then some :)
--Andy
Andy Lutomirski (3):
x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks
x86/vdso: Rearrange do_hres() to improve code generation
x86/vdso: Document vgtod_ts better
arch/x86/entry/vdso/vclock_gettime.c | 9 ++++-----
arch/x86/include/asm/vgtod.h | 9 +++++++++
2 files changed, 13 insertions(+), 5 deletions(-)
--
2.17.1
When a vDSO clock function falls back to the syscall, no special
barriers or ordering is needed, and the syscall fallbacks don't
clobber any memory that is not explicitly listed in the asm
constraints. Remove the "memory" clobber.
This causes minor changes to the generated code, but otherwise has
no obvious performance impact. I think it's nice to have, though,
since it may help the optimizer in the future.
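For illustration only (my sketch, not part of the patch): the same fallback can be exercised from user space, and the explicit "=m" (*ts) output is what tells the compiler which memory the syscall may write, which is exactly why the blanket "memory" clobber is redundant here. Assumes x86-64 and glibc headers.

#include <stdio.h>
#include <time.h>            /* struct timespec, CLOCK_MONOTONIC */
#include <sys/syscall.h>     /* __NR_clock_gettime */

static long clock_gettime_fallback(long clock, struct timespec *ts)
{
        long ret;

        /*
         * x86-64 syscall convention: nr in rax, args in rdi/rsi; the CPU
         * clobbers rcx and r11.  "=m" (*ts) already marks *ts as written
         * by the asm, so no "memory" clobber is needed for correctness.
         */
        asm ("syscall"
             : "=a" (ret), "=m" (*ts)
             : "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
             : "rcx", "r11");
        return ret;
}

int main(void)
{
        struct timespec ts;

        if (clock_gettime_fallback(CLOCK_MONOTONIC, &ts) == 0)
                printf("%ld.%09ld\n", ts.tv_sec, ts.tv_nsec);
        return 0;
}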
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index b7ccbff26a3f..18c8a78d1ec9 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -45,7 +45,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
long ret;
asm ("syscall" : "=a" (ret), "=m" (*ts) :
"0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
- "memory", "rcx", "r11");
+ "rcx", "r11");
return ret;
}
@@ -62,7 +62,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
"mov %%edx, %%ebx \n"
: "=a" (ret), "=m" (*ts)
: "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
- : "memory", "edx");
+ : "edx");
return ret;
}
--
2.17.1
vgetcyc() is full of barriers, so fetching values out of the vvar
page before vgetcyc() for use after vgetcyc() results in poor code
generation. Put vgetcyc() first to avoid this problem.
Also, pull the ts->tv_sec write out of the loop and put all the ts
writes together. The old code wrote ts->tv_sec on each iteration
before the syscall fallback check and then added in the offset
afterwards, which forced the compiler to pointlessly copy base->sec
to ts->tv_sec on each iteration. The new version seems to generate
sensible code.
Saves several cycles. With this patch applied, the result is faster
than before the clock_gettime() rewrite.
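Pieced together from the two hunks below plus their unchanged context, the
reordered loop reads roughly as follows (a reconstruction for reference, not
a quote of the full file; the ns += (cycles - last) * gtod->mult line sits
between the hunks):

        do {
                seq = gtod_read_begin(gtod);
                cycles = vgetcyc(gtod->vclock_mode);
                ns = base->nsec;
                last = gtod->cycle_last;
                if (unlikely((s64)cycles < 0))
                        return vdso_fallback_gettime(clk, ts);
                if (cycles > last)
                        ns += (cycles - last) * gtod->mult;
                ns >>= gtod->shift;
        } while (unlikely(gtod_read_retry(gtod, seq)));

        ts->tv_sec = base->sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
        ts->tv_nsec = ns;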
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 18c8a78d1ec9..419de7552c2f 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -147,10 +147,9 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
do {
seq = gtod_read_begin(gtod);
- ts->tv_sec = base->sec;
+ cycles = vgetcyc(gtod->vclock_mode);
ns = base->nsec;
last = gtod->cycle_last;
- cycles = vgetcyc(gtod->vclock_mode);
if (unlikely((s64)cycles < 0))
return vdso_fallback_gettime(clk, ts);
if (cycles > last)
@@ -158,7 +157,7 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_sec = base->sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return 0;
--
2.17.1
After reading do_hres() and do_coarse() and scratching my head a
bit, I figured out why the arithmetic is strange. Document it.
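Concretely, the update side stores the shifted value so the reader can fold
in the TSC delta before doing a single shift. A condensed sketch (the exact
timekeeper field names on the writer side are from memory and should be
treated as assumptions):

        /* writer side, update_vsyscall(), for a high-resolution clock: */
        base->sec  = tk->xtime_sec;
        base->nsec = tk->tkr_mono.xtime_nsec;   /* nanoseconds << shift */

        /* reader side, do_hres(): */
        ns = base->nsec;
        if (cycles > gtod->cycle_last)
                ns += (cycles - gtod->cycle_last) * gtod->mult;
        ns >>= gtod->shift;                     /* plain nanoseconds again */
        ts->tv_sec  = base->sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
        ts->tv_nsec = ns;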
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/vgtod.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index d17b092b9f1b..69d05c6d47f5 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -13,6 +13,15 @@ typedef u64 gtod_long_t;
typedef unsigned long gtod_long_t;
#endif
+/*
+ * There is one of these objects in the vvar page for each
+ * vDSO-accelerated clockid. For high-resolution clocks, this encodes
+ * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse
+ * clocks, this encodes the actual time.
+ *
+ * To confuse the reader, for high-resolution clocks, nsec is left-shifted
+ * by vsyscall_gtod_data.shift.
+ */
struct vgtod_ts {
u64 sec;
u64 nsec;
--
2.17.1
On Thu, 4 Oct 2018, Andy Lutomirski wrote:
> index 18c8a78d1ec9..419de7552c2f 100644
> --- a/arch/x86/entry/vdso/vclock_gettime.c
> +++ b/arch/x86/entry/vdso/vclock_gettime.c
> @@ -147,10 +147,9 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
>
> do {
> seq = gtod_read_begin(gtod);
> - ts->tv_sec = base->sec;
> + cycles = vgetcyc(gtod->vclock_mode);
> ns = base->nsec;
> last = gtod->cycle_last;
> - cycles = vgetcyc(gtod->vclock_mode);
> if (unlikely((s64)cycles < 0))
> return vdso_fallback_gettime(clk, ts);
> if (cycles > last)
> @@ -158,7 +157,7 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
> ns >>= gtod->shift;
> } while (unlikely(gtod_read_retry(gtod, seq)));
>
> - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
> + ts->tv_sec = base->sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
You cannot access base->sec outside of the seqcount protected region. It
might have been incremented by now and you'll get a time jump by a full
second.
Thanks,
tglx
Commit-ID: 89fe0a1f1c694a3b0b3cfa8c0952d603753f36df
Gitweb: https://git.kernel.org/tip/89fe0a1f1c694a3b0b3cfa8c0952d603753f36df
Author: Andy Lutomirski <[email protected]>
AuthorDate: Thu, 4 Oct 2018 14:44:43 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 5 Oct 2018 10:12:18 +0200
x86/vdso: Remove "memory" clobbers in the vDSO syscall fallbacks
When a vDSO clock function falls back to the syscall, no special
barriers or ordering is needed, and the syscall fallbacks don't
clobber any memory that is not explicitly listed in the asm
constraints. Remove the "memory" clobber.
This causes minor changes to the generated code, but otherwise has
no obvious performance impact. I think it's nice to have, though,
since it may help the optimizer in the future.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/3a7438f5fb2422ed881683d2ccffd7f987b2dc44.1538689401.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/vdso/vclock_gettime.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index b7ccbff26a3f..18c8a78d1ec9 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -45,7 +45,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
long ret;
asm ("syscall" : "=a" (ret), "=m" (*ts) :
"0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
- "memory", "rcx", "r11");
+ "rcx", "r11");
return ret;
}
@@ -62,7 +62,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
"mov %%edx, %%ebx \n"
: "=a" (ret), "=m" (*ts)
: "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
- : "memory", "edx");
+ : "edx");
return ret;
}
Commit-ID: bcc4a62a73cb65327d7268fbfa3a786d603f52dc
Gitweb: https://git.kernel.org/tip/bcc4a62a73cb65327d7268fbfa3a786d603f52dc
Author: Andy Lutomirski <[email protected]>
AuthorDate: Thu, 4 Oct 2018 14:44:45 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Fri, 5 Oct 2018 10:12:18 +0200
x86/vdso: Document vgtod_ts better
After reading do_hres() and do_coarse() and scratching my head a
bit, I figured out why the arithmetic is strange. Document it.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/f66f53d81150bbad47d7b282c9207a71a3ce1c16.1538689401.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/vgtod.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index d17b092b9f1b..69d05c6d47f5 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -13,6 +13,15 @@ typedef u64 gtod_long_t;
typedef unsigned long gtod_long_t;
#endif
+/*
+ * There is one of these objects in the vvar page for each
+ * vDSO-accelerated clockid. For high-resolution clocks, this encodes
+ * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse
+ * clocks, this encodes the actual time.
+ *
+ * To confuse the reader, for high-resolution clocks, nsec is left-shifted
+ * by vsyscall_gtod_data.shift.
+ */
struct vgtod_ts {
u64 sec;
u64 nsec;
> On Oct 4, 2018, at 11:00 PM, Thomas Gleixner <[email protected]> wrote:
>
>> On Thu, 4 Oct 2018, Andy Lutomirski wrote:
>> index 18c8a78d1ec9..419de7552c2f 100644
>> --- a/arch/x86/entry/vdso/vclock_gettime.c
>> +++ b/arch/x86/entry/vdso/vclock_gettime.c
>> @@ -147,10 +147,9 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
>>
>> do {
>> seq = gtod_read_begin(gtod);
>> - ts->tv_sec = base->sec;
>> + cycles = vgetcyc(gtod->vclock_mode);
>> ns = base->nsec;
>> last = gtod->cycle_last;
>> - cycles = vgetcyc(gtod->vclock_mode);
>> if (unlikely((s64)cycles < 0))
>> return vdso_fallback_gettime(clk, ts);
>> if (cycles > last)
>> @@ -158,7 +157,7 @@ notrace static int do_hres(clockid_t clk, struct timespec *ts)
>> ns >>= gtod->shift;
>> } while (unlikely(gtod_read_retry(gtod, seq)));
>>
>> - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
>> + ts->tv_sec = base->sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
>
> You cannot access base->sec outside of the seqcount protected region. It
> might have been incremented by now and you'll get a time jump by a full
> second.
Duh. Let me try this again.
>
> Thanks,
>
> tglx
>
>
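For completeness, keeping the reordering while addressing the review comment
means snapshotting base->sec while the seqcount still covers it, e.g. into a
new local u64 sec. An illustrative sketch of what such a respin could look
like (not necessarily the v2 that was actually sent):

        do {
                seq = gtod_read_begin(gtod);
                cycles = vgetcyc(gtod->vclock_mode);
                ns = base->nsec;
                last = gtod->cycle_last;
                if (unlikely((s64)cycles < 0))
                        return vdso_fallback_gettime(clk, ts);
                if (cycles > last)
                        ns += (cycles - last) * gtod->mult;
                ns >>= gtod->shift;
                sec = base->sec;        /* read inside the protected region */
        } while (unlikely(gtod_read_retry(gtod, seq)));

        ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
        ts->tv_nsec = ns;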