This is a third attempt at switching the powerpc VDSO to the generic C implementation.
This version should work on PPC64 (untested). VDSO32 for PPC64 is
impossible to build and has been deactivated, because the powerpc
ASM header files for C are not prepared to build 32-bit code with CONFIG_PPC64.
powerpc is a bit special for the VDSO, as well as for system calls, in
that it requires setting the CR SO bit, which cannot be done in C.
Therefore, entry/exit and fallback need to be performed in ASM.
On a powerpc8xx, with current powerpc/32 ASM VDSO:
gettimeofday: vdso: 737 nsec/call
clock-getres-realtime: vdso: 475 nsec/call
clock-gettime-realtime: vdso: 892 nsec/call
The first patch adds VDSO generic C support without any changes to common code.
Performance is as follows:
gettimeofday: vdso: 1379 nsec/call
clock-getres-realtime-coarse: vdso: 984 nsec/call
clock-gettime-realtime-coarse: vdso: 868 nsec/call
clock-getres-realtime: vdso: 922 nsec/call
clock-gettime-realtime: vdso: 1511 nsec/call
clock-getres-monotonic-raw: vdso: 968 nsec/call
clock-gettime-monotonic-raw: vdso: 1576 nsec/call
Then a few changes in the common code have allowed performance improvement. At
the end of the series we have:
gettimeofday: vdso: 899 nsec/call
clock-getres-realtime-coarse: vdso: 546 nsec/call
clock-gettime-realtime-coarse: vdso: 615 nsec/call
clock-getres-realtime: vdso: 545 nsec/call
clock-gettime-realtime: vdso: 1064 nsec/call
clock-getres-monotonic-raw: vdso: 546 nsec/call
clock-gettime-monotonic-raw: vdso: 1125 nsec/call
Christophe Leroy (12):
powerpc/64: Don't provide time functions in compat VDSO32
powerpc/vdso: Switch VDSO to generic C implementation.
lib: vdso: mark __cvdso_clock_getres() as static
lib: vdso: inline do_hres() and do_coarse()
lib: vdso: Avoid duplication in __cvdso_clock_getres()
lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time
powerpc/vdso: simplify __get_datapage()
lib: vdso: allow arches to provide vdso data pointer
powerpc/vdso: provide inline alternative to __get_datapage()
powerpc/vdso: provide vdso data pointer from the ASM caller.
lib: vdso: split clock verification out of __arch_get_hw_counter()
powerpc/vdso: provide __arch_is_hw_counter_valid()
arch/powerpc/Kconfig | 2 +
arch/powerpc/include/asm/vdso/gettimeofday.h | 104 +++++++++++
arch/powerpc/include/asm/vdso/vsyscall.h | 25 +++
arch/powerpc/include/asm/vdso_datapage.h | 52 +++---
arch/powerpc/kernel/asm-offsets.c | 46 +----
arch/powerpc/kernel/time.c | 90 ----------
arch/powerpc/kernel/vdso.c | 58 ++----
arch/powerpc/kernel/vdso32/Makefile | 30 +++-
arch/powerpc/kernel/vdso32/datapage.S | 10 +-
arch/powerpc/kernel/vdso32/gettimeofday.S | 258 ++++-----------------------
arch/powerpc/kernel/vdso32/vdso32.lds.S | 9 +-
arch/powerpc/kernel/vdso32/vgettimeofday.c | 29 +++
arch/powerpc/kernel/vdso64/Makefile | 23 ++-
arch/powerpc/kernel/vdso64/datapage.S | 13 +-
arch/powerpc/kernel/vdso64/gettimeofday.S | 257 ++++----------------------
arch/powerpc/kernel/vdso64/vdso64.lds.S | 7 +-
arch/powerpc/kernel/vdso64/vgettimeofday.c | 29 +++
lib/vdso/gettimeofday.c | 130 +++++++++++---
18 files changed, 457 insertions(+), 715 deletions(-)
create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h
create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c
--
2.13.3
Using __iter_div_u64_rem() is suboptimal on 32 bits.
Nanoseconds are only 32 bits, and VDSO data is updated every 10ms
so nsec will never overflow 32 bits.
Add an equivalent of __iter_div_u64_rem() but based
on unsigned long to better fit with 32 bits arches.
Before:
gettimeofday: vdso: 1078 nsec/call
clock-gettime-monotonic-raw: vdso: 1317 nsec/call
clock-gettime-monotonic: vdso: 1255 nsec/call
After:
gettimeofday: vdso: 1032 nsec/call
clock-gettime-monotonic-raw: vdso: 1312 nsec/call
clock-gettime-monotonic: vdso: 1243 nsec/call
Signed-off-by: Christophe Leroy <[email protected]>
---
lib/vdso/gettimeofday.c | 26 +++++++++++++++++++++++---
1 file changed, 23 insertions(+), 3 deletions(-)
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#endif
+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder)
+{
+ u32 ret = 0;
+
+ while (dividend >= divisor) {
+ /* The following asm() prevents the compiler from
+ optimising this loop into a modulo operation. */
+ asm("" : "+rm"(dividend));
+
+ dividend -= divisor;
+ ret++;
+ }
+
+ *remainder = dividend;
+
+ return ret;
+}
+
static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
u64 cycles, last, sec, ns;
u32 seq;
+ unsigned long nsec;
do {
seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
return -1;
ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
- ns >>= vd->shift;
+ nsec = ns >> vd->shift;
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));
@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
* Do this outside the loop: a race inside the loop could result
* in __iter_div_u64_rem() being extremely slow.
*/
- ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
+ ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+ ts->tv_nsec = nsec;
return 0;
}
--
2.13.3
Christophe Leroy <[email protected]> writes:
> Using __iter_div_u64_rem() is suboptimal on 32 bits.
> Nanoseconds are only 32 bits, and VDSO data is updated every 10ms
> so nsec will never overflow 32 bits.
That's theory and perhaps true for bare metal, but there is no guarantee
on VIRT that the CPU which has the timekeeping duty assigned is not
scheduled out for longer than 4 seconds.
Thanks,
tglx