2009-07-17 23:39:53

by john stultz

[permalink] [raw]
Subject: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

Hey all,

After talking with some application writers who want very fast, but not
fine-grained, timestamps, I decided to try to implement new clock_ids
for clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE,
which return the time at the last tick. This is very fast as we don't
have to access any hardware (which can be very painful if you're using
something like the acpi_pm clocksource), and we can even use the vdso
clock_gettime() method to avoid the syscall. The only trade off is you
only get low-res tick grained time resolution.

This isn't a new idea, I know Ingo pushed a patch (see commit
5899a0f044f3c80e9f7262ec5bc7164773a4c28e) a little while ago that made
the vsyscall gettimeofday() return coarse grained time when the
vsyscall64 sysctl was set to 2. However, this affects all applications
on a system.

With this method, applications can choose the proper speed/granularity
trade-off for themselves.

This is a first pass on this implementation, and while I did test it,
the box I tested it with did not have a glibc new enough to utilize the
vdso clock_gettime(), so there may still be issues there. I'll find a
newer box for testing shortly.

Any thoughts or feedback will be appreciated!

thanks
-john

Signed-off-by: John Stultz <[email protected]>
---
arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++---
include/linux/time.h | 3 +++
kernel/posix-timers.c | 33 +++++++++++++++++++++++++++++++++
kernel/time/timekeeping.c | 19 +++++++++++++++++++
4 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90..e949cc7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -83,14 +83,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
return 0;
}

+notrace static noinline int do_realtime_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ ts->tv_sec = gtod->wall_time_sec;
+ ts->tv_nsec = gtod->wall_time_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ return 0;
+}
+
+notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+{
+ unsigned long seq, ns, secs;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ secs = gtod->wall_time_sec;
+ ns = gtod->wall_time_nsec;
+ secs += gtod->wall_to_monotonic.tv_sec;
+ ns += gtod->wall_to_monotonic.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ vset_normalized_timespec(ts, secs, ns);
+ return 0;
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+ if (likely(gtod->sysctl_enabled))
switch (clock) {
case CLOCK_REALTIME:
- return do_realtime(ts);
+ if (likely(gtod->clock.vread))
+ return do_realtime(ts);
+ break;
case CLOCK_MONOTONIC:
- return do_monotonic(ts);
+ if (likely(gtod->clock.vread))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
}
return vdso_fallback_gettime(clock, ts);
}
diff --git a/include/linux/time.h b/include/linux/time.h
index ce321ac..9db8421 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -108,6 +108,7 @@ void timekeeping_init(void);

unsigned long get_seconds(void);
struct timespec current_kernel_time(void);
+struct timespec get_monotonic_coarse(void);

#define CURRENT_TIME (current_kernel_time())
#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 })
@@ -225,6 +226,8 @@ struct itimerval {
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6

/*
* The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 752b036..6821ea1 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -236,6 +236,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}

+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+ *tp = current_kernel_time();
+ return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+ struct timespec *tp)
+{
+ *tp = get_monotonic_coarse();
+ return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+ *tp = ktime_to_timespec(KTIME_LOW_RES);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -255,10 +274,24 @@ static __init int init_posix_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
};
+ struct k_clock clock_realtime_coarse = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_realtime_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };
+ struct k_clock clock_monotonic_coarse = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };

register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);

posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88..cfe1f32 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -602,3 +602,22 @@ struct timespec current_kernel_time(void)
return now;
}
EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec get_monotonic_coarse(void)
+{
+ struct timespec now, mono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = xtime_cache;
+ mono = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ now.tv_nsec + mono.tv_nsec);
+ return now;
+}
+EXPORT_SYMBOL(get_monotonic_coarse);
+


2009-07-18 08:30:42

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Fri, 17 Jul 2009, john stultz wrote:
> +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
> +{
> + *tp = current_kernel_time();
> + return 0;
> +}
> +
> +static int posix_get_monotonic_coarse(clockid_t which_clock,
> + struct timespec *tp)
> +{
> + *tp = get_monotonic_coarse();
> + return 0;
> +}
> +
> +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
> +{
> + *tp = ktime_to_timespec(KTIME_LOW_RES);
> + return 0;
> +}
> /*
> * Initialize everything, well, just everything in Posix clocks/timers ;)
> */
> @@ -255,10 +274,24 @@ static __init int init_posix_timers(void)
> .clock_set = do_posix_clock_nosettime,
> .timer_create = no_timer_create,
> };
> + struct k_clock clock_realtime_coarse = {
> + .clock_getres = hrtimer_get_res,

shouldn't that be posix_get_coarse_res ?

> + .clock_get = posix_get_realtime_coarse,
> + .clock_set = do_posix_clock_nosettime,
> + .timer_create = no_timer_create,
> + };
> + struct k_clock clock_monotonic_coarse = {
> + .clock_getres = hrtimer_get_res,

ditto

> + .clock_get = posix_get_monotonic_coarse,
> + .clock_set = do_posix_clock_nosettime,
> + .timer_create = no_timer_create,
> + };

Looks good otherwise.

tglx

2009-07-18 12:10:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE


* john stultz <[email protected]> wrote:

> Hey all,
>
> After talking with some application writers who want very
> fast, but not fine-grained timestamps, I decided to try to
> implement a new clock_ids to clock_gettime():
> CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE which returns the
> time at the last tick. This is very fast as we don't have to
> access any hardware (which can be very painful if you're using
> something like the acpi_pm clocksource), and we can even use the
> vdso clock_gettime() method to avoid the syscall. The only trade
> off is you only get low-res tick grained time resolution.
>
> This isn't a new idea, I know Ingo pushed a patch (see commit
> 5899a0f044f3c80e9f7262ec5bc7164773a4c28e) a little while ago that
> made the vsyscall gettimeofday() return coarse grained time when
> the vsyscall64 sysctrl was set to 2. However this affects all
> applications on a system.

Note, that patch is an -rt commit, right? I.e. not yet upstream.

> With this method, applications can choose the proper
> speed/granularity trade-off for themselves.
>
> This is a first pass on this implementation, and while I did test
> it, the box I tested it with did not have a glibc new enough to
> utilize the vdso clock_gettime(), so there may still be issues
> there. I'll find a newer box for testing shortly.
>
> Any thoughts or feedback will be appreciated!

Looks good. I think we should offer both methods: your patch as an
unconditional 'coarse time' approximator always available
everywhere, plus the vsyscall redirector as well from -rt, to allow
admins/users to tweak in a global way on apps that cannot be
changed.

Ingo

2009-07-18 22:09:44

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 2009-07-18 at 10:30 +0200, Thomas Gleixner wrote:
> On Fri, 17 Jul 2009, john stultz wrote:
> > +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
> > +{
> > + *tp = current_kernel_time();
> > + return 0;
> > +}
> > +
> > +static int posix_get_monotonic_coarse(clockid_t which_clock,
> > + struct timespec *tp)
> > +{
> > + *tp = get_monotonic_coarse();
> > + return 0;
> > +}
> > +
> > +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
> > +{
> > + *tp = ktime_to_timespec(KTIME_LOW_RES);
> > + return 0;
> > +}
> > /*
> > * Initialize everything, well, just everything in Posix clocks/timers ;)
> > */
> > @@ -255,10 +274,24 @@ static __init int init_posix_timers(void)
> > .clock_set = do_posix_clock_nosettime,
> > .timer_create = no_timer_create,
> > };
> > + struct k_clock clock_realtime_coarse = {
> > + .clock_getres = hrtimer_get_res,
>
> shouldn't that be posix_get_coarse_res ?

Doh! Thanks for pointing that out.

Fixed patch below.

-john


After talking with some application writers who want very fast, but not
fine-grained timestamps, I decided to try to implement a new clock_ids
to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE
which returns the time at the last tick. This is very fast as we don't
have to access any hardware (which can be very painful if you're using
something like the acpi_pm clocksource), and we can even use the vdso
clock_gettime() method to avoid the syscall. The only trade off is you
only get low-res tick grained time resolution.

This isn't a new idea, I know Ingo pushed a patch (see commit
5899a0f044f3c80e9f7262ec5bc7164773a4c28e) a little while ago that made
the vsyscall gettimeofday() return coarse grained time when the
vsyscall64 sysctrl was set to 2. However this affects all applications
on a system.

With this method, applications can choose the proper speed/granularity
trade-off for themselves.

This is a first pass on this implementation, and while I did test it,
the box I tested it with did not have a glibc new enough to utilize the
vdso clock_gettime(), so there may still be issues there. I'll find a
newer box for testing shortly.

Any thoughts or feedback will be appreciated!

thanks
-john

Signed-off-by: John Stultz <[email protected]>
---
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90..e949cc7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -83,14 +83,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
return 0;
}

+notrace static noinline int do_realtime_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ ts->tv_sec = gtod->wall_time_sec;
+ ts->tv_nsec = gtod->wall_time_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ return 0;
+}
+
+notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+{
+ unsigned long seq, ns, secs;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ secs = gtod->wall_time_sec;
+ ns = gtod->wall_time_nsec;
+ secs += gtod->wall_to_monotonic.tv_sec;
+ ns += gtod->wall_to_monotonic.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ vset_normalized_timespec(ts, secs, ns);
+ return 0;
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+ if (likely(gtod->sysctl_enabled))
switch (clock) {
case CLOCK_REALTIME:
- return do_realtime(ts);
+ if (likely(gtod->clock.vread))
+ return do_realtime(ts);
+ break;
case CLOCK_MONOTONIC:
- return do_monotonic(ts);
+ if (likely(gtod->clock.vread))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
}
return vdso_fallback_gettime(clock, ts);
}
diff --git a/include/linux/time.h b/include/linux/time.h
index ce321ac..9db8421 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -108,6 +108,7 @@ void timekeeping_init(void);

unsigned long get_seconds(void);
struct timespec current_kernel_time(void);
+struct timespec get_monotonic_coarse(void);

#define CURRENT_TIME (current_kernel_time())
#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 })
@@ -225,6 +226,8 @@ struct itimerval {
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6

/*
* The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 752b036..3a60c50 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -236,6 +236,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}

+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+ *tp = current_kernel_time();
+ return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+ struct timespec *tp)
+{
+ *tp = get_monotonic_coarse();
+ return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+ *tp = ktime_to_timespec(KTIME_LOW_RES);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -255,10 +274,24 @@ static __init int init_posix_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
};
+ struct k_clock clock_realtime_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };
+ struct k_clock clock_monotonic_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };

register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);

posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88..cfe1f32 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -602,3 +602,22 @@ struct timespec current_kernel_time(void)
return now;
}
EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec get_monotonic_coarse(void)
+{
+ struct timespec now, mono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = xtime_cache;
+ mono = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ now.tv_nsec + mono.tv_nsec);
+ return now;
+}
+EXPORT_SYMBOL(get_monotonic_coarse);
+

2009-07-18 22:20:30

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 2009-07-18 at 14:09 +0200, Ingo Molnar wrote:
> * john stultz <[email protected]> wrote:
>
> > Hey all,
> >
> > After talking with some application writers who want very
> > fast, but not fine-grained timestamps, I decided to try to
> > implement a new clock_ids to clock_gettime():
> > CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE which returns the
> > time at the last tick. This is very fast as we don't have to
> > access any hardware (which can be very painful if you're using
> > something like the acpi_pm clocksource), and we can even use the
> > vdso clock_gettime() method to avoid the syscall. The only trade
> > off is you only get low-res tick grained time resolution.
> >
> > This isn't a new idea, I know Ingo pushed a patch (see commit
> > 5899a0f044f3c80e9f7262ec5bc7164773a4c28e) a little while ago that
> > made the vsyscall gettimeofday() return coarse grained time when
> > the vsyscall64 sysctrl was set to 2. However this affects all
> > applications on a system.
>
> Note, that patch is an -rt commit, right? I.e. not yet upstream.

Oh yikes.. Somehow I cloned off of the wrong tree. Yes, you're right, I
was looking at the -rt git tree, your patch has not gone upstream yet.

My patch does still apply to mainline with offsets.


> > With this method, applications can choose the proper
> > speed/granularity trade-off for themselves.
> >
> > This is a first pass on this implementation, and while I did test
> > it, the box I tested it with did not have a glibc new enough to
> > utilize the vdso clock_gettime(), so there may still be issues
> > there. I'll find a newer box for testing shortly.
> >
> > Any thoughts or feedback will be appreciated!
>
> Looks good. I think we should offer both methods: your patch as an
> unconditional 'coarse time' approximator always available
> everywhere, plus the vsyscall redirector as well from -rt, to allow
> admins/users to tweak in a global way on apps that cannot be
> changed.

I was thinking for users of apps that cannot be changed, an LD_PRELOAD
redirector that changed the clock_id to a _COARSE might be a little
cleaner than wholesale forcing all of user-land to be low-res. But I'm
not terribly opinionated on it.

-john

2009-07-18 22:28:33

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 18 Jul 2009 15:09:38 -0700
john stultz <[email protected]> wrote:

> After talking with some application writers who want very fast, but
> not fine-grained timestamps, I decided to try to implement a new
> clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and
> CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This
> is very fast as we don't have to access any hardware (which can be
> very painful if you're using something like the acpi_pm clocksource),
> and we can even use the vdso clock_gettime() method to avoid the
> syscall. The only trade off is you only get low-res tick grained time
> resolution.

Does this tie us to having a tick? I still have hope that we can get
rid of the tick even when apps are running .... since with CFS we don't
really need the tick for the scheduler anymore for example....




> + } while (read_seqretry(&xtime_lock, seq));
> +
> + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
> + now.tv_nsec + mono.tv_nsec);
> + return now;
> +}
> +EXPORT_SYMBOL(get_monotonic_coarse);
> +

why does this need to be exported ?

--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-07-19 03:08:29

by Chris Snook

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Fri, Jul 17, 2009 at 7:39 PM, john stultz<[email protected]> wrote:
> -       if (likely(gtod->sysctl_enabled && gtod->clock.vread))
> +       if (likely(gtod->sysctl_enabled))

This irks me. If the sysctl is enabled and the codepath is getting
used often enough that we care about performance, branch prediction
should do the right thing without compiler hints. On the other hand,
if the sysctl is disabled, and the compiler is telling the cpu to
ignore its branch predictor, it'll hurt. I don't think we should be
wrapping (un)likely annotations around configuration options, unless
we're biasing against debug conditions where we definitely don't care
about performance. The patch is certainly no worse than the existing
code, but while we have the hood up, it might be nice to remove the
annotation, unless we're sure that it does no harm, and does some
good.

-- Chris

2009-07-19 06:12:56

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 18 Jul 2009 23:00:55 -0400
Chris Snook <[email protected]> wrote:

> On Fri, Jul 17, 2009 at 7:39 PM, john stultz<[email protected]>
> wrote:
> > -       if (likely(gtod->sysctl_enabled && gtod->clock.vread))
> > +       if (likely(gtod->sysctl_enabled))
>
> This irks me. If the sysctl is enabled and the codepath is getting
> used often enough that we care about performance, branch prediction
> should do the right thing without compiler hints. On the other hand,
> if the sysctl is disabled, and the compiler is telling the cpu to
> ignore its branch predictor, it'll hurt. I don't think we should be
> wrapping (un)likely annotations around configuration options, unless
> we're biasing against debug conditions where we definitely don't care
> about performance. The patch is certainly no worse than the existing
> code, but while we have the hood up, it might be nice to remove the
> annotation, unless we're sure that it does no harm, and does some
> good.

it's on x86.. likely/unlikely don't impact the CPU (since there are no
"ignore the branch predictor" hints), only the code placement.....

(and that's probably a good thing; CPU branch predictors are pretty
good, I'd not be surprised if they're at least as good as the
programmers who think how they code is used)


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-07-19 06:48:07

by Nicholas Miell

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 2009-07-18 at 23:14 -0700, Arjan van de Ven wrote:
> it's on x86.. likely/unlikely don't impact the CPU (since there are no
> "ignore the branch predictor" hints), only the code placement.....

But code placement does affect branch prediction (e.g. the Pentium 4
static predictor assumes backward branches are taken, recent AMD
processors prefer you to arrange code such that conditional branches are
not taken, etc.)

--
Nicholas Miell <[email protected]>

2009-07-20 11:16:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 2009-07-18 at 15:30 -0700, Arjan van de Ven wrote:
> On Sat, 18 Jul 2009 15:09:38 -0700
> john stultz <[email protected]> wrote:
>
> > After talking with some application writers who want very fast, but
> > not fine-grained timestamps, I decided to try to implement a new
> > clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and
> > CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This
> > is very fast as we don't have to access any hardware (which can be
> > very painful if you're using something like the acpi_pm clocksource),
> > and we can even use the vdso clock_gettime() method to avoid the
> > syscall. The only trade off is you only get low-res tick grained time
> > resolution.
>
> Does this tie us to having a tick? I still have hope that we can get
> rid of the tick even when apps are running .... since with CFS we don't
> really need the tick for the scheduler anymore for example....

On the hardware side to make this happen we'd need a platform that has:

- cheap, high-res, cross-cpu synced, clocksource
- cheap, high-res, clockevents

Maybe power64, sparc64 and s390x qualify, but certainly nothing on x86
does.

Furthermore, on the software side we'd need a few modifications, such as
doing lazy accounting for things like u/s-time which currently rely on
the tick and moving the load-balancing into a hrtimer.

Also, even with the above done, we'd probably want to tinker with the
clockevent/hrtimer code and possibly use a second per-cpu hardware timer
for the scheduler, since doing the whole hrtimer rb-tree dance for every
context switch is simply way too expensive.

But even with all that managed, there's still other bits that rely on the
tick -- RCU being one of the more interesting ones.

2009-07-20 12:25:31

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Mon, Jul 20, 2009 at 01:17:02PM +0200, Peter Zijlstra wrote:
> On Sat, 2009-07-18 at 15:30 -0700, Arjan van de Ven wrote:
> > On Sat, 18 Jul 2009 15:09:38 -0700
> > john stultz <[email protected]> wrote:
> >
> > > After talking with some application writers who want very fast, but
> > > not fine-grained timestamps, I decided to try to implement a new
> > > clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and
> > > CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This
> > > is very fast as we don't have to access any hardware (which can be
> > > very painful if you're using something like the acpi_pm clocksource),
> > > and we can even use the vdso clock_gettime() method to avoid the
> > > syscall. The only trade off is you only get low-res tick grained time
> > > resolution.
> >
> > Does this tie us to having a tick? I still have hope that we can get
> > rid of the tick even when apps are running .... since with CFS we don't
> > really need the tick for the scheduler anymore for example....
>
> On the hardware side to make this happen we'd need a platform that has:
>
> - cheap, high-res, cross-cpu synced, clocksource
> - cheap, high-res, clockevents
>
> Maybe power64, sparc64 and s390x qualify, but certainly nothing on x86
> does.
>
> Furthermore, on the software side we'd need a few modifications, such as
> doing lazy accounting for things like u/s-time which currently rely on
> the tick and moving the load-balancing into a hrtimer.
>
> Also, even with the above done, we'd probably want to tinker with the
> clockevent/hrtimer code and possibly use a second per-cpu hardware timer
> for the scheduler, since doing the whole hrtimer rb-tree dance for every
> context switch is simply way too expensive.
>
> But even with all that manged, there's still other bits that rely on the
> tick -- RCU being one of the more interesting ones.

One alternative to the tick is to inform RCU of each transition to/from
userspace, so that RCU would treat user-mode execution as it currently
does dyntick-idle state. If there is -never- to be any scheduling-clock
interrupts, then RCU would need to also know about transitions to/from
the idle loop -- which happens automatically if CONFIG_NO_HZ, of course.

But I expect that there would be some additional excitement elsewhere...
And given the large number of transitions to/from userspace, getting all
of them noted in the RCU case might be non-trivial as well.

Thanx, Paul

2009-07-20 13:31:29

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Mon, 20 Jul 2009 13:17:02 +0200
Peter Zijlstra <[email protected]> wrote:

> On Sat, 2009-07-18 at 15:30 -0700, Arjan van de Ven wrote:
> > On Sat, 18 Jul 2009 15:09:38 -0700
> > john stultz <[email protected]> wrote:
> >
> > > After talking with some application writers who want very fast,
> > > but not fine-grained timestamps, I decided to try to implement a
> > > new clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and
> > > CLOCK_MONOTONIC_COARSE which returns the time at the last tick.
> > > This is very fast as we don't have to access any hardware (which
> > > can be very painful if you're using something like the acpi_pm
> > > clocksource), and we can even use the vdso clock_gettime() method
> > > to avoid the syscall. The only trade off is you only get low-res
> > > tick grained time resolution.
> >
> > Does this tie us to having a tick? I still have hope that we can get
> > rid of the tick even when apps are running .... since with CFS we
> > don't really need the tick for the scheduler anymore for example....
>
> On the hardware side to make this happen we'd need a platform that
> has:
>
> - cheap, high-res, cross-cpu synced, clocksource
> - cheap, high-res, clockevents
>
> Maybe power64, sparc64 and s390x qualify, but certainly nothing on x86
> does.

the x86 on my desk disagrees.

> Furthermore, on the software side we'd need a few modifications, such
> as doing lazy accounting for things like u/s-time which currently
> rely on the tick and moving the load-balancing into a hrtimer.

I thought the load balancer no longer runs as a timer.. but I could
well be wrong.

> Also, even with the above done, we'd probably want to tinker with the
> clockevent/hrtimer code and possibly use a second per-cpu hardware
> timer for the scheduler, since doing the whole hrtimer rb-tree dance
> for every context switch is simply way too expensive.
>
> But even with all that manged, there's still other bits that rely on
> the tick -- RCU being one of the more interesting ones.

we need to at least keep our options open to go there, since even the
early measurements (iirc from Andrea 5 years ago) of the 1 kHz timer show
that it has a real performance impact, as much as 1%. While we may not
need to switch over RIGHT NOW, adding more dependencies on this timer
is just not a good idea...


--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2009-07-20 13:48:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Mon, 2009-07-20 at 06:33 -0700, Arjan van de Ven wrote:
> > Maybe power64, sparc64 and s390x qualify, but certainly nothing on x86
> > does.
>
> the x86 on my desk disagrees.

From what I know even nehalem doesn't have fully synced tscs when your
machine is large enough, and the timers are still a tad expensive.

Maybe your desktop is next-gen? 't would be nice to finally have an x86
that has usable clock and timer hardware.

I'm sure tglx would be pleasantly surprised :-)

> > Furthermore, on the software side we'd need a few modifications, such
> > as doing lazy accounting for things like u/s-time which currently
> > rely on the tick and moving the load-balancing into a hrtimer.
>
> I thought the load balancer no longer runs as a timer.. but I could
> well be wrong.

It doesn't but it does need wakeup kicks, which are currently done from
the tick.

And I'm not at all disagreeing that we want the tick gone, I'm just
pointing out there's some challenges ahead still ;-)

2009-07-21 22:31:24

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Sat, 2009-07-18 at 15:30 -0700, Arjan van de Ven wrote:
> On Sat, 18 Jul 2009 15:09:38 -0700
> john stultz <[email protected]> wrote:
>
> > After talking with some application writers who want very fast, but
> > not fine-grained timestamps, I decided to try to implement a new
> > clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and
> > CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This
> > is very fast as we don't have to access any hardware (which can be
> > very painful if you're using something like the acpi_pm clocksource),
> > and we can even use the vdso clock_gettime() method to avoid the
> > syscall. The only trade off is you only get low-res tick grained time
> > resolution.
>
> Does this tie us to having a tick? I still have hope that we can get
> rid of the tick even when apps are running .... since with CFS we don't
> really need the tick for the scheduler anymore for example....

So it does require some sort of periodic interval. But the granularity
is probably flexible, although I'm not sure it would be of much use if
the granularity gets to be lower than 100hz.

While being 100% tickless, even when non-idle would be nice, there will
be some need for timekeeping events to prevent clocksource counters from
wrapping, and to do accurate NTP steering.

Even so, the value we're exporting in this patch is the xtime_cache,
which is updated every tick. This is currently used in file
timestamping, so if we go 100% tickless, we'll have to change the file
timestamping to use the actual CLOCK_REALTIME clock_id, which requires a
possibly slow hardware read and would likely hurt fs performance.

So this patch doesn't so much tie us to having a tick or periodic event
any more than the fs timestamping does.


> > + } while (read_seqretry(&xtime_lock, seq));
> > +
> > + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
> > + now.tv_nsec + mono.tv_nsec);
> > + return now;
> > +}
> > +EXPORT_SYMBOL(get_monotonic_coarse);
> > +
>
> why does this need to be exported ?

Doesn't. Thanks for noticing!

thanks
-john


2009-07-22 01:29:26

by john stultz

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Tue, 2009-07-21 at 15:31 -0700, john stultz wrote:
> On Sat, 2009-07-18 at 15:30 -0700, Arjan van de Ven wrote:
> > On Sat, 18 Jul 2009 15:09:38 -0700
> > john stultz <[email protected]> wrote:
> >
> > > + } while (read_seqretry(&xtime_lock, seq));
> > > +
> > > + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
> > > + now.tv_nsec + mono.tv_nsec);
> > > + return now;
> > > +}
> > > +EXPORT_SYMBOL(get_monotonic_coarse);
> > > +
> >
> > why does this need to be exported ?
>
> Doesn't. Thanks for noticing!

Here's an updated version of the patch that fixes the following:
1) Remove EXPORT_SYMBOL for get_monotonic_coarse
2) The x86_64 vsyscall was using xtime instead of xtime_cache for the
calculations. This would cause the resolution to be actually
NTP_INTERVAL_FREQ (possibly 2/sec) instead of HZ.

After talking with some application writers who want very fast, but not
fine-grained timestamps, I decided to try to implement a new clock_ids
to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE
which returns the time at the last tick. This is very fast as we don't
have to access any hardware (which can be very painful if you're using
something like the acpi_pm clocksource), and we can even use the vdso
clock_gettime() method to avoid the syscall. The only trade off is you
only get low-res tick grained time resolution.

This isn't a new idea, I know Ingo pushed a patch (see commit
5899a0f044f3c80e9f7262ec5bc7164773a4c28e) a little while ago that made
the vsyscall gettimeofday() return coarse grained time when the
vsyscall64 sysctrl was set to 2. However this affects all applications
on a system.

With this method, applications can choose the proper speed/granularity
trade-off for themselves.

Any thoughts or feedback will be appreciated!

thanks
-john

Signed-off-by: John Stultz <[email protected]>

diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index dc27a69..3d61e20 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
u32 shift;
} clock;
struct timespec wall_to_monotonic;
+ struct timespec wall_time_coarse;
};
extern struct vsyscall_gtod_data __vsyscall_gtod_data
__section_vsyscall_gtod_data;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a..cf53a78 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6a40b78..ee55754 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts)
return 0;
}

+notrace static noinline int do_realtime_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ ts->tv_sec = gtod->wall_time_coarse.tv_sec;
+ ts->tv_nsec = gtod->wall_time_coarse.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ return 0;
+}
+
+notrace static noinline int do_monotonic_coarse(struct timespec *ts)
+{
+ unsigned long seq, ns, secs;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ secs = gtod->wall_time_coarse.tv_sec;
+ ns = gtod->wall_time_coarse.tv_nsec;
+ secs += gtod->wall_to_monotonic.tv_sec;
+ ns += gtod->wall_to_monotonic.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ vset_normalized_timespec(ts, secs, ns);
+ return 0;
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
- if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+ if (likely(gtod->sysctl_enabled))
switch (clock) {
case CLOCK_REALTIME:
- return do_realtime(ts);
+ if (likely(gtod->clock.vread))
+ return do_realtime(ts);
+ break;
case CLOCK_MONOTONIC:
- return do_monotonic(ts);
+ if (likely(gtod->clock.vread))
+ return do_monotonic(ts);
+ break;
+ case CLOCK_REALTIME_COARSE:
+ return do_realtime_coarse(ts);
+ case CLOCK_MONOTONIC_COARSE:
+ return do_monotonic_coarse(ts);
}
return vdso_fallback_gettime(clock, ts);
}
diff --git a/include/linux/time.h b/include/linux/time.h
index ea16c1a..ce13067 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -109,6 +109,8 @@ extern int timekeeping_suspended;

unsigned long get_seconds(void);
struct timespec current_kernel_time(void);
+struct timespec __current_kernel_time(void); /* does not hold xtime_lock */
+struct timespec get_monotonic_coarse(void);

#define CURRENT_TIME (current_kernel_time())
#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 })
@@ -241,6 +243,8 @@ struct itimerval {
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6

/*
* The IDs of various hardware clocks:
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d..ddf33bf 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -236,6 +236,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}

+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+ *tp = current_kernel_time();
+ return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+ struct timespec *tp)
+{
+ *tp = get_monotonic_coarse();
+ return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+ *tp = ktime_to_timespec(KTIME_LOW_RES);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -255,10 +274,24 @@ static __init int init_posix_timers(void)
.clock_set = do_posix_clock_nosettime,
.timer_create = no_timer_create,
};
+ struct k_clock clock_realtime_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };
+ struct k_clock clock_monotonic_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ };

register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);

posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9..d4735d7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -603,6 +603,10 @@ unsigned long get_seconds(void)
}
EXPORT_SYMBOL(get_seconds);

+struct timespec __current_kernel_time(void)
+{
+ return xtime_cache;
+}

struct timespec current_kernel_time(void)
{
@@ -618,3 +622,20 @@ struct timespec current_kernel_time(void)
return now;
}
EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec get_monotonic_coarse(void)
+{
+ struct timespec now, mono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = xtime_cache;
+ mono = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ now.tv_nsec + mono.tv_nsec);
+ return now;
+}

2009-07-22 21:40:53

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC][PATCH] Introduce CLOCK_REALTIME_COARSE

On Mon, Jul 20, 2009 at 03:49:14PM +0200, Peter Zijlstra wrote:
> On Mon, 2009-07-20 at 06:33 -0700, Arjan van de Ven wrote:
> > > Maybe power64, sparc64 and s390x qualify, but certainly nothing on x86
> > > does.
> >
> > the x86 on my desk disagrees.
>
> From what I know even nehalem doesn't have fully synced tscs when your
> machine is large enough, and the timers are still a tad expensive.

All single-socket Nehalem boards have synced TSCs, and some multi-socket
Nehalem boards have synced TSCs, notably Intel's. I don't know if you
can safely rely on the latter, though.

> Maybe your desktop is next-gen? 't would be nice to finally have an x86
> that has usable clock and timer hardware.

x86 CPUs with the ARAT feature ("always running APIC timer") will let
you use the local APIC even in deep C-states. I don't know about the
overhead of the local APIC.

- Josh Triplett