This is an alternative to Christoph Lameter's proposals. I think it
provides better functionality, and a more robust implementation. (The last
version of Christoph's patch I saw had a notable lack of necessary locking,
for example.) I am working on an expansion of this to implement timers
too. But since the code I've already finished and tested does more than
the proposal that's been floated, it seemed appropriate to separate out the
clock sampling functionality and post this.
This patch implements CPU time clocks in the clock_* interface
(posix-timers) for threads and processes (thread groups). These are
clockid_t values with the high bits set (i.e. negative), which encode a PID
(which can be zero to indicate the caller), and a flag indicating a
per-thread or per-process clock.
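For example, a per-thread SCHED clock for thread 1234 encodes and decodes
like this (a userland sketch for illustration only, not part of the patch;
it copies the macros added to <linux/posix-timers.h> below and assumes the
usual arithmetic right shift on negative values):

/* Illustration only: the clockid_t bit layout used by this patch. */
#include <stdio.h>
#include <time.h>		/* clockid_t */

#define CPUCLOCK_PROF			0
#define CPUCLOCK_VIRT			1
#define CPUCLOCK_SCHED			2
#define CPUCLOCK_PERTHREAD_MASK		4
#define CPUCLOCK_CLOCK_MASK		3

#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
	((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
#define MAKE_THREAD_CPUCLOCK(tid, clock) \
	MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK)

#define CPUCLOCK_PID(clock)		((int) ~((clock) >> 3))
#define CPUCLOCK_PERTHREAD(clock)	(((clock) & CPUCLOCK_PERTHREAD_MASK) != 0)
#define CPUCLOCK_WHICH(clock)		((clock) & CPUCLOCK_CLOCK_MASK)

int main(void)
{
	clockid_t c = MAKE_THREAD_CPUCLOCK(1234, CPUCLOCK_SCHED);

	/* Negative value encoding tid 1234, per-thread, SCHED flavor. */
	printf("clockid %d -> pid %d, perthread %d, which %d\n",
	       (int) c, CPUCLOCK_PID(c), CPUCLOCK_PERTHREAD(c),
	       CPUCLOCK_WHICH(c));
	return 0;
}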
There are three flavors of clock supported, each in thread and process
variants. The PROF clock counts utime + stime; this is the same clock by
which ITIMER_PROF is defined. The VIRT clock counts just utime; this is
the same clock by which ITIMER_VIRTUAL is defined. These utime and stime
are the same values reported e.g. by getrusage and times. That is, the
clock accumulates 1sec/HZ for each 1sec/HZ period in which that thread was
running when the system timer interrupt came. The final flavor of clock is
SCHED, which uses time as reported by `sched_clock' and counts the elapsed
time that the thread was scheduled on the CPU. For many platforms,
sched_clock just uses jiffies (i.e. the system timer interrupt) and so this
is the same as the PROF clock. But when a high-resolution cycle counter is
available (e.g. TSC on x86), then the SCHED clock records finer-grained and
more accurate information, because it charges a thread for the time it was
actually on a CPU between context switches, not the whole 1sec/HZ period.
When examining the SCHED clocks of other threads that are currently
scheduled on other CPUs, you only see the time accumulated as of the last
timer interrupt or context switch, but when asking about the current thread
it also adds in the current delta since the last context switch.
The PROF and VIRT clocks are the information that's traditionally available
and used for getrusage, SIGPROF/SIGVTALRM, etc. The SCHED clock is the
best information available. I figured I'd provide all three and let
userland decide when it wants to use which.
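For a concrete picture of the userland side, here is a sketch (illustration
only, not part of the patch; it copies the encoding macro and goes through
syscall() directly rather than relying on whatever glibc does today with
CPU-time clock ids):

/* Sketch: sample the calling process's three CPU clocks. */
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#define CPUCLOCK_PROF	0
#define CPUCLOCK_VIRT	1
#define CPUCLOCK_SCHED	2
#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
	((clockid_t) ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)))

static void sample(const char *name, clockid_t which)
{
	struct timespec ts;

	if (syscall(SYS_clock_gettime, which, &ts) == 0)
		printf("%s: %ld.%09ld\n", name, (long) ts.tv_sec, ts.tv_nsec);
}

int main(void)
{
	/* PID 0 means the calling process, as described above. */
	sample("PROF ", MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_PROF));
	sample("VIRT ", MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_VIRT));
	sample("SCHED", MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED));
	return 0;
}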
Thanks,
Roland
Signed-off-by: Roland McGrath <[email protected]>
--- linux-2.6/include/linux/posix-timers.h 2 Jul 2004 17:31:23 -0000 1.3
+++ linux-2.6/include/linux/posix-timers.h 5 Oct 2004 04:36:57 -0000
@@ -4,6 +4,23 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+#define CPUCLOCK_PID_MASK 7
+#define CPUCLOCK_PERTHREAD_MASK 4
+#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK 3
+#define CPUCLOCK_PROF 0
+#define CPUCLOCK_VIRT 1
+#define CPUCLOCK_SCHED 2
+#define CPUCLOCK_MAX 3
+
+#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
+ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock))
+#define MAKE_THREAD_CPUCLOCK(tid, clock) \
+ MAKE_PROCESS_CPUCLOCK((tid), (clock) | CPUCLOCK_PERTHREAD_MASK)
+
struct k_clock_abs {
struct list_head list;
spinlock_t lock;
@@ -41,4 +58,9 @@ struct now_struct {
(timr)->it_overrun += orun; \
} \
}while (0)
+
+int posix_cpu_clock_getres(clockid_t, struct timespec __user *);
+int posix_cpu_clock_gettime(clockid_t, struct timespec __user *);
+int posix_cpu_clock_settime(clockid_t, const struct timespec __user *);
+
#endif
--- linux-2.6/include/linux/sched.h 13 Sep 2004 21:05:18 -0000 1.272
+++ linux-2.6/include/linux/sched.h 5 Oct 2004 01:08:31 -0000
@@ -312,6 +312,14 @@ struct signal_struct {
unsigned long utime, stime, cutime, cstime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+
+ /*
+ * Cumulative ns of scheduled CPU time for dead threads in the
+ * group, not including a zombie group leader. (This only differs
+ * from jiffies_to_ns(utime + stime) if sched_clock uses something
+ * other than jiffies.)
+ */
+ unsigned long long sched_time;
};
/*
@@ -450,6 +458,7 @@ struct task_struct {
unsigned long sleep_avg;
long interactive_credit;
unsigned long long timestamp;
+ unsigned long long sched_time; /* sched_clock time spent running */
int activated;
unsigned long policy;
@@ -632,6 +641,7 @@ static inline int set_cpus_allowed(task_
#endif
extern unsigned long long sched_clock(void);
+extern unsigned long long current_sched_time(const task_t *current_task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
--- linux-2.6/kernel/Makefile 4 Sep 2004 23:18:26 -0000 1.43
+++ linux-2.6/kernel/Makefile 5 Oct 2004 01:04:59 -0000
@@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o
sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o intermodule.o extable.o params.o posix-timers.o \
- kthread.o
+ kthread.o posix-cpu-timers.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
--- linux-2.6/kernel/fork.c 23 Sep 2004 01:21:19 -0000 1.216
+++ linux-2.6/kernel/fork.c 5 Oct 2004 01:25:58 -0000
@@ -871,6 +871,7 @@ static inline int copy_signal(unsigned l
sig->utime = sig->stime = sig->cutime = sig->cstime = 0;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ sig->sched_time = 0;
return 0;
}
@@ -991,6 +992,7 @@ static task_t *copy_process(unsigned lon
p->real_timer.data = (unsigned long) p;
p->utime = p->stime = 0;
+ p->sched_time = 0;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = get_jiffies_64();
p->security = NULL;
--- linux-2.6/kernel/posix-timers.c 11 Sep 2004 06:20:07 -0000 1.36
+++ linux-2.6/kernel/posix-timers.c 5 Oct 2004 01:14:04 -0000
@@ -133,13 +133,9 @@ static spinlock_t idr_lock = SPIN_LOCK_U
* resolution. Here we define the standard CLOCK_REALTIME as a
* 1/HZ resolution clock.
*
- * CPUTIME & THREAD_CPUTIME: We are not, at this time, definding these
- * two clocks (and the other process related clocks (Std
- * 1003.1d-1999). The way these should be supported, we think,
- * is to use large negative numbers for the two clocks that are
- * pinned to the executing process and to use -pid for clocks
- * pinned to particular pids. Calls which supported these clock
- * ids would split early in the function.
+ * CPUTIME: These clocks have clockid_t values < -1. All the clock
+ * and timer system calls here just punt to posix_cpu_*
+ * functions defined in posix-cpu-timers.c, which see.
*
* RESOLUTION: Clock resolution is used to round up timer and interval
* times, NOT to report clock times, which are reported with as
@@ -1232,6 +1228,9 @@ sys_clock_settime(clockid_t which_clock,
{
struct timespec new_tp;
+ if (which_clock < 0)
+ return posix_cpu_clock_settime(which_clock, tp);
+
if ((unsigned) which_clock >= MAX_CLOCKS ||
!posix_clocks[which_clock].res)
return -EINVAL;
@@ -1249,6 +1248,9 @@ sys_clock_gettime(clockid_t which_clock,
struct timespec rtn_tp;
int error = 0;
+ if (which_clock < 0)
+ return posix_cpu_clock_gettime(which_clock, tp);
+
if ((unsigned) which_clock >= MAX_CLOCKS ||
!posix_clocks[which_clock].res)
return -EINVAL;
@@ -1267,6 +1269,9 @@ sys_clock_getres(clockid_t which_clock,
{
struct timespec rtn_tp;
+ if (which_clock < 0)
+ return posix_cpu_clock_getres(which_clock, tp);
+
if ((unsigned) which_clock >= MAX_CLOCKS ||
!posix_clocks[which_clock].res)
return -EINVAL;
--- linux-2.6/kernel/sched.c 24 Sep 2004 16:31:53 -0000 1.353
+++ linux-2.6/kernel/sched.c 5 Oct 2004 01:03:34 -0000
@@ -2358,6 +2358,31 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);
/*
+ * This is called on clock ticks and on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ */
+static void update_cpu_clock(task_t *p, runqueue_t *rq,
+ unsigned long long now)
+{
+ unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
+ p->sched_time += now - last;
+}
+
+/*
+ * Return current->sched_time plus any more ns on the sched_clock
+ * that have not yet been banked.
+ */
+unsigned long long current_sched_time(const task_t *tsk)
+{
+ unsigned long long ns;
+ local_irq_disable();
+ ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
+ ns = tsk->sched_time + (sched_clock() - ns);
+ local_irq_enable();
+ return ns;
+}
+
+/*
* We place interactive tasks back into the active array, if possible.
*
* To guarantee that this does not starve expired tasks we ignore the
@@ -2386,8 +2411,11 @@ void scheduler_tick(int user_ticks, int
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
runqueue_t *rq = this_rq();
task_t *p = current;
+ unsigned long long now = sched_clock();
+
+ update_cpu_clock(p, rq, now);
- rq->timestamp_last_tick = sched_clock();
+ rq->timestamp_last_tick = now;
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);
@@ -2758,6 +2786,8 @@ switch_tasks:
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
+ update_cpu_clock(prev, rq, now);
+
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0) {
prev->sleep_avg = 0;
--- linux-2.6/kernel/signal.c 16 Sep 2004 14:12:54 -0000 1.139
+++ linux-2.6/kernel/signal.c 5 Oct 2004 05:11:59 -0000
@@ -385,6 +385,7 @@ void __exit_signal(struct task_struct *t
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
+ sig->sched_time += tsk->sched_time;
spin_unlock(&sighand->siglock);
sig = NULL; /* Marker for below. */
}
--- /dev/null 2004-02-23 13:02:56.000000000 -0800
+++ linux-2.6/kernel/posix-cpu-timers.c 2004-10-04 18:28:40.000000000 -0700
@@ -0,0 +1,234 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <asm/uaccess.h>
+#include <linux/errno.h>
+
+union cpu_time_count {
+ unsigned long cpu;
+ unsigned long long sched;
+};
+
+static int check_clock(clockid_t which_clock)
+{
+ int error = 0;
+ struct task_struct *p;
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+
+ if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ if (pid == 0)
+ return 0;
+
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
+ p->tgid != current->tgid : p->tgid != pid)) {
+ error = -EINVAL;
+ }
+ read_unlock(&tasklist_lock);
+
+ return error;
+}
+
+static void sample_to_timespec(clockid_t which_clock,
+ union cpu_time_count cpu,
+ struct timespec *tp)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ tp->tv_sec = div_long_long_rem(cpu.sched,
+ NSEC_PER_SEC, &tp->tv_nsec);
+ } else {
+ jiffies_to_timespec(cpu.cpu, tp);
+ }
+}
+
+static int sample_to_user(clockid_t which_clock,
+ union cpu_time_count cpu,
+ struct timespec __user *tp)
+{
+ struct timespec ts;
+ sample_to_timespec(which_clock, cpu, &ts);
+ return copy_to_user(tp, &ts, sizeof *tp) ? -EFAULT : 0;
+}
+
+static inline unsigned long prof_ticks(struct task_struct *p)
+{
+ return p->utime + p->stime;
+}
+static inline unsigned long virt_ticks(struct task_struct *p)
+{
+ return p->utime;
+}
+static inline unsigned long long sched_ns(struct task_struct *p)
+{
+ return (p == current) ? current_sched_time(p) : p->sched_time;
+}
+
+int posix_cpu_clock_getres(clockid_t which_clock, struct timespec __user *tp)
+{
+ int error = check_clock(which_clock);
+ if (!error && tp) {
+ struct timespec rtn_tp = { 0, ((NSEC_PER_SEC + HZ - 1) / HZ) };
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ /*
+ * If sched_clock is using a cycle counter, we
+ * don't have any idea of its true resolution
+ * exported, but it is much more than 1s/HZ.
+ */
+ rtn_tp.tv_nsec = 1;
+ }
+ if (copy_to_user(tp, &rtn_tp, sizeof *tp))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+int posix_cpu_clock_settime(clockid_t which_clock,
+ const struct timespec __user *tp)
+{
+ /*
+ * You can never reset a CPU clock, but we check for other errors
+ * in the call before failing with EPERM.
+ */
+ int error = check_clock(which_clock);
+ if (error == 0) {
+ struct timespec new_tp;
+ error = -EPERM;
+ if (copy_from_user(&new_tp, tp, sizeof *tp))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+ union cpu_time_count *cpu)
+{
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ cpu->cpu = prof_ticks(p);
+ break;
+ case CPUCLOCK_VIRT:
+ cpu->cpu = virt_ticks(p);
+ break;
+ case CPUCLOCK_SCHED:
+ cpu->sched = sched_ns(p);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_clock_sample_group(clockid_t which_clock,
+ struct task_struct *p,
+ union cpu_time_count *cpu,
+ unsigned int *nthreads)
+{
+ struct task_struct *t = p;
+ unsigned long flags;
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ *nthreads = 0;
+ cpu->cpu = p->signal->utime + p->signal->stime;
+ do {
+ if (!unlikely(t->state & (TASK_DEAD|TASK_ZOMBIE)))
+ ++*nthreads;
+ cpu->cpu += prof_ticks(t);
+ t = next_thread(t);
+ } while (t != p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ break;
+ case CPUCLOCK_VIRT:
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ *nthreads = 0;
+ cpu->cpu = p->signal->utime;
+ do {
+ if (!unlikely(t->state & (TASK_DEAD|TASK_ZOMBIE)))
+ ++*nthreads;
+ cpu->cpu += virt_ticks(t);
+ t = next_thread(t);
+ } while (t != p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ break;
+ case CPUCLOCK_SCHED:
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ cpu->sched = p->signal->sched_time;
+ *nthreads = 1;
+ /* Add in each other live thread. */
+ while ((t = next_thread(t)) != p) {
+ if (!unlikely(t->state & (TASK_DEAD|TASK_ZOMBIE)))
+ ++*nthreads;
+ cpu->sched += t->sched_time;
+ }
+ if (p->tgid == current->tgid) {
+ /*
+ * We're sampling ourselves, so include the
+ * cycles not yet banked. We still omit
+ * other threads running on other CPUs,
+ * so the total can always be behind as
+ * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
+ */
+ cpu->sched += current_sched_time(current);
+ } else {
+ cpu->sched += p->sched_time;
+ }
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ break;
+ }
+ return 0;
+}
+
+int posix_cpu_clock_gettime(clockid_t which_clock, struct timespec __user *tp)
+{
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+ int error = -EINVAL;
+ union cpu_time_count rtn;
+
+ if (pid == 0) {
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ /*
+ * Sampling just ourselves we can do with no locking.
+ */
+ error = cpu_clock_sample(which_clock,
+ current, &rtn);
+ } else {
+ unsigned int n;
+ read_lock(&tasklist_lock);
+ error = cpu_clock_sample_group(which_clock,
+ current, &rtn, &n);
+ read_unlock(&tasklist_lock);
+ }
+ } else {
+ struct task_struct *p;
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (p) {
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (p->tgid == current->tgid) {
+ error = cpu_clock_sample(which_clock,
+ p, &rtn);
+ }
+ } else if (p->tgid == pid && p->signal) {
+ unsigned int n;
+ error = cpu_clock_sample_group(which_clock,
+ p, &rtn, &n);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ if (error)
+ return error;
+ return sample_to_user(which_clock, rtn, tp);
+}
Roland McGrath <[email protected]> wrote:
>
> +int posix_cpu_clock_settime(clockid_t which_clock,
> + const struct timespec __user *tp)
> +{
> + /*
> + * You can never reset a CPU clock, but we check for other errors
> + * in the call before failing with EPERM.
> + */
> + int error = check_clock(which_clock);
> + if (error == 0) {
> + struct timespec new_tp;
> + error = -EPERM;
> + if (copy_from_user(&new_tp, tp, sizeof *tp))
> + error = -EFAULT;
> + }
> + return error;
> +}
This will always return an error.
> This will always return an error.
Yes, indeed. It's just picky about which one. Something I meant to
mention: I do not allow clock_settime on CPU timers, because I don't know
of any worthwhile reason to want to support it. You get EINVAL or EFAULT
if your arguments are bogus, and EPERM if they are valid. This is the
POSIX-compliant error code choice.
Thanks,
Roland
Roland McGrath wrote:
> /*
> + * This is called on clock ticks and on context switches.
> + * Bank in p->sched_time the ns elapsed since the last tick or switch.
> + */
> +static void update_cpu_clock(task_t *p, runqueue_t *rq,
> + unsigned long long now)
> +{
> + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
> + p->sched_time += now - last;
> +}
This looks wrong. But update_cpu_clock is never called from another
CPU. In which case you don't need to worry about timestamp_last_tick.
> +
> +/*
> + * Return current->sched_time plus any more ns on the sched_clock
> + * that have not yet been banked.
> + */
> +unsigned long long current_sched_time(const task_t *tsk)
> +{
> + unsigned long long ns;
> + local_irq_disable();
> + ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
> + ns = tsk->sched_time + (sched_clock() - ns);
> + local_irq_enable();
> + return ns;
> +}
> +
This doesn't perform the timestamp_last_tick "normalisation" properly
either.
It also seems to conveniently ignore locking when reading those values
off another CPU. Not a big deal for dynamic load calculations, but I'm
not so sure about your usage...?
Lastly, even when using timestamp_last_tick correctly, I think sched_clock
will still drift around slightly, especially if a task switches CPUs a lot
(but not restricted to moving CPUs). Again this is something that I
haven't thought about much because it is not a big deal for current usage.
On Mon, 4 Oct 2004, Roland McGrath wrote:
> This is an alternative to Christoph Lameter's proposals. I think it
> provides better functionality, and a more robust implementation. (The last
> version of Christoph's patch I saw had a notable lack of necessary locking,
> for example.) I am working on an expansion of this to implement timers
> too. But since the code I've already finished and tested does more than
> the proposal that's been floated, it seemed appropriate to separate out the
> clock sampling functionality and post this.
Thanks for posting this. My last version of the patch for
CLOCK_*_CPUTIME_ID--as in the mm kernel now--has the necessary locking.
The version you saw was an early draft and you were not cc'ed on the
later version since I did not know about your
work. If there is any locking issue remaining please point that out.
Your work on providing access to other processes timers is certainly much
more comprehensive than my draft that I whipped up yesterday to get this
thing rolling....
> This patch implements CPU time clocks in the clock_* interface
> (posix-timers) for threads and processes (thread groups). These are
> clockid_t values with the high bits set (i.e. negative), which encode a PID
> (which can be zero to indicate the caller), and a flag indicating a
> per-thread or per-process clock.
Having these bits restricts the numeric range of pids for which you can
obtain this information. pid is a signed int. My patch preserves
the ability to use the full positive range of the int to access a
process clock.
Your approach means that only a part of the range may be used. What
happens if a pid of say 2^31-10 is used with your API?
> There are three flavors of clock supported, each in thread and process
> variants. The PROF clock counts utime + stime; this is the same clock by
> which ITIMER_PROF is defined. The VIRT clock counts just utime; this is
> the same clock by which ITIMER_VIRTUAL is defined. These utime and stime
> are the same values reported e.g. by getrusage and times. That is, the
> clock accumulates 1sec/HZ for each 1sec/HZ period in which that thread was
> running when the system timer interrupt came. The final flavor of clock is
> SCHED, which uses time as reported by `sched_clock' and counts the elapsed
> time that the thread was scheduled on the CPU. For many platforms,
> sched_clock just uses jiffies (i.e. the system timer interrupt) and so this
> is the same as the PROF clock. But when a high-resolution cycle counter is
> available (e.g. TSC on x86), then the SCHED clock records finer-grained and
> more accurate information, because it charges a thread for the time it was
> actually on a CPU between context switches, not the whole 1sec/HZ period.
> When examining the SCHED clocks of other threads that are currently
> scheduled on other CPUs, you only see the time accumulated as of the last
> timer interrupt or context switch, but when asking about the current thread
> it also adds in the current delta since the last context switch.
Does this approach take into consideration that the TSC or cputimer may be
different on each CPU of an SMP system? There are systems with
unsynchronized TSCs, systems with different processor speeds in one SMP
system, and systems that reduce the clock frequency to save power.
> The PROF and VIRT clocks are the information that's traditionally available
> and used for getrusage, SIGPROF/SIGVTALRM, etc. The SCHED clock is the
> best information available. I figured I'd provide all three and let
> userland decide when it wants to use which.
That's quite impressive. Thanks for posting this.
I just reviewed the code and to my surprise the simple things like
clock_gettime(CLOCK_PROCESS_CPUTIME_ID) and
clock_gettime(CLOCK_THREAD_CPUTIME_ID) are not supported. I guess glibc
would have to do multiple calls to add up all the threads in order to get
these values. This also leaves glibc in control of clocks and does not
allow the kernel to define its own clocks.
The code is pretty invasive and I am not sure that this brings us much
nor that it is done the right way.
The kernel interface is rather strange, with bits that may be set for
each special clock and then combined with the pid. It may be
advisable to have a separate function call for this, like
get_process_clock(pid, type, &timespec)
instead of using the posix calls. The thread-specific time
measurements have nothing to do with the posix standard and may best
be kept separate.
The benefit of the patches that I proposed is that they are a clean
implementation of the posix interface, that the kernel has full
control over posix clocks, and that driver-specific clocks as well as other
time sources can be defined that may also utilize the timed-interrupt
features of those clock chips. Roland's patch does not allow that and
instead will lead to more complex logic in glibc and a rather strange
kernel interface.
> Roland McGrath wrote:
>
> > /*
> > + * This is called on clock ticks and on context switches.
> > + * Bank in p->sched_time the ns elapsed since the last tick or switch.
> > + */
> > +static void update_cpu_clock(task_t *p, runqueue_t *rq,
> > + unsigned long long now)
> > +{
> > + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
> > + p->sched_time += now - last;
> > +}
>
> This looks wrong. But update_cpu_clock is never called from another
> CPU. In which case you don't need to worry about timestamp_last_tick.
I don't really understand this comment. update_cpu_clock is called from
schedule and from scheduler_tick. When it was last called by schedule,
p->timestamp will mark this time. When it was last called by
scheduler_tick, rq->timestamp_last_tick will mark this time.
Hence the max of the two is the last time update_cpu_clock was called.
> This doesn't perform the timestamp_last_tick "normalisation" properly
> either.
I don't know what you think is missing. If the "normalization" you are
talking about is this kind of thing:
p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+ rq_dest->timestamp_last_tick;
then that is not relevant here. That normalizes the timestamp to the new
CPU when changing from one CPU to another. This is not something that
matters for the sched_time tracking, because that only uses the difference
between a timestamp when the thread went on a CPU and the timestamp when it
went off. When a thread switches CPUs without yielding, this normalization
will happen to its ->timestamp, and then the next update_cpu_clock will be
taking the difference of the newly-appropriate ->timestamp value against
the current CPU's sched_clock value.
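To make that concrete with made-up numbers: say a thread is switched in at
sched_clock() == 1000000 ns (so schedule leaves p->timestamp at 1000000, and
the previous tick was earlier, at 900000), a timer interrupt arrives at
1400000, and the thread is switched out at 1650000. The tick banks
1400000 - max(1000000, 900000) = 400000 ns and then sets
rq->timestamp_last_tick to 1400000; the context switch banks
1650000 - max(1000000, 1400000) = 250000 ns. The total of 650000 ns is
exactly the time spent on the CPU, with nothing counted twice and nothing
counted from before the thread went on.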
> It also seems to conveniently ignore locking when reading those values
> off another CPU. Not a big deal for dynamic load calculations, but I'm
> not so sure about your usage...?
Here again I don't know what you are talking about. Nothing is ever read
"off another CPU". A thread maintains its own sched_time counter while it
is running on a CPU.
> Lastly, even when using timestamp_last_tick correctly, I think sched_clock
> will still drift around slightly, especially if a task switches CPUs a lot
> (but not restricted to moving CPUs).
Please explain.
Thanks,
Roland
> Your approach means that only a part of the range may be used.
> What happens if a pid of say 2^31-10 is used with your API?
That is certainly true. On a 32-bit machine, a PID value above 2^29-1
cannot be used, by definition. So there is no "what happens", it just can't.
Userland will need to reject huge PIDs before trying to encode them.
PID_MAX_LIMIT is 2^22, so no actually valid PID can be a problem in practice.
> Does this approach take into consideration that the TSC or cputimer may be
> different on each CPU of a SMP system?
Yes. If you consider the methodology I described, you'll see that it does.
The absolute sched_clock time is never relevant here, so it doesn't matter
that it differs between CPUs. I take a sample when the thread gets
scheduled, and another when it gets descheduled (and perhaps others on the
timer interrupts in between). So all I ever use is the difference between
two samples taken on the same CPU.
> I just reviewed the code and to my surprise the simple things like
>
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID) and
> clock_gettime(CLOCK_THREAD_CPUTIME_ID) are not supported.
You seem to be confused. A clockid_t for a CPU clock encodes a PID, which
can be zero to indicate the current thread or current process.
> The thread specific time measurements have nothing to do with the posix
> standard and may best be kept separate.
Nonsense. POSIX defines the notion of CPU clocks for these calls, and that
is what I have implemented.
Of course glibc is in charge of what the meaning of the POSIX APIs is.
That is true for every call.
Thanks,
Roland
On Tue, 5 Oct 2004, Roland McGrath wrote:
> > I just reviewed the code and to my surprise the simple things like
> >
> > clock_gettime(CLOCK_PROCESS_CPUTIME_ID) and
> > clock_gettime(CLOCK_THREAD_CPUTIME_ID) are not supported.
>
> You seem to be confused. A clockid_t for a CPU clock encodes a PID, which
> can be zero to indicate the current thread or current process.
Is there a standard for that? Or is it an opaque type that you have
defined this way?
> > The thread specific time measurements have nothing to do with the posix
> > standard and may best be kept separate.
>
> Nonsense. POSIX defines the notion of CPU clocks for these calls, and that
> is what I have implemented.
Posix only defines a process and a thread clock. This is much more.
> Of course glibc is in charge of what the meaning of the POSIX APIs is.
> That is true for every call.
The proper information to realize these clocks is only available on the
kernel level. A clean API for that would remove lots of code that
currently exists in glibc.
I wonder how glibc will realize access to special timer hardware. Will
glibc be able to load device drivers for timer chips?
> Is there a standard for that? Or is it an opaque type that you have
> defined this way?
Of course there is no standard for the bits used in a clockid_t. This is
an implementation detail in POSIX terms. POSIX defines function interfaces
to return clockid_t values (clock_getcpuclockid, pthread_getcpuclockid).
I have chosen the kernel-user ABI for Linux clockid_t's here, but that is
only of concern to the kernel and glibc.
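To give the flavor of what the glibc side becomes, here is a sketch (not
actual glibc code; it assumes we back the POSIX CPUTIME clocks with the
SCHED flavor, and my_clock_getcpuclockid is just an illustrative name):

#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#define CPUCLOCK_SCHED 2
#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
	((clockid_t) ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)))

/* Roughly what clock_getcpuclockid could reduce to on top of this ABI. */
int my_clock_getcpuclockid(pid_t pid, clockid_t *clock_id)
{
	clockid_t id = MAKE_PROCESS_CPUCLOCK(pid, CPUCLOCK_SCHED);
	struct timespec res;

	/* The kernel's check_clock validates the pid, so probing with
	   clock_getres reports a bogus pid now rather than on first use. */
	if (syscall(SYS_clock_getres, id, &res) != 0)
		return errno;
	*clock_id = id;
	return 0;
}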
> Posix only defines a process and a thread clock. This is much more.
Like I said the first time, it's three kinds of clocks. One is what we
will in future use to define POSIX's CPUTIME clocks in our POSIX
implementation, and the other two are what we already use to define
ITIMER_PROF/ITIMER_VIRTUAL in our existing POSIX implementation.
> I wonder how glibc will realize access to special timer hardware. Will
> glibc be able load device drivers for timer chips?
glibc has zero interest in doing any of that. It will use the single new
"best information" kernel interface when that is available, and it's the
kernel's concern what the best information available from the hardware is.
Thanks,
Roland
On Tue, 5 Oct 2004, Roland McGrath wrote:
> > Is there a standard for that? Or is it an opaque type that you have
> > defined this way?
>
> Of course there is no standard for the bits used in a clockid_t. This is
> an implementation detail in POSIX terms. POSIX defines function interfaces
> to return clockid_t values (clock_getcpuclockid, pthread_getcpuclockid).
> I have chosen the kernel-user ABI for Linux clockid_t's here, but that is
> only of concern to the kernel and glibc.
Maybe it's best then to restrict this definition to glibc? Glibc can muck
around with these bits and then call either
clock_gettime(clocktype, &timespec)
or
clock_process_gettimer(pid, type/clocktype, &timespec)
That way the glibc guys have full control and the kernel does not have
to deal with the bit mongering.
> > Posix only defines a process and a thread clock. This is much more.
>
> Like I said the first time, it's three kinds of clocks. One is what we
> will in future use to define POSIX's CPUTIME clocks in our POSIX
> implementation, and the other two are what we already use to define
> ITIMER_PROF/ITIMER_VIRTUAL in our existing POSIX implementation.
How will you provide the necessary hardware interface for the kernel
clocks?
> > I wonder how glibc will realize access to special timer hardware. Will
> > glibc be able load device drivers for timer chips?
>
> glibc has zero interest in doing any of that. It will use the single new
> "best information" kernel interface when that is available, and it's the
> kernel's concern what the best information available from the hardware is.
Ok then we might best follow my above suggestion. Put the timer stuff
into a separate .c file as you have already done. Rename it to something
like process_clocks.c. posix-timers.c will then only be an interface to a
variety of hardware clocks that glibc may use at will.
Roland McGrath wrote:
>>CPU. In which case you don't need to worry about timestamp_last_tick.
>
>
> I don't really understand this comment. update_cpu_clock is called from
> schedule and from scheduler_tick. When it was last called by schedule,
> p->timestamp will mark this time. When it was last called by
> scheduler_tick, rq->timestamp_last_tick will mark this time.
> Hence the max of the two is the last time update_cpu_clock was called.
>
OK I see what its doing - ignore my comments then :P
>
>
>>It also seems to conveniently ignore locking when reading those values
>>off another CPU. Not a big deal for dynamic load calculations, but I'm
>>not so sure about your usage...?
>
>
> Here again I don't know what you are talking about. Nothing is ever read
> "off another CPU". A thread maintains its own sched_time counter while it
> is running on a CPU.
>
It seemed like a syscall could read the values from a task currently
running on another CPU. If not, great.
>
>>Lastly, even when using timestamp_last_tick correctly, I think sched_clock
>>will still drift around slightly, especially if a task switches CPUs a lot
>>(but not restricted to moving CPUs).
>
>
> Please explain.
>
As you pointed out, you are only measuring on-cpu time so this shouldn't
be a problem either.
Nick
> It seemed like a syscall could read the values from a task currently
> running on another CPU. If not, great.
Indeed it can. And yes, there is no locking against updates for this. For
sched_time on 32-bit platforms, there is the possibility it could be read
during an update and give a bogus value if the high half is updated before
the low half. Since there are no guarantees about accuracy, period, I
decided not to worry about such an anomaly. Perhaps it would be better to
do something about this, but AFAIK nothing perfect can be done without
adding more words to task_struct (e.g. seqcount). I don't know if the
nature of SMP cache behavior makes something like:
do {
sample = p->sched_time;
} while (p->sched_time != sample);
sufficient. That would certainly be easy to do.
Thanks,
Roland
> Maybe its best then to restrict this definition to glibc?
I can't figure out what problem you think exists or what you are trying to
gain here.
> How will you provide the necessary hardware interface for the kernel
> clocks?
You again seem to be asking a question that makes no sense in the context
that I have already explained.
Roland McGrath wrote:
>>It seemed like a syscall could read the values from a task currently
>>running on another CPU. If not, great.
>
>
> Indeed it can. And yes, there is no locking against updates for this. For
> sched_time on 32-bit platforms, there is the possibility it could be read
> during an update and give a bogus value if the high half is updated before
> the low half. Since there are no guarantees about accuracy, period, I
> decided not to worry about such an anomaly. Perhaps it would be better to
> do something about this, but AFAIK nothing perfect can be done without
> adding more words to task_struct (e.g. seqcount). I don't know if the
> nature of SMP cache behavior makes something like:
>
> do {
> sample = p->sched_time;
> } while (p->sched_time != sample);
>
> sufficient. That would certainly be easy to do.
>
I don't think that will be quite sufficient.
A seqcount would probably be the way to go.
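Something like this, say (untested sketch, as it would sit in sched.c where
runqueue_t is visible; sched_time_seq is a hypothetical new seqcount_t field
in task_struct, i.e. the extra word Roland mentioned):

/* Untested sketch: protect p->sched_time with a per-task seqcount. */
#include <linux/seqlock.h>
#include <linux/sched.h>

/* Writer side (as in the patch, plus the seqcount), still only run
 * on the task's own CPU with irqs off: */
static void update_cpu_clock(task_t *p, runqueue_t *rq,
			     unsigned long long now)
{
	unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);

	write_seqcount_begin(&p->sched_time_seq);
	p->sched_time += now - last;
	write_seqcount_end(&p->sched_time_seq);
}

/* Reader side, safe against a torn 64-bit read from another CPU: */
static unsigned long long read_sched_time(task_t *p)
{
	unsigned long long ns;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&p->sched_time_seq);
		ns = p->sched_time;
	} while (read_seqcount_retry(&p->sched_time_seq, seq));
	return ns;
}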
On Tue, 5 Oct 2004, Roland McGrath wrote:
> > Maybe its best then to restrict this definition to glibc?
>
> I can't figure out what problem you think exists or what you are trying to
> gain here.
If clockid_t with your proposed bitfields were restricted to glibc
alone, then we would not have to deal with these bits at the kernel level.
The clock_gettime function calls would take a clock number, not a
clockid_t in the way that you want to use it. That would allow us to keep
the interface clear of pids mixed with clock numbers.
> > How will you provide the necessary hardware interface for the kernel
> > clocks?
>
> You again seem to be asking a question that makes no sense in the context
> that I have already explained.
The question is pretty acute. Currently glibc accesses CPU timers on its
own, bypassing the kernel. Most of the time it works, but on lots of
systems this is royally screwed up and CLOCK_*_CPUTIME_ID returns funky
values.
Users need to be able to use the specialized clocks provided by the
kernel. Currently CLOCK_REALTIME and CLOCK_MONOTONIC are the only clocks
that glibc will use. It seems that some people have tried to convince the
glibc maintainers to allow more clocks (in particular CLOCK_REALTIME_HR
and CLOCK_MONOTONIC_HR). The current patch in mm provides for
CLOCK_SGI_CYCLE.
We have been trying to get a resolution on these issues for a long time
from the glibc folks (I can trace the CPUTIME issue back at least 1 1/2
years).
Would it be possible to define a clean interface that would get glibc out of
the business of accessing hardware directly and allow some sanity in terms
of clock access?
Does this not fit into your context? I have stated the problem a gazillion
times, I probably have no problem in explaining it another ten times to you.
I do not see that you are solving the most urgent problems. Instead we get
funky forms of pid's mixed with clockid's.
> If clockid_t with your proposed bitfields would be restricted to glibc
> alone then we would not have to deal with these bits on the kernel level.
>
> The clock_gettime function calls would take a clock number, not a
> clockid_t in the way that you want to use it. That would allow us to keep
> the interface clear of pids mixed with clock numbers.
That is true. I am not convinced that it is worth adding this hassle based
on the purely academic concern about PID_MAX_LIMIT being raised to 2^29 on
32-bit machines or 2^61 on 64-bit machines. That would be 2TB of 4k kernel
stacks alone on the 32-bit machine. It's not going to happen.
Any plan that doesn't encode all the information into the clockid_t value
necessarily requires either adding new system calls or adding new
parameters to the existing ones. Either way, it's a whole bunch of
annoyance to change, support binary compatibility properly, etc, etc.
There has to be an actually worthwhile reason not to just go with my scheme.
> The question is pretty acute. Currently glibc accesses CPU timers on its
> own bypassing the kernel. Most of the times it works but on lots of
> systems this is royally screwed up and CLOCK_*_CPUTIME_ID returns funky
> values.
This is what is no longer relevant with the kernel work I've now done.
In fact, it never works accurately if there are any context switches going on.
> User needs to be able to use the specialized clocks provided by the kernel.
All glibc users need is one definition of the most accurate CPU time clock
available, and that will be provided through the POSIX interfaces
(CLOCK_THREAD_CPUTIME_ID, CLOCK_PROCESS_CPUTIME_ID, clock_getcpuclockid,
and pthread_getcpuclockid). We have no intention of providing detailed
access to different kinds of hardware clocks in a glibc interface. If you
want that, write your own separate interface. Users I know about are
interested in the most accurate CPU time tracking that is available on the
system they are using in a given instance. This is what they get by
letting the kernel implementation, configuration, and hardware detection
choose the optimal way to implement sched_clock, which is what they now get.
By contrast, your patch provides a new way to access the
least-common-denominator low-resolution information already available by
other means, and nothing more.
> We have been trying to get a resolution on these issues for a long time
> from the glibc folks (I can trace the CPUTIME issue back at least 1 1/2
> years).
>
> Would be possible to define a clean interface that would get glibc out of
> the business of accessing hardware directly and allow some sanity in terms
> of clock access?
This is exactly what I have done, as I have already said more than once.
I am glibc folks. This kernel implementation is motivated by glibc's needs
to better implement the POSIX interfaces. It is what we want.
Your characterization of what has happened on this issue is rather skewed,
but I won't go into that since it's moot. We are trying to move forward
now on schemes that really work, and there has been no prior concrete
proposal from any quarter that met the functional requirements. My kernel
implementation is intended to do so, but I am certainly open to meaningful
critique of its efficacy.
> Does this fit not into your context? I have stated the problem a gazillion
> times, I probably have no problem in explaining it another ten times to you.
>
> I do not see that you are solving the most urgent problems. Instead we get
> funky forms of pid's mixed with clockid's.
I'm sorry that you do not see it. "My context" is that the actual
problems, some of which you've talked about and some of which you seem to
have misunderstood, are indeed solved by the work I've done, and I've
explained more than once how that is the case. I'm glad that you are not
unhappy continuing to discuss the issue. I don't mind continuing any
discussion that is making any progress. I hope you also don't have a
problem foregoing the ten repetitions of your understanding of the
situation, which I believe you have already communicated successfully so I
understand what you think about it, and addressing yourself instead to any
concrete concerns that remain given a clear understanding of what the new
situation will in fact be given my kernel implementation and appropriate
glibc support tailored for it.
Thanks,
Roland
On Thu, 7 Oct 2004, Roland McGrath wrote:
> All glibc users need is one definition of the most accurate CPU time clock
> available, and that will be provided through the POSIX interfaces
> (CLOCK_THREAD_CPUTIME_ID, CLOCK_PROCESS_CPUTIME_ID, clock_getcpuclockid,
> and pthread_getcpuclockid). We have no intention of providing detailed
> access to different kinds of hardware clocks in a glibc interface. If you
> want that, write your own separate interface. Users I know about are
> interested in the most accurate CPU time tracking that is available on the
> system they are using in a given instance. This is what they get by
> letting the kernel implementation, configuration, and hardware detection
> choose the optimal way to implement sched_clock, which is what they now get.
CLOCK_*_CPUTIME_ID are not clocks in a real sense. The user should
never choose those for real time. The most accurate time is provided by
CLOCK_REALTIME by the kernel.
> By contrast, your patch provides a new way to access the
> least-common-denominator low-resolution information already available by
> other means, and nothing more.
Yes, the information is also available through the /proc filesystem. But it
should also be available through clock_gettime and friends. The patch
makes these calls posix compliant, nothing more. Increasing the accuracy
of the timers is better done with the revision of the time handling
proposed by John Stultz.
> I'm sorry that you do not see it. "My context" is that the actual
> problems, some of which you've talked about and some of which you seem to
> have misunderstood, are indeed solved by the work I've done, and I've
> explained more than once how that is the case. I'm glad that you are not
> unhappy continuing to discuss the issue. I don't mind continuing any
> discussion that is making any progress. I hope you also don't have a
> problem foregoing the ten repetitions of your understanding of the
> situation, which I believe you have already communicated successfully so I
> understand what you think about it, and addressing yourself instead to any
> concrete concerns that remain given a clear understanding of what the new
> situation will in fact be given my kernel implementation and appropriate
> glibc support tailored for it.
I am happy that you are at least discussing the issue.
I did not see a clean implementation of CLOCK_*_CPUTIME_ID,
nor that the issue of providing additional clocks via clock_gettime
was addressed.