2009-04-02 09:15:11

by Peter Zijlstra

Subject: [PATCH 2/6] RFC perf_counter: singleshot support

By request, provide a way for counters to disable themselves and signal
at the first counter overflow.

This isn't complete; we really want pending work to be done ASAP after
queueing it. My preferred method would be a self-IPI, which would ensure
we run the code in a usable context right after the current (IRQ-off,
NMI) context is done.

Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/powerpc/kernel/perf_counter.c | 2
arch/x86/kernel/cpu/perf_counter.c | 2
include/linux/perf_counter.h | 21 +++++---
kernel/perf_counter.c | 94 ++++++++++++++++++++++++++++---------
4 files changed, 89 insertions(+), 30 deletions(-)

Index: linux-2.6/arch/powerpc/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_counter.c
+++ linux-2.6/arch/powerpc/kernel/perf_counter.c
@@ -732,7 +732,7 @@ static void record_and_restart(struct pe
* Finally record data if requested.
*/
if (record)
- perf_counter_output(counter, 1, regs);
+ perf_counter_overflow(counter, 1, regs);
}

/*
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,7 @@ again:
continue;

perf_save_and_restart(counter);
- perf_counter_output(counter, nmi, regs);
+ perf_counter_overflow(counter, nmi, regs);
}

hw_perf_ack_status(ack);
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -141,13 +141,19 @@ struct perf_counter_hw_event {
exclude_idle : 1, /* don't count when idle */
mmap : 1, /* include mmap data */
munmap : 1, /* include munmap data */
+ singleshot : 1, /* singleshot overflow */

- __reserved_1 : 53;
+ __reserved_1 : 52;

__u32 extra_config_len;
__u32 __reserved_4;

- __u64 __reserved_2;
+ /*
+ * Singleshot signal information.
+ */
+ __u32 signal_nr;
+ __u32 signal_tid;
+
__u64 __reserved_3;
};

@@ -325,8 +331,9 @@ struct perf_mmap_data {
void *data_pages[0];
};

-struct perf_wakeup_entry {
- struct perf_wakeup_entry *next;
+struct perf_pending_entry {
+ struct perf_pending_entry *next;
+ void (*func)(struct perf_pending_entry *);
};

/**
@@ -404,7 +411,7 @@ struct perf_counter {
/* poll related */
wait_queue_head_t waitq;
/* optional: for NMIs */
- struct perf_wakeup_entry wakeup;
+ struct perf_pending_entry pending;

void (*destroy)(struct perf_counter *);
struct rcu_head rcu_head;
@@ -493,8 +500,8 @@ extern int hw_perf_group_sched_in(struct
struct perf_counter_context *ctx, int cpu);
extern void perf_counter_update_userpage(struct perf_counter *counter);

-extern void perf_counter_output(struct perf_counter *counter,
- int nmi, struct pt_regs *regs);
+extern void perf_counter_overflow(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs);
/*
* Return 1 for a software counter, 0 for a hardware counter
*/
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1563,6 +1563,14 @@ void perf_counter_wakeup(struct perf_cou
wake_up_all(&counter->waitq);
}

+static void perf_pending_wakeup(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ perf_counter_wakeup(counter);
+}
+
/*
* Pending wakeups
*
@@ -1572,45 +1580,47 @@ void perf_counter_wakeup(struct perf_cou
* single linked list and use cmpxchg() to add entries lockless.
*/

-#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)

-static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
PENDING_TAIL,
};

-static void perf_pending_queue(struct perf_counter *counter)
+static void perf_pending_queue(struct perf_pending_entry *entry,
+ void (*func)(struct perf_pending_entry *))
{
- struct perf_wakeup_entry **head;
- struct perf_wakeup_entry *prev, *next;
+ struct perf_pending_entry **head;

- if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+ if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
return;

- head = &get_cpu_var(perf_wakeup_head);
+ entry->func = func;
+
+ head = &get_cpu_var(perf_pending_head);

do {
- prev = counter->wakeup.next = *head;
- next = &counter->wakeup;
- } while (cmpxchg(head, prev, next) != prev);
+ entry->next = *head;
+ } while (cmpxchg(head, entry->next, entry) != entry->next);

set_perf_counter_pending();

- put_cpu_var(perf_wakeup_head);
+ put_cpu_var(perf_pending_head);
}

static int __perf_pending_run(void)
{
- struct perf_wakeup_entry *list;
+ struct perf_pending_entry *list;
int nr = 0;

- list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+ list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
while (list != PENDING_TAIL) {
- struct perf_counter *counter = container_of(list,
- struct perf_counter, wakeup);
+ void (*func)(struct perf_pending_entry *);
+ struct perf_pending_entry *entry = list;

list = list->next;

- counter->wakeup.next = NULL;
+ entry->next = NULL;
+ func = entry->func;
/*
* Ensure we observe the unqueue before we issue the wakeup,
* so that we won't be waiting forever.
@@ -1618,7 +1628,7 @@ static int __perf_pending_run(void)
*/
smp_wmb();

- perf_counter_wakeup(counter);
+ func(entry);
nr++;
}

@@ -1640,7 +1650,7 @@ static inline int perf_not_pending(struc
* so that we do not miss the wakeup. -- see perf_pending_handle()
*/
smp_rmb();
- return counter->wakeup.next == NULL;
+ return counter->pending.next == NULL;
}

static void perf_pending_sync(struct perf_counter *counter)
@@ -1679,9 +1689,10 @@ struct perf_output_handle {

static inline void __perf_output_wakeup(struct perf_output_handle *handle)
{
- if (handle->nmi)
- perf_pending_queue(handle->counter);
- else
+ if (handle->nmi) {
+ perf_pending_queue(&handle->counter->pending,
+ perf_pending_wakeup);
+ } else
perf_counter_wakeup(handle->counter);
}

@@ -1999,6 +2010,47 @@ void perf_counter_munmap(unsigned long a
}

/*
+ * Generic counter overflow handling.
+ */
+
+void perf_counter_singleshot(struct perf_counter *counter)
+{
+ struct pid *pid;
+
+ perf_counter_disable(counter);
+
+ rcu_read_lock();
+ pid = find_vpid(counter->hw_event.signal_tid);
+ if (pid)
+ kill_pid(pid, counter->hw_event.signal_nr, 1);
+ rcu_read_unlock();
+}
+
+void perf_pending_singleshot(struct perf_pending_entry *entry)
+{
+ struct perf_counter *counter = container_of(entry,
+ struct perf_counter, pending);
+
+ perf_counter_singleshot(counter);
+}
+
+void perf_counter_overflow(struct perf_counter *counter,
+ int nmi, struct pt_regs *regs)
+{
+ if (counter->hw_event.singleshot) {
+ if (nmi) {
+ perf_pending_queue(&counter->pending,
+ perf_pending_singleshot);
+ } else
+ perf_counter_singleshot(counter);
+
+ return;
+ }
+
+ perf_counter_output(counter, nmi, regs);
+}
+
+/*
* Generic software counter infrastructure
*/


--


2009-04-02 10:52:20

by Ingo Molnar

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support


* Peter Zijlstra <[email protected]> wrote:

> By request, provide a way for counters to disable themselves and
> signal at the first counter overflow.
>
> This isn't complete, we really want pending work to be done ASAP
> after queueing it. My preferred method would be a self-IPI, that
> would ensure we run the code in a usable context right after the
> current (IRQ-off, NMI) context is done.

Hm. I do think self-IPIs can be fragile, but the more work we do in
NMI context the more compelling the case for a self-IPI becomes. So no
big arguments against that.

Regarding single-shot - i think the code you posted fails to do the
single-shot aspect: the counter does not stop and wait for the signal
handler to get its act together.

So i'd suggest separating the two concepts: add signal support and
single-shot via two attributes. User-space might decide to use
single-shot if it does not want to deal with queued signals or lost
events due to not processing the previous signal fast enough.

Also, user-space might want to use single-shot _without_ signals
perhaps.

And that brings up that user-space might want to generate N events
and then stop, reliably. I.e. single-shot is a specific form of a
"trigger limit".

Plus the question comes up: don't we need an ioctl to let user-space
refill/re-enable the trigger limit?

That is an issue for the signal case as well: we stop the counter
... what restarts it? Does the signal handler have to close the
counter and re-create it just to get the next single-shot event?

So i think we need 3 separate things:

 - the ability to set a signal attribute of the counter (during
   creation) via a (signo,tid) pair.

   Semantics:

    - it can be a regular signal (signo < 32),
      or an RT/queued signal (signo >= 32).

    - It may be sent to the task that generated the event (tid == 0),
      or it may be sent to a specific task (tid > 0),
      or it may be sent to a task group (tid < 0).

 - 'event limit' attribute: the ability to pause new events after N
   events. This limit auto-decrements on each event.
   limit==1 is the special case for single-shot.

 - new ioctl method to refill the limit, when user-space is ready to
   receive new events. A special-case of this is when a signal
   handler calls ioctl(refill_limit, 1) in the single-shot case -
   this re-enables events after the signal has been handled.
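
For illustration, a rough user-space sketch of how these pieces could fit
together. The singleshot/signal_nr/signal_tid fields are the ones added by
the RFC patch above; the 'event limit' and its refill ioctl are only
proposed here, so the PERF_COUNTER_IOC_REFILL_LIMIT name below is made up,
and the sys_perf_counter_open() calling convention is assumed to match
this patch series:

#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int counter_fd;

static void overflow_handler(int sig)
{
        /* ... consume the sample(s) from the mmap buffer ... */

        /* proposed: refill the event limit once we are done */
        ioctl(counter_fd, PERF_COUNTER_IOC_REFILL_LIMIT, 1); /* made-up name */
}

int main(void)
{
        struct perf_counter_hw_event hw_event;

        memset(&hw_event, 0, sizeof(hw_event));
        /* ... select an event and sampling period as usual ... */
        hw_event.singleshot = 1;        /* fields from the RFC patch above */
        hw_event.signal_nr  = SIGRTMIN; /* RT/queued signal (signo >= 32) */
        hw_event.signal_tid = 0;        /* 0: signal the generating task */

        signal(SIGRTMIN, overflow_handler);

        counter_fd = syscall(__NR_perf_counter_open, &hw_event,
                             0 /* pid: self */, -1 /* cpu: any */,
                             -1 /* group_fd: none */, 0 /* flags */);

        /* ... run the monitored workload ... */
        return 0;
}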

Another observation: i think perf_counter_output() needs to depend
on whether the counter is signalling, not on the single-shot-ness of
the counter.

A completely valid use of this would be for user-space to create an
mmap() buffer of 1024 events, then set the limit to 1024, and wait
for the 1024 events to happen - process them and close the counter.
Without any signalling.

Basically the 'limit' allows counter events to be used as a tracer
in essence. So i think it's a nice touch of the whole scheme, and it
should be decoupled from the signal attribute.

Hm?

Ingo

2009-04-02 11:48:01

by Peter Zijlstra

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support

On Thu, 2009-04-02 at 12:51 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > By request, provide a way for counters to disable themselves and
> > signal at the first counter overflow.
> >
> > This isn't complete, we really want pending work to be done ASAP
> > after queueing it. My preferred method would be a self-IPI, that
> > would ensure we run the code in a usable context right after the
> > current (IRQ-off, NMI) context is done.
>
> Hm. I do think self-IPIs can be fragile but the more work we do in
> NMI context the more compelling of a case can be made for a
> self-IPI. So no big arguments against that.

It's not only NMI, but also things like software events in the scheduler
under rq->lock, or hrtimers in irq context. You cannot do a wakeup from
under rq->lock, nor hrtimer_cancel() from within the timer handler.

All these nasty little issues stack up and could be solved with a
self-IPI.


Then there is the software task-time clock, which uses
p->se.sum_exec_runtime and therefore requires taking rq->lock to read it.
Coupling this with, for example, an NMI overflow handler gives an instant
deadlock.

Would you terribly mind if I remove all that sum_exec_runtime and
rq->lock stuff and simply use cpu_clock() to keep count? These things
get context-switched along with tasks anyway.
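
As a minimal sketch of what that could look like - accumulating
cpu_clock() deltas at context switch instead of reading
p->se.sum_exec_runtime under rq->lock; the sched_in/sched_out hook names
and the start_time field are illustrative only, not the actual
hw_perf_counter layout:

static void task_clock_perf_counter_sched_in(struct perf_counter *counter,
                                             int cpu)
{
        /* remember when this task went on the CPU */
        counter->hw.start_time = cpu_clock(cpu);
}

static void task_clock_perf_counter_sched_out(struct perf_counter *counter,
                                              int cpu)
{
        /* account the time the task just spent on the CPU */
        u64 delta = cpu_clock(cpu) - counter->hw.start_time;

        atomic64_add(delta, &counter->count);
}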



> So i think we need 3 separate things:
>
> - the ability to set a signal attribute of the counter (during
> creation) via a (signo,tid) pair.
>
> Semantics:
>
> - it can be a regular signal (signo < 32),
> or an RT/queued signal (signo >= 32).
>
> - It may be sent to the task that generated the event (tid == 0),
> or it may be sent to a specific task (tid > 0),
> or it may be sent to a task group (tid < 0).

kill_pid() seems to be able to do all of that:

        struct pid *pid;
        int tid, priv;

        perf_counter_disable(counter);

        rcu_read_lock();
        tid = counter->hw_event.signal_tid;
        if (!tid)
                tid = current->pid;
        priv = 1;
        if (tid < 0) {
                priv = 0;
                tid = -tid;
        }
        pid = find_vpid(tid);
        if (pid)
                kill_pid(pid, counter->hw_event.signal_nr, priv);
        rcu_read_unlock();

That should do it, afaict.

Except I probably should look into this pid-namespace mess and clean all
that up.

> - 'event limit' attribute: the ability to pause new events after N
> events. This limit auto-decrements on each event.
> limit==1 is the special case for single-shot.

That should go along with a toggle on what an event is, I suppose -
either an 'output' event or a filled page?

Or do we want to limit that to counter overflow?

> - new ioctl method to refill the limit, when user-space is ready to
> receive new events. A special-case of this is when a signal
> handler calls ioctl(refill_limit, 1) in the single-shot case -
> this re-enables events after the signal has been handled.

Right, with the method implemented above, it's simply a matter of the
enable ioctl.
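
For the single-shot/signal case that amounts to the signal handler
re-arming the counter once it is done, along these lines (counter_fd
being whatever fd the counter was opened on, and PERF_COUNTER_IOC_ENABLE
the enable ioctl referred to above):

static void overflow_handler(int sig)
{
        /* ... handle the sample ... */

        /* re-arm the counter for the next single-shot event */
        ioctl(counter_fd, PERF_COUNTER_IOC_ENABLE);
}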

> Another observation: i think perf_counter_output() needs to depend
> on whether the counter is signalling, not on the single-shot-ness of
> the counter.
>
> A completely valid use of this would be for user-space to create an
> mmap() buffer of 1024 events, then set the limit to 1024, and wait
> for the 1024 events to happen - process them and close the counter.
> Without any signalling.

Say we have a limit > 1 and a signal - would that mean we do not
generate event output?


2009-04-02 12:17:27

by Peter Zijlstra

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support

OK, so I just remembered the SIGIO thing mentioned by Paul a while back.

And this should do, I suppose...

---
Subject: perf_counter: SIGIO support
From: Peter Zijlstra <[email protected]>
Date: Thu Apr 02 14:15:54 CEST 2009


Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/perf_counter.h | 2 ++
kernel/perf_counter.c | 20 +++++++++++++++++++-
2 files changed, 21 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -238,6 +238,7 @@ enum perf_event_type {
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
+#include <linux/fs.h>
#include <asm/atomic.h>

struct task_struct;
@@ -397,6 +398,7 @@ struct perf_counter {

/* poll related */
wait_queue_head_t waitq;
+ struct fasync_struct *fasync;
/* optional: for NMIs */
struct perf_wakeup_entry wakeup;

Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1526,6 +1526,22 @@ out:
return ret;
}

+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct perf_counter *counter = filp->private_data;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &counter->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
@@ -1533,6 +1549,7 @@ static const struct file_operations perf
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_ioctl,
.mmap = perf_mmap,
+ .fasync = perf_fasync,
};

/*
@@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_cou
rcu_read_lock();
data = rcu_dereference(counter->data);
if (data) {
- (void)atomic_xchg(&data->wakeup, POLL_IN);
+ atomic_set(&data->wakeup, POLL_IN);
/*
* Ensure all data writes are issued before updating the
* user-space data head information. The matching rmb()
@@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
rcu_read_unlock();

wake_up_all(&counter->waitq);
+ kill_fasync(&counter->fasync, SIGIO, POLL_IN);
}

/*

2009-04-02 12:26:59

by Ingo Molnar

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support


* Peter Zijlstra <[email protected]> wrote:

> On Thu, 2009-04-02 at 12:51 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> > > By request, provide a way for counters to disable themselves and
> > > signal at the first counter overflow.
> > >
> > > This isn't complete, we really want pending work to be done ASAP
> > > after queueing it. My preferred method would be a self-IPI, that
> > > would ensure we run the code in a usable context right after the
> > > current (IRQ-off, NMI) context is done.
> >
> > Hm. I do think self-IPIs can be fragile but the more work we do
> > in NMI context the more compelling of a case can be made for a
> > self-IPI. So no big arguments against that.
>
> Its not only NMI, but also things like software events in the
> scheduler under rq->lock, or hrtimers in irq context. You cannot
> do a wakeup from under rq->lock, nor hrtimer_cancel() from within
> the timer handler.
>
> All these nasty little issues stack up and could be solved with a
> self-IPI.
>
> Then there is the software task-time clock which uses
> p->se.sum_exec_runtime which requires the rq->lock to be read.
> Coupling this with for example an NMI overflow handler gives an
> instant deadlock.

Ok, convinced.

> Would you terribly mind if I remove all that sum_exec_runtime and
> rq->lock stuff and simply use cpu_clock() to keep count. These
> things get context switched along with tasks anyway.

Sure. One sidenote - the precision of sw clocks has dropped a bit
lately:

aldebaran:~/linux/linux/Documentation/perf_counter> ./perfstat -e
1:0 -e 1:0 -e 1:0 -e 1:0 -e 1:0 sleep 1

Performance counter stats for 'sleep':

0.762664 cpu clock ticks (msecs)
0.761440 cpu clock ticks (msecs)
0.760977 cpu clock ticks (msecs)
0.760587 cpu clock ticks (msecs)
0.760287 cpu clock ticks (msecs)

Wall-clock time elapsed: 1003.139373 msecs

See that slight but noticeable skew? This used to work fine and we
had the exact same value everywhere. Can we fix that while still
keeping the code nice?

> Except I probably should look into this pid-namespace mess and
> clean all that up.

yeah. Hopefully it's all just a matter of adding or removing a 'v'
somewhere. Gets a bit more complicated with system-wide counters
though.

> > - 'event limit' attribute: the ability to pause new events after N
> > events. This limit auto-decrements on each event.
> > limit==1 is the special case for single-shot.
>
> That should go along with a toggle on what an event is I suppose,
> either an 'output' event or a filled page?
>
> Or do we want to limit that to counter overflow?

I think the proper form to rate-limit events and do buffering,
without losing events, is to have an attribute that sets a
buffer-full event threshold in bytes. That works well with
variable-sized records. That threshold would normally be set to a multiple of
PAGE_SIZE - with a sensible default of half the mmap area or so?
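(For example, with an 8-page data area on 4K pages that default would
mean a wakeup roughly every 16 KiB of record data.)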

Right?

> > - new ioctl method to refill the limit, when user-space is ready to
> > receive new events. A special-case of this is when a signal
> > handler calls ioctl(refill_limit, 1) in the single-shot case -
> > this re-enables events after the signal has been handled.
>
> Right, with the method implemented above, its simply a matter of
> the enable ioctl.

ok.

> > Another observation: i think perf_counter_output() needs to
> > depend on whether the counter is signalling, not on the
> > single-shot-ness of the counter.
> >
> > A completely valid use of this would be for user-space to create
> > an mmap() buffer of 1024 events, then set the limit to 1024, and
> > wait for the 1024 events to happen - process them and close the
> > counter. Without any signalling.
>
> Say we have a limit > 1, and a signal, that would mean we do not
> generate event output?

I think we should have two independent limits that both may generate
wakeups.

We have a stream of events filling in records in a buffer area. That
is a given and we have no real influence over them happening (in a
loss free model).

There are two further, independent properties here that it makes
sense to manage:

1) what happens on the events themselves

2) the buffer space gets squeezed

Here we have buffering and hence discretion over what happens, how
frequently we wake up and what we do on each individual event.

For the #2 buffer space case, in view of variable-sized records, the
best metric is bytes i think. The best default is 'half of the mmap
area'. This should influence the wakeup behavior IMO: we only wake
up if buffer space gets tight. (User-space can time out its poll()
call and thus get a timely recording of even smaller-than-threshold
events.)
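
A minimal sketch of that reader model - block in poll() with a timeout,
then consume everything up to data_head; struct and field names follow
the perf_counter mmap layout of this series, and the barrier and record
decoding are only hinted at in comments:

#include <poll.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/perf_counter.h>

#define DATA_PAGES      8       /* assumption: 2^n data pages + 1 header page */

static void read_loop(int counter_fd)
{
        long page_size = sysconf(_SC_PAGESIZE);
        void *base = mmap(NULL, (DATA_PAGES + 1) * page_size,
                          PROT_READ, MAP_SHARED, counter_fd, 0);
        struct perf_counter_mmap_page *header = base;
        unsigned long tail = 0;

        for (;;) {
                struct pollfd pfd = { .fd = counter_fd, .events = POLLIN };
                unsigned long head;

                /* time out so small batches still get picked up promptly */
                poll(&pfd, 1, 100 /* ms */);

                head = header->data_head;
                /* read barrier needed here, pairing with the kernel's wmb() */

                while (tail != head) {
                        /* ... decode one record from the data pages ... */
                        tail = head;    /* placeholder: consume up to head */
                }
        }
}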

For the #1 'what happens on events' case, the default is that nothing
happens. If the signal number is set, we send a signal
- but the buffer space management itself remains independent and we
may or may not wake up, depending on the 'bytes left' metric.

I think the 'trigger limit' threshold is a third independent
attribute which actively throttles output [be that a signal, output
into the buffer space, or both] - if despite the wakeup (or us
sending a signal) nothing happened and we've got too much overlap.

The most common special case for the trigger limit would be in
signal generation mode, with a value of 1. This means the counter
turns off after each signal.

Remember the 'lost events' value patch in the header mmap area? This
would be useful here: if the kernel has to throttle due to hitting
the limit, it would set the overflow counter?

If this gets needlessly complex/weird in the code itself then i made
a thinko somewhere and we need to reconsider. :-)

Ingo

2009-04-02 18:10:54

by Ingo Molnar

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support


* Peter Zijlstra <[email protected]> wrote:

> OK, so I just remembered the sigio thing mentioned by Paul a while
> back.
>
> and this should do I suppose...

> @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
> rcu_read_unlock();
>
> wake_up_all(&counter->waitq);
> + kill_fasync(&counter->fasync, SIGIO, POLL_IN);

ah, yes.

Do we even need the explicit signo method this way? The SIGIO target
is configurable via fcntl(F_SETSIG), right?

Ingo

2009-04-02 18:33:36

by Peter Zijlstra

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support

On Thu, 2009-04-02 at 20:10 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <[email protected]> wrote:
>
> > OK, so I just remembered the sigio thing mentioned by Paul a while
> > back.
> >
> > and this should do I suppose...
>
> > @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_cou
> > rcu_read_unlock();
> >
> > wake_up_all(&counter->waitq);
> > + kill_fasync(&counter->fasync, SIGIO, POLL_IN);
>
> ah, yes.
>
> Do we even need the explicit signo method this way? The SIGIO target
> is configurable via fcntl(F_SETSIG), right?

F_SETSIG allows you to change the signal that is delivered (SIGIO by
default); F_SETOWN allows you to specify a target.
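
Concretely, with the fasync patch above that is the usual fcntl() dance
on the counter fd (F_SETSIG is Linux-specific and needs _GNU_SOURCE);
a small sketch:

#define _GNU_SOURCE             /* for F_SETSIG */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void setup_async(int counter_fd, int signo, void (*handler)(int))
{
        signal(signo, handler);

        fcntl(counter_fd, F_SETOWN, getpid());  /* who gets the signal */
        fcntl(counter_fd, F_SETSIG, signo);     /* which signal (SIGIO by default) */
        fcntl(counter_fd, F_SETFL,
              fcntl(counter_fd, F_GETFL) | O_ASYNC); /* turn on ->fasync notification */
}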


2009-04-02 21:24:05

by Paul Mackerras

Subject: Re: [PATCH 2/6] RFC perf_counter: singleshot support

Ingo Molnar writes:

> Sure. One sidenote - the precision of sw clocks has dropped a bit
> lately:
>
> aldebaran:~/linux/linux/Documentation/perf_counter> ./perfstat -e
> 1:0 -e 1:0 -e 1:0 -e 1:0 -e 1:0 sleep 1
>
> Performance counter stats for 'sleep':
>
> 0.762664 cpu clock ticks (msecs)
> 0.761440 cpu clock ticks (msecs)
> 0.760977 cpu clock ticks (msecs)
> 0.760587 cpu clock ticks (msecs)
> 0.760287 cpu clock ticks (msecs)
>
> Wall-clock time elapsed: 1003.139373 msecs
>
> See that slight but noticeable skew? This used to work fine and we
> had the exact same value everywhere. Can we fix that while still
> keeping the code nice?

I suggest basing the software clock on counter->ctx->time_now, and
making get_context_time() use cpu_clock() always. That way we will only
call cpu_clock() once even if we have multiple cpu clock counters, and
that will eliminate the skew as well as being more efficient.

Paul.