v2:
- Patch 1: Some cosmetic updates to address Peter's feedback
- Patch 2: Add an unlikely() hint for the test in __perf_output_begin() to
minimize the impact in normal cases.
- Patch 2: Factor out handling of wakeup_events into a separate helper.
This series has also been tested with the perf_event_tests testsuite to
ensure there are no regressions.
- Naveen
---
v1:
https://www.mail-archive.com/[email protected]/msg1424933.html
Currently, there is no way to ask for signals to be delivered when a
certain number of sideband events have been logged into the ring buffer.
This is problematic if we are only interested in, say, context switch
events. Furthermore, signals are more useful than polling for
self-profiling. This series provides a way to achieve this.
We ride on top of the existing support for ring buffer wakeup to
generate signals as desired. Counting sideband events still requires
some changes in the output path, but in normal cases, it ends up being
just a comparison.
The test program below demonstrates how a process can profile itself for
context switch events and how it can control notification through
signals. The key changes include the perf_event_attr settings below, as
well as the use of IOC_ENABLE:
pe.signal_on_wakeup = 1;
pe.count_sb_events = 1;
pe.wakeup_events = 2;
To keep things simple, PERF_EVENT_IOC_REFRESH cannot be used if any of
the new attributes are set.
- Naveen
---
Here is a sample program demonstrating this:
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
		int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, hw_event, pid, cpu,
		       group_fd, flags);
}

static void sigio_handler(int n, siginfo_t *info, void *uc)
{
	fprintf(stderr, "Caught %s\n", info->si_code == POLL_HUP ? "POLL_HUP" :
			(info->si_code == POLL_IN ? "POLL_IN" : "other signal"));
}

int main(int argc, char **argv)
{
	struct perf_event_attr pe;
	struct sigaction act;
	int fd;
	void *buf;

	/* Install a handler to report the signals we receive */
	memset(&act, 0, sizeof(act));
	act.sa_sigaction = sigio_handler;
	act.sa_flags = SA_SIGINFO;
	sigaction(SIGIO, &act, 0);

	/* A dummy software event, capturing context switches only */
	memset(&pe, 0, sizeof(struct perf_event_attr));
	pe.size = sizeof(struct perf_event_attr);
	pe.type = PERF_TYPE_SOFTWARE;
	pe.config = PERF_COUNT_SW_DUMMY;
	pe.disabled = 1;
	pe.sample_period = 1;
	pe.context_switch = 1;

	/* Signal on ring buffer wakeup, every 2 sideband events */
	pe.signal_on_wakeup = 1;
	pe.count_sb_events = 1;
	pe.wakeup_events = 2;

	fd = perf_event_open(&pe, 0, -1, -1, 0);
	if (fd == -1) {
		fprintf(stderr, "Error opening leader %lx\n",
			(unsigned long)pe.config);
		exit(EXIT_FAILURE);
	}

	/* Ring buffer (1 control page + 1 data page) backing the wakeups */
	buf = mmap(NULL, sysconf(_SC_PAGESIZE) * 2, PROT_READ|PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		fprintf(stderr, "Can't mmap buffer\n");
		return -1;
	}

	/* Request SIGIO to be delivered to us on ring buffer wakeup */
	if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC) == -1)
		return -2;
	if (fcntl(fd, F_SETSIG, SIGIO) == -1)
		return -3;
	if (fcntl(fd, F_SETOWN, getpid()) == -1)
		return -4;

	if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1)
		return -5;

	fprintf(stderr, "Sleep 1\n");
	sleep(1);
	fprintf(stderr, "Sleep 2\n");
	sleep(1);
	fprintf(stderr, "Sleep 3\n");
	sleep(1);

	/* Disable the event counter */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 1);

	close(fd);

	return 0;
}
A sample output (each sleep() involves switching out and back in, i.e.
two context switch events, so with wakeup_events = 2 we see one POLL_IN
per sleep):
$ time ./cs
Sleep 1
Caught POLL_IN
Sleep 2
Caught POLL_IN
Sleep 3
Caught POLL_IN
real 0m3.040s
user 0m0.001s
sys 0m0.003s
Naveen N. Rao (2):
kernel/events: Add option to notify through signals on wakeup
kernel/events: Add option to enable counting sideband events in
wakeup_events
include/uapi/linux/perf_event.h | 4 +++-
kernel/events/core.c | 39 ++++++++++++++++++---------------------
kernel/events/internal.h | 1 +
kernel/events/ring_buffer.c | 21 +++++++++++++++++++++
4 files changed, 43 insertions(+), 22 deletions(-)
--
2.13.3
Add a new option 'signal_on_wakeup' to request for a signal to be
delivered on ring buffer wakeup controlled through watermark and
{wakeup_events, wakeup_watermark}. HUP is signaled on exit.
Setting signal_on_wakeup disables use of IOC_REFRESH to control signal
delivery, instead relying on IOC_ENABLE/DISABLE.
Signed-off-by: Naveen N. Rao <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 22 ++++++++++++++--------
kernel/events/ring_buffer.c | 3 +++
3 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..e5810b1d74a4 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -345,7 +345,8 @@ struct perf_event_attr {
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
namespaces : 1, /* include namespaces data */
- __reserved_1 : 35;
+ signal_on_wakeup : 1, /* send signal on wakeup */
+ __reserved_1 : 34;
union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..4fe708a4fdee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2680,9 +2680,11 @@ EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
- * not supported on inherited events
+ * not supported on inherited events or if user has requested for
+ * signals on ring buffer wakeup.
*/
- if (event->attr.inherit || !is_sampling_event(event))
+ if (event->attr.inherit || event->attr.signal_on_wakeup ||
+ !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
@@ -7341,7 +7343,6 @@ static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data,
struct pt_regs *regs)
{
- int events = atomic_read(&event->event_limit);
int ret = 0;
/*
@@ -7358,12 +7359,15 @@ static int __perf_event_overflow(struct perf_event *event,
* events
*/
- event->pending_kill = POLL_IN;
- if (events && atomic_dec_and_test(&event->event_limit)) {
- ret = 1;
- event->pending_kill = POLL_HUP;
+ if (!event->attr.signal_on_wakeup) {
+ int events = atomic_read(&event->event_limit);
+ event->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&event->event_limit)) {
+ ret = 1;
+ event->pending_kill = POLL_HUP;
- perf_event_disable_inatomic(event);
+ perf_event_disable_inatomic(event);
+ }
}
READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -10427,6 +10431,8 @@ perf_event_exit_event(struct perf_event *child_event,
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ if (child_event->attr.signal_on_wakeup)
+ child_event->pending_kill = POLL_HUP;
raw_spin_unlock_irq(&child_ctx->lock);
/*
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..e7a558cfcadb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -21,6 +21,9 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLLIN);
+ if (handle->event->attr.signal_on_wakeup)
+ handle->event->pending_kill = POLL_IN;
+
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
}
--
2.13.3
Many sideband events are interesting by themselves. When profiling only
for sideband events, it is useful to be able to control process wakeup
(wakeup_events) based on sideband events alone. Add a new option
'count_sb_events' to enable this.
IOC_REFRESH won't be supported with this, so disable it.
Signed-off-by: Naveen N. Rao <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 23 +++++++----------------
kernel/events/internal.h | 1 +
kernel/events/ring_buffer.c | 18 ++++++++++++++++++
4 files changed, 28 insertions(+), 17 deletions(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5810b1d74a4..ab4dc9a02151 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -346,7 +346,8 @@ struct perf_event_attr {
write_backward : 1, /* Write ring buffer from end to beginning */
namespaces : 1, /* include namespaces data */
signal_on_wakeup : 1, /* send signal on wakeup */
- __reserved_1 : 34;
+ count_sb_events : 1, /* wakeup_events also counts sideband events */
+ __reserved_1 : 33;
union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4fe708a4fdee..118a100108b1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2680,11 +2680,13 @@ EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
- * not supported on inherited events or if user has requested for
- * signals on ring buffer wakeup.
+ * not supported on:
+ * - inherited events
+ * - if user has requested for signals on ring buffer wakeup, or
+ * - if counting sideband events
*/
if (event->attr.inherit || event->attr.signal_on_wakeup ||
- !is_sampling_event(event))
+ event->attr.count_sb_events || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
@@ -5974,19 +5976,8 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
- if (!event->attr.watermark) {
- int wakeup_events = event->attr.wakeup_events;
-
- if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
- int events = local_inc_return(&rb->events);
-
- if (events >= wakeup_events) {
- local_sub(wakeup_events, &rb->events);
- local_inc(&rb->wakeup);
- }
- }
- }
+ if (!event->attr.count_sb_events)
+ rb_handle_wakeup_events(event, handle->rb);
}
void perf_prepare_sample(struct perf_event_header *header,
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..b75137ebe9f7 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -81,6 +81,7 @@ extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
extern void rb_free_aux(struct ring_buffer *rb);
extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
extern void ring_buffer_put(struct ring_buffer *rb);
+extern void rb_handle_wakeup_events(struct perf_event *event, struct ring_buffer *rb);
static inline bool rb_has_aux(struct ring_buffer *rb)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e7a558cfcadb..a34f5c2e7ed1 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -116,6 +116,21 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,
return CIRC_SPACE(tail, head, data_size) >= size;
}
+void __always_inline
+rb_handle_wakeup_events(struct perf_event *event, struct ring_buffer *rb)
+{
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (!event->attr.watermark && wakeup_events) {
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+}
+
static int __always_inline
__perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
@@ -197,6 +212,9 @@ __perf_output_begin(struct perf_output_handle *handle,
* none of the data stores below can be lifted up by the compiler.
*/
+ if (unlikely(event->attr.count_sb_events))
+ rb_handle_wakeup_events(event, rb);
+
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
--
2.13.3
On Tue, 1 Aug 2017, Naveen N. Rao wrote:
> Add a new option 'signal_on_wakeup' to request for a signal to be
> delivered on ring buffer wakeup controlled through watermark and
> {wakeup_events, wakeup_watermark}. HUP is signaled on exit.
>
> Setting signal_on_wakeup disables use of IOC_REFRESH to control signal
> delivery, instead relying on IOC_ENABLE/DISABLE.
so I probably missed the original thread on this new interface, but why is
IOC_REFRESH not being used?
For new interfaces like this it's also nice to have some text that can be
added to the perf_event_open() manpage, especially if there's weird
conditions like this.
Vince
Hi Vince,
Thanks for taking a look.
On 2017/08/03 01:57PM, Vince Weaver wrote:
> On Tue, 1 Aug 2017, Naveen N. Rao wrote:
>
> > Add a new option 'signal_on_wakeup' to request for a signal to be
> > delivered on ring buffer wakeup controlled through watermark and
> > {wakeup_events, wakeup_watermark}. HUP is signaled on exit.
> >
> > Setting signal_on_wakeup disables use of IOC_REFRESH to control signal
> > delivery, instead relying on IOC_ENABLE/DISABLE.
>
> so I probably missed the original thread on this new interface, but why is
> IOC_REFRESH not being used?
IOC_REFRESH is used to control the number of overflows before disabling
the event. It works outside of perf_event_attr in the sense that it
enables POLL_IN on each overflow, and the user specifies, as part of the
ioctl, the number of overflows after which the event is disabled (at
which point HUP is signaled).
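For reference, a minimal sketch of that model from the handler's side
(the refresh count of 1 is illustrative; perf_fd is assumed to be an
event fd already set up for O_ASYNC/SIGIO delivery, as in the test
program from the cover letter):

#include <signal.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int perf_fd;	/* perf event fd, configured for O_ASYNC/SIGIO */

static void refresh_handler(int n, siginfo_t *info, void *uc)
{
	/*
	 * With IOC_REFRESH, each overflow raises POLL_IN and decrements
	 * the refresh count; when it reaches zero, the kernel disables
	 * the event and raises POLL_HUP. To keep going, the handler has
	 * to re-arm, here for one more overflow:
	 */
	if (info->si_code == POLL_HUP)
		ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
}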
However, signal_on_wakeup is designed to work with the values in the
perf_event_attr structure itself. wakeup_events controls the number of
events after which to signal POLL_IN. signal_on_wakeup itself needs to
be specified in the perf_event_attr. As such, I felt it is better to
have all control through perf_event_attr.
But, if you think having IOC_REFRESH available in this scenario is
useful, we can revisit this. Ideally, we would have separate ioctls to
control signal delivery independently of perf_event_attr, but I am not sure
how useful that would be.
>
> For new interfaces like this, it's also nice to have some text that can be
> added to the perf_event_open() manpage, especially if there are weird
> conditions like this.
Sure -- I will send an update to the manpage once this series gets
accepted.
Thanks,
Naveen
On Tue, Aug 01, 2017 at 08:14:03PM +0530, Naveen N. Rao wrote:
> Add a new option 'signal_on_wakeup' to request for a signal to be
> delivered on ring buffer wakeup controlled through watermark and
> {wakeup_events, wakeup_watermark}. HUP is signaled on exit.
This fails to tell us why you'd want this. What is wrong with the
existing POLL_IN notification?
On Tue, Aug 01, 2017 at 08:14:04PM +0530, Naveen N. Rao wrote:
> @@ -5974,19 +5976,8 @@ void perf_output_sample(struct perf_output_handle *handle,
> }
> }
>
> + if (!event->attr.count_sb_events)
> + rb_handle_wakeup_events(event, handle->rb);
> }
> +void __always_inline
> +rb_handle_wakeup_events(struct perf_event *event, struct ring_buffer *rb)
> +{
> + int wakeup_events = event->attr.wakeup_events;
> +
> + if (!event->attr.watermark && wakeup_events) {
> + int events = local_inc_return(&rb->events);
> +
> + if (events >= wakeup_events) {
> + local_sub(wakeup_events, &rb->events);
> + local_inc(&rb->wakeup);
> + }
> + }
> +}
> +
> static int __always_inline
> __perf_output_begin(struct perf_output_handle *handle,
> struct perf_event *event, unsigned int size,
> @@ -197,6 +212,9 @@ __perf_output_begin(struct perf_output_handle *handle,
> * none of the data stores below can be lifted up by the compiler.
> */
>
> + if (unlikely(event->attr.count_sb_events))
> + rb_handle_wakeup_events(event, rb);
> +
> if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
> local_add(rb->watermark, &rb->wakeup);
>
I'm still slightly uneasy over this... Yes, most of our events are
samples, so we'd pay the overhead already. But could you still look at
the performance of this? See for example this commit:
9ecda41acb97 ("perf/core: Add ::write_backward attribute to perf event")
we went through a lot of variants to not hurt performance.
Hi Peter,
On 2017/08/04 12:20PM, Peter Zijlstra wrote:
> On Tue, Aug 01, 2017 at 08:14:03PM +0530, Naveen N. Rao wrote:
> > Add a new option 'signal_on_wakeup' to request for a signal to be
> > delivered on ring buffer wakeup controlled through watermark and
> > {wakeup_events, wakeup_watermark}. HUP is signaled on exit.
>
> This fails to tell us why you'd want this. What is wrong with the
> existing POLL_IN notification?
Yes, sorry - I should have included some of the explanation from the
cover letter as part of this patch description.
The primary use-case is self-profiling. If a process wants to profile
itself and wishes to receive notifications via signals, it has no
control today over when those notifications arrive: a POLL_IN is
generated on each overflow. We may just want to be notified every n
events, rather than on each event.
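With signal_on_wakeup, that maps directly onto the existing attr fields.
A minimal sketch of the relevant setup (requires the patched headers;
field values are illustrative, mirroring the test program from the cover
letter):

#include <string.h>
#include <linux/perf_event.h>

static void setup_attr(struct perf_event_attr *pe)
{
	memset(pe, 0, sizeof(*pe));
	pe->size = sizeof(*pe);
	pe->type = PERF_TYPE_SOFTWARE;
	pe->config = PERF_COUNT_SW_DUMMY;
	pe->sample_period = 1;
	pe->signal_on_wakeup = 1;	/* signal on ring buffer wakeup... */
	pe->wakeup_events = 8;		/* ...every 8 events, not each overflow */
}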
Furthermore, some sideband events are useful by themselves, and it is
desirable to be notified of those events. This again is not possible
today with IOC_ENABLE/IOC_REFRESH (addressed by Patch 2/2 in this
series).
Regards,
Naveen
On 2017/08/04 12:59PM, Peter Zijlstra wrote:
> On Tue, Aug 01, 2017 at 08:14:04PM +0530, Naveen N. Rao wrote:
> > @@ -5974,19 +5976,8 @@ void perf_output_sample(struct perf_output_handle *handle,
> > }
> > }
> >
> > + if (!event->attr.count_sb_events)
> > + rb_handle_wakeup_events(event, handle->rb);
> > }
>
> > +void __always_inline
> > +rb_handle_wakeup_events(struct perf_event *event, struct ring_buffer *rb)
> > +{
> > + int wakeup_events = event->attr.wakeup_events;
> > +
> > + if (!event->attr.watermark && wakeup_events) {
> > + int events = local_inc_return(&rb->events);
> > +
> > + if (events >= wakeup_events) {
> > + local_sub(wakeup_events, &rb->events);
> > + local_inc(&rb->wakeup);
> > + }
> > + }
> > +}
> > +
> > static int __always_inline
> > __perf_output_begin(struct perf_output_handle *handle,
> > struct perf_event *event, unsigned int size,
> > @@ -197,6 +212,9 @@ __perf_output_begin(struct perf_output_handle *handle,
> > * none of the data stores below can be lifted up by the compiler.
> > */
> >
> > + if (unlikely(event->attr.count_sb_events))
> > + rb_handle_wakeup_events(event, rb);
> > +
> > if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
> > local_add(rb->watermark, &rb->wakeup);
> >
>
> I'm still slightly uneasy over this... Yes, most of our events are
> samples, so we'd pay the overhead already. But could you still look at
> the performance of this? See for example this commit:
>
> 9ecda41acb97 ("perf/core: Add ::write_backward attribute to perf event")
>
> we went through a lot of variants to not hurt performance.
Sure. I'll run the tests and get back.
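As a starting point, I'm thinking of something along the below lines
(just a sketch, not the exact methodology used for that commit; the
count_sb_events field requires the patched headers, and the loop/period
sizes are arbitrary):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/*
 * Time a fixed busy loop while a high-frequency software event streams
 * samples into the ring buffer. Running with and without an argument
 * toggles count_sb_events, so any cost added to the output path shows
 * up as a difference in run time.
 */
int main(int argc, char **argv)
{
	struct perf_event_attr pe;
	struct timespec t0, t1;
	volatile unsigned long sink = 0;
	unsigned long i;
	void *buf;
	int fd;

	memset(&pe, 0, sizeof(pe));
	pe.size = sizeof(pe);
	pe.type = PERF_TYPE_SOFTWARE;
	pe.config = PERF_COUNT_SW_TASK_CLOCK;
	pe.sample_period = 10000;	/* a sample every 10us of task time */
	pe.wakeup_events = 1000;
	pe.count_sb_events = (argc > 1);

	fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* 1 control page + 8 data pages */
	buf = mmap(NULL, sysconf(_SC_PAGESIZE) * 9, PROT_READ|PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < 100000000UL; i++)
		sink += i;
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%.3fs\n", (t1.tv_sec - t0.tv_sec) +
			  (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}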
Thanks,
Naveen