Currently, there is no way to ask for signals to be delivered when a
certain number of sideband events have been logged into the ring buffer.
This is problematic if we are only interested in, say, context switch
events. Furthermore, signals are more useful (rather than polling) for
self-profiling. This series provides for a way to achieve this.
We ride on top of the existing support for ring buffer wakeup to
generate signals as desired. Counting sideband events still requires
some changes in the output path, but in normal cases, it ends up being
just a comparison.
The test program below demonstrates how a process can profile itself for
context switch events and how it can control notification through
signals. The key changes include the below perf_event_attr settings as
well as use of IOC_ENABLE:
pe.signal_on_wakeup = 1;
pe.count_sb_events = 1;
pe.wakeup_events = 2;
To keep things simple, PERF_EVENT_IOC_REFRESH cannot be used if any of
the new attributes are set.
RFC v2:
https://www.mail-archive.com/[email protected]/msg1420363.html
Changes:
- Send HUP on perf_event_exit_event() (suggested by Jiri)
- Disable use of IOC_REFRESH if signal_on_wakeup/count_sb_events is
set.
- Naveen
---
Here is a sample program demonstrating the same:
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
return syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
}
static void sigio_handler(int n, siginfo_t *info, void *uc)
{
fprintf (stderr, "Caught %s\n", info->si_code == POLL_HUP ? "POLL_HUP" :
(info->si_code == POLL_IN ? "POLL_IN" : "other signal"));
}
int main(int argc, char **argv)
{
struct perf_event_attr pe;
struct sigaction act;
int fd;
void *buf;
memset(&act, 0, sizeof(act));
act.sa_sigaction = sigio_handler;
act.sa_flags = SA_SIGINFO;
sigaction(SIGIO, &act, 0);
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.size = sizeof(struct perf_event_attr);
pe.type = PERF_TYPE_SOFTWARE;
pe.config = PERF_COUNT_SW_DUMMY;
pe.disabled = 1;
pe.sample_period = 1;
pe.context_switch = 1;
pe.signal_on_wakeup = 1;
pe.count_sb_events = 1;
pe.wakeup_events = 2;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %lx\n", (unsigned long)pe.config);
exit(EXIT_FAILURE);
}
buf = mmap(NULL, sysconf(_SC_PAGESIZE) * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (buf == MAP_FAILED) {
fprintf(stderr, "Can't mmap buffer\n");
return -1;
}
if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC) == -1)
return -2;
if (fcntl(fd, F_SETSIG, SIGIO) == -1)
return -3;
if (fcntl(fd, F_SETOWN, getpid()) == -1)
return -4;
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1)
return -5;
fprintf (stderr, "Sleep 1\n");
sleep(1);
fprintf (stderr, "Sleep 2\n");
sleep(1);
fprintf (stderr, "Sleep 3\n");
sleep(1);
/* Disable the event counter */
ioctl(fd, PERF_EVENT_IOC_DISABLE, 1);
close(fd);
return 0;
}
A sample output:
$ time ./cs
Sleep 1
Caught POLL_IN
Sleep 2
Caught POLL_IN
Sleep 3
Caught POLL_IN
real 0m3.040s
user 0m0.001s
sys 0m0.003s
Naveen N. Rao (2):
kernel/events: Add option to notify through signals on wakeup
kernel/events: Add option to enable counting sideband events in
wakeup_events
include/uapi/linux/perf_event.h | 4 +++-
kernel/events/core.c | 20 ++++++++++++--------
kernel/events/ring_buffer.c | 16 ++++++++++++++++
3 files changed, 31 insertions(+), 9 deletions(-)
--
2.13.1
Add a new option 'signal_on_wakeup' to request for a signal to be
delivered on ring buffer wakeup controlled through watermark and
{wakeup_events, wakeup_watermark}.
Signed-off-by: Naveen N. Rao <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 18 +++++++++++-------
kernel/events/ring_buffer.c | 3 +++
3 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..e5810b1d74a4 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -345,7 +345,8 @@ struct perf_event_attr {
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
namespaces : 1, /* include namespaces data */
- __reserved_1 : 35;
+ signal_on_wakeup : 1, /* send signal on wakeup */
+ __reserved_1 : 34;
union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6c4e523dc1e2..812fcfc767f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2679,7 +2679,8 @@ static int _perf_event_refresh(struct perf_event *event, int refresh)
/*
* not supported on inherited events
*/
- if (event->attr.inherit || !is_sampling_event(event))
+ if (event->attr.inherit || event->attr.signal_on_wakeup ||
+ !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
@@ -7339,7 +7340,6 @@ static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data,
struct pt_regs *regs)
{
- int events = atomic_read(&event->event_limit);
int ret = 0;
/*
@@ -7362,12 +7362,15 @@ static int __perf_event_overflow(struct perf_event *event,
* events
*/
- event->pending_kill = POLL_IN;
- if (events && atomic_dec_and_test(&event->event_limit)) {
- ret = 1;
- event->pending_kill = POLL_HUP;
+ if (!event->attr.signal_on_wakeup) {
+ int events = atomic_read(&event->event_limit);
+ event->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&event->event_limit)) {
+ ret = 1;
+ event->pending_kill = POLL_HUP;
- perf_event_disable_inatomic(event);
+ perf_event_disable_inatomic(event);
+ }
}
READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -10408,6 +10411,7 @@ perf_event_exit_event(struct perf_event *child_event,
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ child_event->pending_kill = POLL_HUP;
raw_spin_unlock_irq(&child_ctx->lock);
/*
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 2831480c63a2..4e7c728569a8 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -21,6 +21,9 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, POLLIN);
+ if (handle->event->attr.signal_on_wakeup)
+ handle->event->pending_kill = POLL_IN;
+
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
}
--
2.13.1
Many sideband events are interesting by themselves. When profiling only
for sideband events, it is useful to be able to control process wakeup
(wakeup_events) based on sideband events alone. Add a new option
'count_sb_events' to do the same.
Signed-off-by: Naveen N. Rao <[email protected]>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 4 ++--
kernel/events/ring_buffer.c | 13 +++++++++++++
3 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5810b1d74a4..ab4dc9a02151 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -346,7 +346,8 @@ struct perf_event_attr {
write_backward : 1, /* Write ring buffer from end to beginning */
namespaces : 1, /* include namespaces data */
signal_on_wakeup : 1, /* send signal on wakeup */
- __reserved_1 : 34;
+ count_sb_events : 1, /* wakeup_events also counts sideband events */
+ __reserved_1 : 33;
union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 812fcfc767f4..9ff9c0e37727 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2680,7 +2680,7 @@ static int _perf_event_refresh(struct perf_event *event, int refresh)
* not supported on inherited events
*/
if (event->attr.inherit || event->attr.signal_on_wakeup ||
- !is_sampling_event(event))
+ event->attr.count_sb_events || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
@@ -5956,7 +5956,7 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
- if (!event->attr.watermark) {
+ if (!event->attr.count_sb_events && !event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4e7c728569a8..f43a6081141f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -197,6 +197,19 @@ __perf_output_begin(struct perf_output_handle *handle,
* none of the data stores below can be lifted up by the compiler.
*/
+ if (event->attr.count_sb_events && !event->attr.watermark) {
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (wakeup_events) {
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+ }
+
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
--
2.13.1
On 2017/06/19 08:01PM, Naveen N. Rao wrote:
> Currently, there is no way to ask for signals to be delivered when a
> certain number of sideband events have been logged into the ring buffer.
> This is problematic if we are only interested in, say, context switch
> events. Furthermore, signals are more useful (rather than polling) for
> self-profiling. This series provides for a way to achieve this.
>
> We ride on top of the existing support for ring buffer wakeup to
> generate signals as desired. Counting sideband events still requires
> some changes in the output path, but in normal cases, it ends up being
> just a comparison.
>
> for
> context switch events and how it can control notification through
> signals. The key changes include the below perf_event_attr settings as
> well as use of IOC_ENABLE:
> pe.signal_on_wakeup = 1;
> pe.count_sb_events = 1;
> pe.wakeup_events = 2;
>
> To keep things simple, PERF_EVENT_IOC_REFRESH cannot be used if any of
> the new attributes are set.
>
> RFC v2:
> https://www.mail-archive.com/[email protected]/msg1420363.html
> Changes:
> - Send HUP on perf_event_exit_event() (suggested by Jiri)
> - Disable use of IOC_REFRESH if signal_on_wakeup/count_sb_events is
> set.
Peter, Arnaldo, Jiri,
Can you please take a look at this patchset?
Thanks,
Naveen
>
>
> - Naveen
>
> ---
> Here is a sample program demonstrating the same:
>
> #define _GNU_SOURCE
>
> #include <stdlib.h>
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <string.h>
> #include <signal.h>
> #include <sys/ioctl.h>
> #include <sys/mman.h>
> #include <linux/perf_event.h>
> #include <asm/unistd.h>
>
> static long
> perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
> int cpu, int group_fd, unsigned long flags)
> {
> return syscall(__NR_perf_event_open, hw_event, pid, cpu,
> group_fd, flags);
> }
>
> static void sigio_handler(int n, siginfo_t *info, void *uc)
> {
> fprintf (stderr, "Caught %s\n", info->si_code == POLL_HUP ? "POLL_HUP" :
> (info->si_code == POLL_IN ? "POLL_IN" : "other signal"));
> }
>
> int main(int argc, char **argv)
> {
> struct perf_event_attr pe;
> struct sigaction act;
> int fd;
> void *buf;
>
> memset(&act, 0, sizeof(act));
> act.sa_sigaction = sigio_handler;
> act.sa_flags = SA_SIGINFO;
> sigaction(SIGIO, &act, 0);
>
> memset(&pe, 0, sizeof(struct perf_event_attr));
> pe.size = sizeof(struct perf_event_attr);
> pe.type = PERF_TYPE_SOFTWARE;
> pe.config = PERF_COUNT_SW_DUMMY;
> pe.disabled = 1;
> pe.sample_period = 1;
> pe.context_switch = 1;
> pe.signal_on_wakeup = 1;
> pe.count_sb_events = 1;
> pe.wakeup_events = 2;
>
> fd = perf_event_open(&pe, 0, -1, -1, 0);
> if (fd == -1) {
> fprintf(stderr, "Error opening leader %lx\n", (unsigned long)pe.config);
> exit(EXIT_FAILURE);
> }
>
> buf = mmap(NULL, sysconf(_SC_PAGESIZE) * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
> if (buf == MAP_FAILED) {
> fprintf(stderr, "Can't mmap buffer\n");
> return -1;
> }
>
> if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC) == -1)
> return -2;
>
> if (fcntl(fd, F_SETSIG, SIGIO) == -1)
> return -3;
>
> if (fcntl(fd, F_SETOWN, getpid()) == -1)
> return -4;
>
> if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1)
> return -5;
>
> fprintf (stderr, "Sleep 1\n");
> sleep(1);
> fprintf (stderr, "Sleep 2\n");
> sleep(1);
> fprintf (stderr, "Sleep 3\n");
> sleep(1);
>
> /* Disable the event counter */
> ioctl(fd, PERF_EVENT_IOC_DISABLE, 1);
>
> close(fd);
>
> return 0;
> }
>
>
> A sample output:
> $ time ./cs
> Sleep 1
> Caught POLL_IN
> Sleep 2
> Caught POLL_IN
> Sleep 3
> Caught POLL_IN
>
> real 0m3.040s
> user 0m0.001s
> sys 0m0.003s
>
>
> Naveen N. Rao (2):
> kernel/events: Add option to notify through signals on wakeup
> kernel/events: Add option to enable counting sideband events in
> wakeup_events
>
> include/uapi/linux/perf_event.h | 4 +++-
> kernel/events/core.c | 20 ++++++++++++--------
> kernel/events/ring_buffer.c | 16 ++++++++++++++++
> 3 files changed, 31 insertions(+), 9 deletions(-)
>
> --
> 2.13.1
>
On Mon, Jun 19, 2017 at 08:01:08PM +0530, Naveen N. Rao wrote:
SNIP
> - if (!event->attr.watermark) {
> + if (!event->attr.count_sb_events && !event->attr.watermark) {
> int wakeup_events = event->attr.wakeup_events;
>
> if (wakeup_events) {
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 4e7c728569a8..f43a6081141f 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -197,6 +197,19 @@ __perf_output_begin(struct perf_output_handle *handle,
> * none of the data stores below can be lifted up by the compiler.
> */
>
> + if (event->attr.count_sb_events && !event->attr.watermark) {
> + int wakeup_events = event->attr.wakeup_events;
> +
> + if (wakeup_events) {
> + int events = local_inc_return(&rb->events);
> +
> + if (events >= wakeup_events) {
> + local_sub(wakeup_events, &rb->events);
> + local_inc(&rb->wakeup);
> + }
> + }
> + }
hum, so there's the same wakeup code in perf_output_sample,
but it's not called for non-sample (sideband) events
it'd be nice to have this wakeup only once in __perf_output_begin,
but it'd change the behaviour for sideband events, which would start to
follow the wakeup_events logic.. and possibly disturb Vince's tests?
jirka
On Mon, Jun 19, 2017 at 08:01:06PM +0530, Naveen N. Rao wrote:
> Currently, there is no way to ask for signals to be delivered when a
> certain number of sideband events have been logged into the ring buffer.
> This is problematic if we are only interested in, say, context switch
> events. Furthermore, signals are more useful (rather than polling) for
> self-profiling. This series provides for a way to achieve this.
>
> We ride on top of the existing support for ring buffer wakeup to
> generate signals as desired. Counting sideband events still requires
> some changes in the output path, but in normal cases, it ends up being
> just a comparison.
>
> The test program below demonstrates how a process can profile itself for
> context switch events and how it can control notification through
> signals. The key changes include the below perf_event_attr settings as
> well as use of IOC_ENABLE:
> pe.signal_on_wakeup = 1;
> pe.count_sb_events = 1;
> pe.wakeup_events = 2;
Vince,
could you please check on this? thanks
Naveen,
have you run Vince's test suite on this?
http://github.com/deater/perf_event_tests.git
jirka
>
> To keep things simple, PERF_EVENT_IOC_REFRESH cannot be used if any of
> the new attributes are set.
>
> RFC v2:
> https://www.mail-archive.com/[email protected]/msg1420363.html
> Changes:
> - Send HUP on perf_event_exit_event() (suggested by Jiri)
> - Disable use of IOC_REFRESH if signal_on_wakeup/count_sb_events is
> set.
>
>
> - Naveen
>
> ---
> Here is a sample program demonstrating the same:
>
> #define _GNU_SOURCE
>
> #include <stdlib.h>
> #include <stdio.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <string.h>
> #include <signal.h>
> #include <sys/ioctl.h>
> #include <sys/mman.h>
> #include <linux/perf_event.h>
> #include <asm/unistd.h>
>
> static long
> perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
> int cpu, int group_fd, unsigned long flags)
> {
> return syscall(__NR_perf_event_open, hw_event, pid, cpu,
> group_fd, flags);
> }
>
> static void sigio_handler(int n, siginfo_t *info, void *uc)
> {
> fprintf (stderr, "Caught %s\n", info->si_code == POLL_HUP ? "POLL_HUP" :
> (info->si_code == POLL_IN ? "POLL_IN" : "other signal"));
> }
>
> int main(int argc, char **argv)
> {
> struct perf_event_attr pe;
> struct sigaction act;
> int fd;
> void *buf;
>
> memset(&act, 0, sizeof(act));
> act.sa_sigaction = sigio_handler;
> act.sa_flags = SA_SIGINFO;
> sigaction(SIGIO, &act, 0);
>
> memset(&pe, 0, sizeof(struct perf_event_attr));
> pe.size = sizeof(struct perf_event_attr);
> pe.type = PERF_TYPE_SOFTWARE;
> pe.config = PERF_COUNT_SW_DUMMY;
> pe.disabled = 1;
> pe.sample_period = 1;
> pe.context_switch = 1;
> pe.signal_on_wakeup = 1;
> pe.count_sb_events = 1;
> pe.wakeup_events = 2;
>
> fd = perf_event_open(&pe, 0, -1, -1, 0);
> if (fd == -1) {
> fprintf(stderr, "Error opening leader %lx\n", (unsigned long)pe.config);
> exit(EXIT_FAILURE);
> }
>
> buf = mmap(NULL, sysconf(_SC_PAGESIZE) * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
> if (buf == MAP_FAILED) {
> fprintf(stderr, "Can't mmap buffer\n");
> return -1;
> }
>
> if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_ASYNC) == -1)
> return -2;
>
> if (fcntl(fd, F_SETSIG, SIGIO) == -1)
> return -3;
>
> if (fcntl(fd, F_SETOWN, getpid()) == -1)
> return -4;
>
> if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1)
> return -5;
>
> fprintf (stderr, "Sleep 1\n");
> sleep(1);
> fprintf (stderr, "Sleep 2\n");
> sleep(1);
> fprintf (stderr, "Sleep 3\n");
> sleep(1);
>
> /* Disable the event counter */
> ioctl(fd, PERF_EVENT_IOC_DISABLE, 1);
>
> close(fd);
>
> return 0;
> }
>
>
> A sample output:
> $ time ./cs
> Sleep 1
> Caught POLL_IN
> Sleep 2
> Caught POLL_IN
> Sleep 3
> Caught POLL_IN
>
> real 0m3.040s
> user 0m0.001s
> sys 0m0.003s
>
>
> Naveen N. Rao (2):
> kernel/events: Add option to notify through signals on wakeup
> kernel/events: Add option to enable counting sideband events in
> wakeup_events
>
> include/uapi/linux/perf_event.h | 4 +++-
> kernel/events/core.c | 20 ++++++++++++--------
> kernel/events/ring_buffer.c | 16 ++++++++++++++++
> 3 files changed, 31 insertions(+), 9 deletions(-)
>
> --
> 2.13.1
>
On Mon, Jun 19, 2017 at 08:01:07PM +0530, Naveen N. Rao wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 6c4e523dc1e2..812fcfc767f4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2679,7 +2679,8 @@ static int _perf_event_refresh(struct perf_event *event, int refresh)
> /*
> * not supported on inherited events
That comment wants updating to explain the signal crud..
> */
> - if (event->attr.inherit || !is_sampling_event(event))
> + if (event->attr.inherit || event->attr.signal_on_wakeup ||
> + !is_sampling_event(event))
> return -EINVAL;
>
> atomic_add(refresh, &event->event_limit);
> @@ -7362,12 +7362,15 @@ static int __perf_event_overflow(struct perf_event *event,
> * events
> */
>
> + if (!event->attr.signal_on_wakeup) {
> + int events = atomic_read(&event->event_limit);
> + event->pending_kill = POLL_IN;
> + if (events && atomic_dec_and_test(&event->event_limit)) {
> + ret = 1;
> + event->pending_kill = POLL_HUP;
>
> + perf_event_disable_inatomic(event);
> + }
> }
So even without event_limit (IOC_REFRESH) this would've generated
SIGIO:POLL_IN.
>
> READ_ONCE(event->overflow_handler)(event, data, regs);
> @@ -10408,6 +10411,7 @@ perf_event_exit_event(struct perf_event *child_event,
> perf_group_detach(child_event);
> list_del_event(child_event, child_ctx);
> child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
> + child_event->pending_kill = POLL_HUP;
This looks like an undocumented change..
> raw_spin_unlock_irq(&child_ctx->lock);
>
> /*
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 2831480c63a2..4e7c728569a8 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -21,6 +21,9 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
> {
> atomic_set(&handle->rb->poll, POLLIN);
>
> + if (handle->event->attr.signal_on_wakeup)
> + handle->event->pending_kill = POLL_IN;
> +
And this is the bit that generates SIGIO:POLL_IN on wakeup.
> handle->event->pending_wakeup = 1;
> irq_work_queue(&handle->event->pending);
> }
> --
> 2.13.1
>
On Mon, Jun 19, 2017 at 08:01:08PM +0530, Naveen N. Rao wrote:
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 4e7c728569a8..f43a6081141f 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -197,6 +197,19 @@ __perf_output_begin(struct perf_output_handle *handle,
> * none of the data stores below can be lifted up by the compiler.
> */
>
> + if (event->attr.count_sb_events && !event->attr.watermark) {
> + int wakeup_events = event->attr.wakeup_events;
> +
> + if (wakeup_events) {
> + int events = local_inc_return(&rb->events);
> +
> + if (events >= wakeup_events) {
> + local_sub(wakeup_events, &rb->events);
> + local_inc(&rb->wakeup);
> + }
> + }
> + }
> +
> if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
> local_add(rb->watermark, &rb->wakeup);
>
So this is a very performance sensitive function; not at all happy to
add bits here ... :/
[ Adding Vince...]
On 2017/07/12 12:48PM, Jiri Olsa wrote:
> On Mon, Jun 19, 2017 at 08:01:06PM +0530, Naveen N. Rao wrote:
> > Currently, there is no way to ask for signals to be delivered when a
> > certain number of sideband events have been logged into the ring buffer.
> > This is problematic if we are only interested in, say, context switch
> > events. Furthermore, signals are more useful (rather than polling) for
> > self-profiling. This series provides for a way to achieve this.
> >
> > We ride on top of the existing support for ring buffer wakeup to
> > generate signals as desired. Counting sideband events still requires
> > some changes in the output path, but in normal cases, it ends up being
> > just a comparison.
> >
> > The test program below demonstrates how a process can profile itself for
> > context switch events and how it can control notification through
> > signals. The key changes include the below perf_event_attr settings as
> > well as use of IOC_ENABLE:
> > pe.signal_on_wakeup = 1;
> > pe.count_sb_events = 1;
> > pe.wakeup_events = 2;
>
> Vince,
> could you please check on this? thanks
>
> Naveen,
> have you run Vince's test suite on this?
> http://github.com/deater/perf_event_tests.git
I just tried this and I see quite a few failures even without these
patches.
The behavior is similar with/without these patches and all the ioctl
tests pass, but I see some failures with the overflow tests. I'll look
into those tests in detail tomorrow. I may be missing something.
Thanks,
Naveen
On 2017/07/12 12:48PM, Jiri Olsa wrote:
> On Mon, Jun 19, 2017 at 08:01:08PM +0530, Naveen N. Rao wrote:
>
> SNIP
>
> > - if (!event->attr.watermark) {
> > + if (!event->attr.count_sb_events && !event->attr.watermark) {
> > int wakeup_events = event->attr.wakeup_events;
> >
> > if (wakeup_events) {
> > diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> > index 4e7c728569a8..f43a6081141f 100644
> > --- a/kernel/events/ring_buffer.c
> > +++ b/kernel/events/ring_buffer.c
> > @@ -197,6 +197,19 @@ __perf_output_begin(struct perf_output_handle *handle,
> > * none of the data stores below can be lifted up by the compiler.
> > */
> >
> > + if (event->attr.count_sb_events && !event->attr.watermark) {
> > + int wakeup_events = event->attr.wakeup_events;
> > +
> > + if (wakeup_events) {
> > + int events = local_inc_return(&rb->events);
> > +
> > + if (events >= wakeup_events) {
> > + local_sub(wakeup_events, &rb->events);
> > + local_inc(&rb->wakeup);
> > + }
> > + }
> > + }
>
> hum, so there's the same wakeup code in perf_output_sample,
> but it's not called for non-sample (sideband) events
Good point.
>
> it'd be nice to have this wakeup only once in __perf_output_begin,
> but it'd change the behaviour for sideband events, which would start to
> follow the wakeup_events logic.. and possibly disturb Vince's tests?
I suppose you meant 'change the behaviour for _sample_ events'. Yes, the
problem with having the check here is that it will now start firing for
all events, rather than just the sampling events. I am not sure if we
can differentiate sample events from sideband events in
__perf_output_begin(). The other aspect is also Peter's concerns around
performance.
Thanks,
Naveen
On 2017/07/12 01:46PM, Peter Zijlstra wrote:
> On Mon, Jun 19, 2017 at 08:01:07PM +0530, Naveen N. Rao wrote:
>
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 6c4e523dc1e2..812fcfc767f4 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -2679,7 +2679,8 @@ static int _perf_event_refresh(struct perf_event *event, int refresh)
> > /*
> > * not supported on inherited events
>
> That comment wants updating to explain the signal crud..
Ok sure. Will do.
>
> > */
> > - if (event->attr.inherit || !is_sampling_event(event))
> > + if (event->attr.inherit || event->attr.signal_on_wakeup ||
> > + !is_sampling_event(event))
> > return -EINVAL;
> >
> > atomic_add(refresh, &event->event_limit);
>
> > @@ -7362,12 +7362,15 @@ static int __perf_event_overflow(struct perf_event *event,
> > * events
> > */
> >
> > + if (!event->attr.signal_on_wakeup) {
> > + int events = atomic_read(&event->event_limit);
> > + event->pending_kill = POLL_IN;
> > + if (events && atomic_dec_and_test(&event->event_limit)) {
> > + ret = 1;
> > + event->pending_kill = POLL_HUP;
> >
> > + perf_event_disable_inatomic(event);
> > + }
> > }
>
> So even without event_limit (IOC_REFRESH) this would've generated
> SIGIO:POLL_IN.
Yes, this still does if signal_on_wakeup isn't set. However, if
signal_on_wakeup is set, we follow the ring buffer notification which
will generate a POLL_IN controlled by {wakeup_events, wakeup_watermark}.
>
> >
> > READ_ONCE(event->overflow_handler)(event, data, regs);
> > @@ -10408,6 +10411,7 @@ perf_event_exit_event(struct perf_event *child_event,
> > perf_group_detach(child_event);
> > list_del_event(child_event, child_ctx);
> > child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
> > + child_event->pending_kill = POLL_HUP;
>
> This looks like an undocumented change..
Yes, sorry - I should have added a note.
This comes from Jiri's feedback that if user chooses to have a signal
delivered on ring buffer wakeup, we should also send a HUP on exit.
>
> > raw_spin_unlock_irq(&child_ctx->lock);
> >
> > /*
> > diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> > index 2831480c63a2..4e7c728569a8 100644
> > --- a/kernel/events/ring_buffer.c
> > +++ b/kernel/events/ring_buffer.c
> > @@ -21,6 +21,9 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
> > {
> > atomic_set(&handle->rb->poll, POLLIN);
> >
> > + if (handle->event->attr.signal_on_wakeup)
> > + handle->event->pending_kill = POLL_IN;
> > +
>
> And this is the bit that generates SIGIO:POLL_IN on wakeup.
Yes.
Thanks,
Naveen
>
> > handle->event->pending_wakeup = 1;
> > irq_work_queue(&handle->event->pending);
> > }
> > --
> > 2.13.1
> >
>
On 2017/07/12 01:48PM, Peter Zijlstra wrote:
> On Mon, Jun 19, 2017 at 08:01:08PM +0530, Naveen N. Rao wrote:
> > diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> > index 4e7c728569a8..f43a6081141f 100644
> > --- a/kernel/events/ring_buffer.c
> > +++ b/kernel/events/ring_buffer.c
> > @@ -197,6 +197,19 @@ __perf_output_begin(struct perf_output_handle *handle,
> > * none of the data stores below can be lifted up by the compiler.
> > */
> >
> > + if (event->attr.count_sb_events && !event->attr.watermark) {
> > + int wakeup_events = event->attr.wakeup_events;
> > +
> > + if (wakeup_events) {
> > + int events = local_inc_return(&rb->events);
> > +
> > + if (events >= wakeup_events) {
> > + local_sub(wakeup_events, &rb->events);
> > + local_inc(&rb->wakeup);
> > + }
> > + }
> > + }
> > +
> > if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
> > local_add(rb->watermark, &rb->wakeup);
> >
>
> So this is a very performance sensitive function; not at all happy to
> add bits here ... :/
:O
Does it at all help if the above is instead guarded by:
if (unlikely(event->attr.count_sb_events)) {
...
}
That should hopefully limit the impact to only when that option is used?
Thanks,
Naveen
On Thu, 13 Jul 2017, Naveen N. Rao wrote:
> > could you please check on this? thanks
> >
> > Naveen,
> > have you run Vince's test suite on this?
> > http://github.com/deater/perf_event_tests.git
>
> I just tried this and I see quite a few failures even without these
> patches.
>
> The behavior is similar with/without these patches and all the ioctl
> tests pass, but I see some failures with the overflow tests. I'll look
> into those tests in detail tomorrow. I may be missing something.
the signal/overflow interface has always been a troublesome one :(
It's part of why I haven't had much to say about the whole kernel
addresses leaking into samples mess.
I had noticed some of the related overflow tests have been failing
recently (both in perf_event_tests and in PAPI), but I haven't had
a chance to see if it was an actual regression or just due to the way the
tests are designed. I'll see if I can figure out what's going on.
Vince