Previous patch set at [1] has a problem when using hw_breakpoints on
ARM and ARM64. This new version fixes that by introducing
is_default_overflow_handler() to replace all '!overflow_handler'
checks.
[1] http://lkml.kernel.org/g/[email protected]
Wang Nan (5):
perf core: Introduce new ioctl options to pause and resume ring buffer
perf core: Set event's default overflow_handler
perf core: Prepare writing into ring buffer from end
perf core: Add backward attribute to perf event
perf core: Reduce perf event output overhead by new overflow handler
arch/arm/kernel/hw_breakpoint.c | 4 +--
arch/arm64/kernel/hw_breakpoint.c | 4 +--
include/linux/perf_event.h | 32 +++++++++++++++--
include/uapi/linux/perf_event.h | 4 ++-
kernel/events/core.c | 73 +++++++++++++++++++++++++++++++++------
kernel/events/internal.h | 11 ++++++
kernel/events/ring_buffer.c | 63 +++++++++++++++++++++++++++++----
7 files changed, 167 insertions(+), 24 deletions(-)
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
--
1.8.3.4
Add new ioctl() to pause/resume ring-buffer output.
In some situations we want to read from ring buffer only when we
ensure nothing can write to the ring buffer during reading. Without
this patch we have to turn off all events attached to this ring buffer
to achieve this.
This patch is for supporting the overwrite ring buffer. Following
commits will introduce new methods to support reading from an overwrite
ring buffer. Before reading, the caller must ensure the ring buffer is
frozen, or the reading is unreliable.
Signed-off-by: Wang Nan <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
---
include/uapi/linux/perf_event.h | 1 +
kernel/events/core.c | 13 +++++++++++++
kernel/events/internal.h | 11 +++++++++++
kernel/events/ring_buffer.c | 7 ++++++-
4 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..a3c1903 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -401,6 +401,7 @@ struct perf_event_attr {
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b723149..1a1312e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4306,6 +4306,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!event->rb) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rb_toggle_paused(rb, !!arg);
+ rcu_read_unlock();
+ return 0;
+ }
default:
return -ENOTTY;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c..6a93d1b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -18,6 +18,7 @@ struct ring_buffer {
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
+ int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
@@ -65,6 +66,16 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
rb_free(rb);
}
+static inline void
+rb_toggle_paused(struct ring_buffer *rb,
+ bool pause)
+{
+ if (!pause && rb->nr_pages)
+ rb->paused = 0;
+ else
+ rb->paused = 1;
+}
+
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 1faad2c..22e1a47 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -125,8 +125,11 @@ int perf_output_begin(struct perf_output_handle *handle,
if (unlikely(!rb))
goto out;
- if (unlikely(!rb->nr_pages))
+ if (unlikely(rb->paused)) {
+ if (rb->nr_pages)
+ local_inc(&rb->lost);
goto out;
+ }
handle->rb = rb;
handle->event = event;
@@ -244,6 +247,8 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
init_irq_work(&rb->irq_work, rb_irq_work);
+
+ rb->paused = rb->nr_pages ? 0 : 1;
}
static void ring_buffer_put_async(struct ring_buffer *rb)
--
1.8.3.4
By creating onward and backward specific overflow handlers and setting
them according to the event's backward setting, normal sampling events
don't need to check the backward setting of an event any more.
This is the last patch of backward writing patchset. After this patch,
there's no extra overhead introduced to the fast path of sampling
output.
Signed-off-by: Wang Nan <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
---
include/linux/perf_event.h | 23 ++++++++++++++++++++---
kernel/events/core.c | 41 ++++++++++++++++++++++++++++++++++++-----
kernel/events/ring_buffer.c | 12 ++++++++++++
3 files changed, 68 insertions(+), 8 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2bf23c9..9adfaa4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -829,14 +829,24 @@ extern int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs);
+extern void perf_event_output_onward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
+extern void perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
extern void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs);
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
- return (event->overflow_handler == perf_event_output);
+ if (likely(event->overflow_handler == perf_event_output_onward))
+ return true;
+ if (unlikely(event->overflow_handler == perf_event_output_backward))
+ return true;
+ return false;
}
extern void
@@ -1044,6 +1054,13 @@ static inline bool is_write_backward(struct perf_event *event)
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
+extern int perf_output_begin_onward(struct perf_output_handle *handle,
+ struct perf_event *event,
+ unsigned int size);
+extern int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_event *event,
+ unsigned int size);
+
extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3da3e38..c80b113 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5606,9 +5606,13 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs,
+ int (*output_begin)(struct perf_output_handle *,
+ struct perf_event *,
+ unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5618,7 +5622,7 @@ void perf_event_output(struct perf_event *event,
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ if (output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -5629,6 +5633,30 @@ exit:
rcu_read_unlock();
}
+void
+perf_event_output_onward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_onward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+void
+perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin);
+}
+
/*
* read event_id
*/
@@ -7963,8 +7991,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (overflow_handler) {
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
+ } else if (is_write_backward(event)){
+ event->overflow_handler = perf_event_output_backward;
+ event->overflow_handler_context = NULL;
} else {
- event->overflow_handler = perf_event_output;
+ event->overflow_handler = perf_event_output_onward;
event->overflow_handler_context = NULL;
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 80b1fa7..7e30e012 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -230,6 +230,18 @@ out:
return -ENOSPC;
}
+int perf_output_begin_onward(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ return __perf_output_begin(handle, event, size, false);
+}
+
+int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ return __perf_output_begin(handle, event, size, true);
+}
+
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
--
1.8.3.4
Set a default event->overflow_handler in perf_event_alloc() so we don't
need to check event->overflow_handler in __perf_event_overflow().
Following commits can give a different default overflow_handler.
No extra performance overhead is introduced into the hot path because in
the original code we still need to read this handler from memory. A
conditional branch is avoided, so we actually remove some instructions.
Initial idea comes from Peter at [1].
Since the default value of event->overflow_handler is not NULL, existing
'if (!overflow_handler)' checks need to be changed.
is_default_overflow_handler() is introduced for this.
[1] http://lkml.kernel.org/r/[email protected]
Signed-off-by: Wang Nan <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
---
arch/arm/kernel/hw_breakpoint.c | 4 ++--
arch/arm64/kernel/hw_breakpoint.c | 4 ++--
include/linux/perf_event.h | 6 ++++++
kernel/events/core.c | 14 ++++++++------
4 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 6284779..b8df458 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
info->address &= ~alignment_mask;
info->ctrl.len <<= offset;
- if (!bp->overflow_handler) {
+ if (is_default_overflow_handler(bp)) {
/*
* Mismatch breakpoints are required for single-stepping
* breakpoints.
@@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* mismatch breakpoint so we can single-step over the
* watchpoint trigger.
*/
- if (!wp->overflow_handler)
+ if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
unlock:
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index b45c95d..4ef5373 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */
- if (!bp->overflow_handler)
+ if (is_default_overflow_handler(bp))
step = 1;
unlock:
rcu_read_unlock();
@@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
perf_bp_event(wp, regs);
/* Do we need to handle the stepping? */
- if (!wp->overflow_handler)
+ if (is_default_overflow_handler(wp))
step = 1;
unlock:
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a9d8cab..d5f99cd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -833,6 +833,12 @@ extern void perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs);
+static inline bool
+is_default_overflow_handler(struct perf_event *event)
+{
+ return (event->overflow_handler == perf_event_output);
+}
+
extern void
perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1a1312e..ed69532 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6467,10 +6467,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending);
}
- if (event->overflow_handler)
- event->overflow_handler(event, data, regs);
- else
- perf_event_output(event, data, regs);
+ event->overflow_handler(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -7963,8 +7960,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
context = parent_event->overflow_handler_context;
}
- event->overflow_handler = overflow_handler;
- event->overflow_handler_context = context;
+ if (overflow_handler) {
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+ } else {
+ event->overflow_handler = perf_event_output;
+ event->overflow_handler_context = NULL;
+ }
perf_event__state_init(event);
--
1.8.3.4
Convert perf_output_begin to __perf_output_begin and make the latter
function able to write records from the end of the ring buffer.
Following commits will utilize the 'backward' flag.
This patch doesn't introduce any extra performance overhead since we
use always_inline.
Signed-off-by: Wang Nan <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
---
kernel/events/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++------
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 22e1a47..37c11c6 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -102,8 +102,21 @@ out:
preempt_enable();
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+static bool __always_inline
+ring_buffer_has_space(unsigned long head, unsigned long tail,
+ unsigned long data_size, unsigned int size,
+ bool backward)
+{
+ if (!backward)
+ return CIRC_SPACE(head, tail, data_size) >= size;
+ else
+ return CIRC_SPACE(tail, head, data_size) >= size;
+}
+
+static int __always_inline
+__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size,
+ bool backward)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
@@ -146,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
do {
tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
- if (!rb->overwrite &&
- unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
- goto fail;
+ if (!rb->overwrite) {
+ if (unlikely(!ring_buffer_has_space(head, tail,
+ perf_data_size(rb),
+ size, backward)))
+ goto fail;
+ }
/*
* The above forms a control dependency barrier separating the
@@ -162,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
* See perf_output_put_handle().
*/
- head += size;
+ if (!backward)
+ head += size;
+ else
+ head -= size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
+ if (backward) {
+ offset = head;
+ head = (u64)(-head);
+ }
+
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
@@ -206,6 +230,12 @@ out:
return -ENOSPC;
}
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ return __perf_output_begin(handle, event, size, false);
+}
+
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
--
1.8.3.4
In perf_event_attr a new bit 'write_backward' is appended to indicate
this event should write the ring buffer from its end to its beginning.
In perf_output_begin(), prepare the ring buffer according to this bit.
This patch introduces small overhead into perf_output_begin():
an extra memory read and a conditional branch. Further patch can remove
this overhead by using custom output handler.
Signed-off-by: Wang Nan <[email protected]>
Cc: He Kuang <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Brendan Gregg <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Masami Hiramatsu <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
---
include/linux/perf_event.h | 5 +++++
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 7 +++++++
kernel/events/ring_buffer.c | 2 ++
4 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5f99cd..2bf23c9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1037,6 +1037,11 @@ static inline bool has_aux(struct perf_event *event)
return event->pmu->setup_aux;
}
+static inline bool is_write_backward(struct perf_event *event)
+{
+ return !!event->attr.write_backward;
+}
+
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index a3c1903..43fc8d2 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -340,7 +340,8 @@ struct perf_event_attr {
comm_exec : 1, /* flag comm events that are due to an exec */
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
- __reserved_1 : 37;
+ write_backward : 1, /* Write ring buffer from end to beginning */
+ __reserved_1 : 36;
union {
__u32 wakeup_events; /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ed69532..3da3e38 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8199,6 +8199,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto out;
/*
+ * Either writing ring buffer from beginning or from end.
+ * Mixing is not allowed.
+ */
+ if (is_write_backward(output_event) != is_write_backward(event))
+ goto out;
+
+ /*
* If both events generate aux data, they must be on the same PMU
*/
if (has_aux(event) && has_aux(output_event) &&
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 37c11c6..80b1fa7 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -233,6 +233,8 @@ out:
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
+ if (unlikely(is_write_backward(event)))
+ return __perf_output_begin(handle, event, size, true);
return __perf_output_begin(handle, event, size, false);
}
--
1.8.3.4
On Mon, Mar 14, 2016 at 09:59:41AM +0000, Wang Nan wrote:
> Add new ioctl() to pause/resume ring-buffer output.
>
> In some situations we want to read from ring buffer only when we
> ensure nothing can write to the ring buffer during reading. Without
> this patch we have to turn off all events attached to this ring buffer
> to achieve this.
>
> This patch is for supporting overwrite ring buffer. Following
> commits will introduce new methods support reading from overwrite ring
> buffer. Before reading caller must ensure the ring buffer is frozen, or
> the reading is unreliable.
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 1afe962..a3c1903 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -401,6 +401,7 @@ struct perf_event_attr {
> #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
> #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
> #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
> +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
Can you also do a patch to the man-pages?
http://man7.org/linux/man-pages/man2/perf_event_open.2.html
Michael and Vince (who has so far been stellar in composing all that)
might be able to point you to the 'right' resources for that.
On 2016/3/23 17:16, Peter Zijlstra wrote:
> On Mon, Mar 14, 2016 at 09:59:41AM +0000, Wang Nan wrote:
>> Add new ioctl() to pause/resume ring-buffer output.
>>
>> In some situations we want to read from ring buffer only when we
>> ensure nothing can write to the ring buffer during reading. Without
>> this patch we have to turn off all events attached to this ring buffer
>> to achieve this.
>>
>> This patch is for supporting overwrite ring buffer. Following
>> commits will introduce new methods support reading from overwrite ring
>> buffer. Before reading caller must ensure the ring buffer is frozen, or
>> the reading is unreliable.
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 1afe962..a3c1903 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -401,6 +401,7 @@ struct perf_event_attr {
>> #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
>> #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
>> #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
>> +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
Glad to see you start to look at this patchset.
> Can you also do a patch to the man-pages?
>
> http://man7.org/linux/man-pages/man2/perf_event_open.2.html
Sure.
I think I need to provide a patch for:
http://git.kernel.org/cgit/docs/man-pages/man-pages.git
But which one should be the first? Shall we update man pages before
this patch be merged by upstream? Or Michael and Vince will consider
this problem?
Thank you.
On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
> Convert perf_output_begin to __perf_output_begin and make the later
> function able to write records from the end of the ring buffer.
> Following commits will utilize the 'backward' flag.
>
> This patch doesn't introduce any extra performance overhead since we
> use always_inline.
So while I agree that with __always_inline and constant propagation we
_should_ end up with the same code, we have:
$ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
text data bss dec hex filename
3785 2 0 3787 ecb defconfig-build/kernel/events/ring_buffer.o.pre
3673 2 0 3675 e5b defconfig-build/kernel/events/ring_buffer.o.post
The patch actually makes the file shrink.
So I think we still want to have some actual performance numbers.
On Wed, Mar 23, 2016 at 05:33:53PM +0800, Wangnan (F) wrote:
> Glad to see you start to look at this patchset.
My brain is completely fried from staring at fuzzer output for weeks, I
just need to do _something_, _anything_ else for a while :-)
On Mon, Mar 14, 2016 at 09:59:45AM +0000, Wang Nan wrote:
> By creating onward and backward specific overflow handlers and setting
> them according to event's backward setting, normal sampling events
> don't need checking backward setting of an event any more.
The normal antonym of backward is forward. Onward does not indicate a
direction per se, its more a continuance or progression of a previously
set goal.
Also, just squash these last two patches together.
On 2016/3/23 17:50, Peter Zijlstra wrote:
> On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
>> Convert perf_output_begin to __perf_output_begin and make the later
>> function able to write records from the end of the ring buffer.
>> Following commits will utilize the 'backward' flag.
>>
>> This patch doesn't introduce any extra performance overhead since we
>> use always_inline.
> So while I agree that with __always_inline and constant propagation we
> _should_ end up with the same code, we have:
>
> $ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
> text data bss dec hex filename
> 3785 2 0 3787 ecb defconfig-build/kernel/events/ring_buffer.o.pre
> 3673 2 0 3675 e5b defconfig-build/kernel/events/ring_buffer.o.post
>
> The patch actually makes the file shrink.
>
> So I think we still want to have some actual performance numbers.
There are some numbers. You can find them from:
http://lkml.iu.edu/hypermail/linux/kernel/1601.2/03966.html
Thank you.
On Wed, 23 Mar 2016, Wangnan (F) wrote:
>
> > Can you also do a patch to the man-pages?
> >
> > http://man7.org/linux/man-pages/man2/perf_event_open.2.html
>
> Sure.
>
> I think I need to provide a patch for:
>
> http://git.kernel.org/cgit/docs/man-pages/man-pages.git
>
> But which one should be the first? Shall we update man pages before
> this patch be merged by upstream? Or Michael and Vince will consider
> this problem?
It is good to see a rough draft of the manpage changes for an ABI
addition like this before the patch gets merged just so we can catch
anything odd about the interface.
Also include (as a comment) the git commit id that introduces the change.
In general manpage patches do not get committed until after the change
hits a full released kernel (as things have been known to get reverted
from -rc kernels in the past).
If you send a rough patch I can fix it up and queue it up with other
manpage patches I have (I'm a bit backlogged but working on catching up).
I'm never quite sure that I get all of the various Signed-off- lines right
for contributed patches though.
Vince
On Mon, Mar 14, 2016 at 09:59:42AM +0000, Wang Nan wrote:
> +++ b/arch/arm/kernel/hw_breakpoint.c
> @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
> info->address &= ~alignment_mask;
> info->ctrl.len <<= offset;
>
> - if (!bp->overflow_handler) {
> + if (is_default_overflow_handler(bp)) {
> /*
> * Mismatch breakpoints are required for single-stepping
> * breakpoints.
> @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
> * mismatch breakpoint so we can single-step over the
> * watchpoint trigger.
> */
> - if (!wp->overflow_handler)
> + if (is_default_overflow_handler(wp))
> enable_single_step(wp, instruction_pointer(regs));
>
> unlock:
> diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> index b45c95d..4ef5373 100644
> --- a/arch/arm64/kernel/hw_breakpoint.c
> +++ b/arch/arm64/kernel/hw_breakpoint.c
> @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
> perf_bp_event(bp, regs);
>
> /* Do we need to handle the stepping? */
> - if (!bp->overflow_handler)
> + if (is_default_overflow_handler(bp))
> step = 1;
> unlock:
> rcu_read_unlock();
> @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
> perf_bp_event(wp, regs);
>
> /* Do we need to handle the stepping? */
> - if (!wp->overflow_handler)
> + if (is_default_overflow_handler(wp))
> step = 1;
>
> unlock:
Will, why does it matter what the overflow handler is for this stuff?
On Wed, Mar 23, 2016 at 06:50:21PM +0100, Peter Zijlstra wrote:
> On Mon, Mar 14, 2016 at 09:59:42AM +0000, Wang Nan wrote:
> > +++ b/arch/arm/kernel/hw_breakpoint.c
> > @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
> > info->address &= ~alignment_mask;
> > info->ctrl.len <<= offset;
> >
> > - if (!bp->overflow_handler) {
> > + if (is_default_overflow_handler(bp)) {
> > /*
> > * Mismatch breakpoints are required for single-stepping
> > * breakpoints.
> > @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
> > * mismatch breakpoint so we can single-step over the
> > * watchpoint trigger.
> > */
> > - if (!wp->overflow_handler)
> > + if (is_default_overflow_handler(wp))
> > enable_single_step(wp, instruction_pointer(regs));
> >
> > unlock:
> > diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> > index b45c95d..4ef5373 100644
> > --- a/arch/arm64/kernel/hw_breakpoint.c
> > +++ b/arch/arm64/kernel/hw_breakpoint.c
> > @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
> > perf_bp_event(bp, regs);
> >
> > /* Do we need to handle the stepping? */
> > - if (!bp->overflow_handler)
> > + if (is_default_overflow_handler(bp))
> > step = 1;
> > unlock:
> > rcu_read_unlock();
> > @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
> > perf_bp_event(wp, regs);
> >
> > /* Do we need to handle the stepping? */
> > - if (!wp->overflow_handler)
> > + if (is_default_overflow_handler(wp))
> > step = 1;
> >
> > unlock:
>
> Will, why does it matter what the overflow handler is for this stuff?
Because ptrace registers an overflow handler for raising a SIGTRAP and
ptrace users (e.g. GDB) expect to handle the single-stepping themselves.
Perf, on the other hand, will livelock if the kernel doesn't do the
stepping.
FWIW, I hate this whole thing. The only users of the perf side just seem
to be people running whacky test cases and then pointing out the places
where we're not identical to x86 :(
Will
On Wed, Mar 23, 2016 at 06:13:49PM +0000, Will Deacon wrote:
> > Will, why does it matter what the overflow handler is for this stuff?
>
> Because ptrace registers an overflow handler for raising a SIGTRAP and
> ptrace users (e.g. GDB) expect to handle the single-stepping themselves.
> Perf, on the other hand, will livelock if the kernel doesn't do the
> stepping.
Ah makes sense.
> FWIW, I hate this whole thing. The only users of the perf side just seem
> to be people running whacky test cases and then pointing out the places
> where we're not identical to x86 :(
Yeah, its not my favourite part either. The whole breakpoint thing
doesn't quite behave like a regular PMU so its all a bit 'special'.
On Wed, Mar 23, 2016 at 06:08:41PM +0800, Wangnan (F) wrote:
>
>
> On 2016/3/23 17:50, Peter Zijlstra wrote:
> >On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
> >>Convert perf_output_begin to __perf_output_begin and make the later
> >>function able to write records from the end of the ring buffer.
> >>Following commits will utilize the 'backward' flag.
> >>
> >>This patch doesn't introduce any extra performance overhead since we
> >>use always_inline.
> >So while I agree that with __always_inline and constant propagation we
> >_should_ end up with the same code, we have:
> >
> >$ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
> > text data bss dec hex filename
> > 3785 2 0 3787 ecb defconfig-build/kernel/events/ring_buffer.o.pre
> > 3673 2 0 3675 e5b defconfig-build/kernel/events/ring_buffer.o.post
> >
> >The patch actually makes the file shrink.
> >
> >So I think we still want to have some actual performance numbers.
>
> There are some numbers. You can find them from:
>
> http://lkml.iu.edu/hypermail/linux/kernel/1601.2/03966.html
Wang, when you respin, please add all perf analysis that you've
done and the reasons to do it this way to commit log
to make sure it stays in git history.
I've reviewed it in the past and looks like the patches didn't
change over the last months. So still ack, I believe
overwrite buffers is a great feature.
On Wed, Mar 23, 2016 at 06:13:49PM +0000, Will Deacon wrote:
> On Wed, Mar 23, 2016 at 06:50:21PM +0100, Peter Zijlstra wrote:
> > On Mon, Mar 14, 2016 at 09:59:42AM +0000, Wang Nan wrote:
> > > +++ b/arch/arm/kernel/hw_breakpoint.c
> > > @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
> > > info->address &= ~alignment_mask;
> > > info->ctrl.len <<= offset;
> > >
> > > - if (!bp->overflow_handler) {
> > > + if (is_default_overflow_handler(bp)) {
> > > /*
> > > * Mismatch breakpoints are required for single-stepping
> > > * breakpoints.
> > > @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
> > > * mismatch breakpoint so we can single-step over the
> > > * watchpoint trigger.
> > > */
> > > - if (!wp->overflow_handler)
> > > + if (is_default_overflow_handler(wp))
> > > enable_single_step(wp, instruction_pointer(regs));
> > >
> > > unlock:
> > > diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> > > index b45c95d..4ef5373 100644
> > > --- a/arch/arm64/kernel/hw_breakpoint.c
> > > +++ b/arch/arm64/kernel/hw_breakpoint.c
> > > @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
> > > perf_bp_event(bp, regs);
> > >
> > > /* Do we need to handle the stepping? */
> > > - if (!bp->overflow_handler)
> > > + if (is_default_overflow_handler(bp))
> > > step = 1;
> > > unlock:
> > > rcu_read_unlock();
> > > @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
> > > perf_bp_event(wp, regs);
> > >
> > > /* Do we need to handle the stepping? */
> > > - if (!wp->overflow_handler)
> > > + if (is_default_overflow_handler(wp))
> > > step = 1;
> > >
> > > unlock:
> >
> > Will, why does it matter what the overflow handler is for this stuff?
>
> Because ptrace registers an overflow handler for raising a SIGTRAP and
> ptrace users (e.g. GDB) expect to handle the single-stepping themselves.
> Perf, on the other hand, will livelock if the kernel doesn't do the
> stepping.
Would it, perhaps, make sense to invert this test and check for
->overflow_handler == ptrace_hbptriggered instead? That way nobody gets
surprise live-locks, endlessly triggering the same trap.
But yes, this kinda blows.
On 2016/3/24 3:25, Alexei Starovoitov wrote:
> On Wed, Mar 23, 2016 at 06:08:41PM +0800, Wangnan (F) wrote:
>>
>> On 2016/3/23 17:50, Peter Zijlstra wrote:
>>> On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
>>>> Convert perf_output_begin to __perf_output_begin and make the later
>>>> function able to write records from the end of the ring buffer.
>>>> Following commits will utilize the 'backward' flag.
>>>>
>>>> This patch doesn't introduce any extra performance overhead since we
>>>> use always_inline.
>>> So while I agree that with __always_inline and constant propagation we
>>> _should_ end up with the same code, we have:
>>>
>>> $ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
>>> text data bss dec hex filename
>>> 3785 2 0 3787 ecb defconfig-build/kernel/events/ring_buffer.o.pre
>>> 3673 2 0 3675 e5b defconfig-build/kernel/events/ring_buffer.o.post
>>>
>>> The patch actually makes the file shrink.
>>>
>>> So I think we still want to have some actual performance numbers.
>> There are some numbers. You can find them from:
>>
>> http://lkml.iu.edu/hypermail/linux/kernel/1601.2/03966.html
> Wang, when you respin, please add all perf analysis that you've
> done and the reasons to do it this way to commit log
> to make sure it stays in git history.
>
> I've reviewed it in the past and looks like the patches didn't
> change over the last months. So still ack, I believe
> overwrite buffers is a great feature.
Can I add your Acked-by in all these 5 patches?
Thank you.
On Wed, Mar 23, 2016 at 08:29:38PM +0100, Peter Zijlstra wrote:
> On Wed, Mar 23, 2016 at 06:13:49PM +0000, Will Deacon wrote:
> > On Wed, Mar 23, 2016 at 06:50:21PM +0100, Peter Zijlstra wrote:
> > > On Mon, Mar 14, 2016 at 09:59:42AM +0000, Wang Nan wrote:
> > > > +++ b/arch/arm/kernel/hw_breakpoint.c
> > > > @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
> > > > info->address &= ~alignment_mask;
> > > > info->ctrl.len <<= offset;
> > > >
> > > > - if (!bp->overflow_handler) {
> > > > + if (is_default_overflow_handler(bp)) {
> > > > /*
> > > > * Mismatch breakpoints are required for single-stepping
> > > > * breakpoints.
> > > > @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
> > > > * mismatch breakpoint so we can single-step over the
> > > > * watchpoint trigger.
> > > > */
> > > > - if (!wp->overflow_handler)
> > > > + if (is_default_overflow_handler(wp))
> > > > enable_single_step(wp, instruction_pointer(regs));
> > > >
> > > > unlock:
> > > > diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
> > > > index b45c95d..4ef5373 100644
> > > > --- a/arch/arm64/kernel/hw_breakpoint.c
> > > > +++ b/arch/arm64/kernel/hw_breakpoint.c
> > > > @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
> > > > perf_bp_event(bp, regs);
> > > >
> > > > /* Do we need to handle the stepping? */
> > > > - if (!bp->overflow_handler)
> > > > + if (is_default_overflow_handler(bp))
> > > > step = 1;
> > > > unlock:
> > > > rcu_read_unlock();
> > > > @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
> > > > perf_bp_event(wp, regs);
> > > >
> > > > /* Do we need to handle the stepping? */
> > > > - if (!wp->overflow_handler)
> > > > + if (is_default_overflow_handler(wp))
> > > > step = 1;
> > > >
> > > > unlock:
> > >
> > > Will, why does it matter what the overflow handler is for this stuff?
> >
> > Because ptrace registers an overflow handler for raising a SIGTRAP and
> > ptrace users (e.g. GDB) expect to handle the single-stepping themselves.
> > Perf, on the other hand, will livelock if the kernel doesn't do the
> > stepping.
>
> Would it, perhaps, make sense to invert this test and check for
> ->overflow_handler == ptrace_hbptriggered instead? That way nobody gets
> surprise live-locks, endlessly triggering the same trap.
Not sure... I can imagine kgdb, for example, wanting to handle the stepping
itself. You also need to play clever tricks if you want to step through
LL/SC atomics, which the code here doesn't even try to handle (because
it involves disassembling the instructions and applying a bunch of
heuristics), so I imagine most debuggers wanting to take care of the step
themselves.
> But yes, this kinda blows.
Yup.
Will
On Thu, Mar 24, 2016 at 09:58:02AM +0000, Will Deacon wrote:
> On Wed, Mar 23, 2016 at 08:29:38PM +0100, Peter Zijlstra wrote:
> Not sure... I can imagine kgdb, for example, wanting to handle the stepping
> itself. You also need to play clever tricks if you want to step through
> LL/SC atomics, which the code here doesn't even try to handle (because
> it involves disassembling the instructions and applying a bunch of
> heuristics), so I imagine most debuggers wanting to take care of the step
> themselves.
Fair enough... right, a whole host of tricky this.
On Thu, Mar 24, 2016 at 11:48:54AM +0800, Wangnan (F) wrote:
>
> >>http://lkml.iu.edu/hypermail/linux/kernel/1601.2/03966.html
> >Wang, when you respin, please add all perf analysis that you've
> >done and the reasons to do it this way to commit log
> >to make sure it stays in git history.
> >
> >I've reviewed it in the past and looks like the patches didn't
> >change over the last months. So still ack, I believe
> >overwrite buffers is a great feature.
> Can I add your Acked-by in all these 5 patches?
please respin first, I'll review promptly.
On 2016/3/23 17:50, Peter Zijlstra wrote:
> On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
>> Convert perf_output_begin to __perf_output_begin and make the later
>> function able to write records from the end of the ring buffer.
>> Following commits will utilize the 'backward' flag.
>>
>> This patch doesn't introduce any extra performance overhead since we
>> use always_inline.
> So while I agree that with __always_inline and constant propagation we
> _should_ end up with the same code, we have:
>
> $ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
> text data bss dec hex filename
> 3785 2 0 3787 ecb defconfig-build/kernel/events/ring_buffer.o.pre
> 3673 2 0 3675 e5b defconfig-build/kernel/events/ring_buffer.o.post
>
> The patch actually makes the file shrink.
>
> So I think we still want to have some actual performance numbers.
In my environment the two objects are nearly identical:
$ objdump -d kernel/events/ring_buffer.o.new > ./out.new.S
$ objdump -d kernel/events/ring_buffer.o.old > ./out.old.S
--- ./out.old.S 2016-03-25 12:18:52.060656423 +0000
+++ ./out.new.S 2016-03-25 12:18:45.376630269 +0000
@@ -1,5 +1,5 @@
-kernel/events/ring_buffer.o.old: file format elf64-x86-64
+kernel/events/ring_buffer.o.new: file format elf64-x86-64
Disassembly of section .text:
@@ -320,7 +320,7 @@
402: 4d 8d 04 0f lea (%r15,%rcx,1),%r8
406: 48 89 c8 mov %rcx,%rax
409: 4c 0f b1 43 40 cmpxchg %r8,0x40(%rbx)
- 40e: 48 39 c8 cmp %rcx,%rax
+ 40e: 48 39 c1 cmp %rax,%rcx
411: 75 b4 jne 3c7 <perf_output_begin+0xc7>
413: 48 8b 73 58 mov 0x58(%rbx),%rsi
417: 48 8b 43 68 mov 0x68(%rbx),%rax
@@ -357,7 +357,7 @@
480: 85 c0 test %eax,%eax
482: 0f 85 02 ff ff ff jne 38a <perf_output_begin+0x8a>
488: 48 c7 c2 00 00 00 00 mov $0x0,%rdx
- 48f: be 7c 00 00 00 mov $0x7c,%esi
+ 48f: be 89 00 00 00 mov $0x89,%esi
494: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
49b: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # 4a2
<perf_output_begin+0x1a2>
4a2: e8 00 00 00 00 callq 4a7 <perf_output_begin+0x1a7>
@@ -874,7 +874,7 @@
c39: eb e7 jmp c22 <perf_aux_output_begin+0x172>
c3b: 80 3d 00 00 00 00 00 cmpb $0x0,0x0(%rip) # c42
<perf_aux_output_begin+0x192>
c42: 75 93 jne bd7 <perf_aux_output_begin+0x127>
- c44: be 2b 01 00 00 mov $0x12b,%esi
+ c44: be 49 01 00 00 mov $0x149,%esi
c49: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
c50: e8 00 00 00 00 callq c55 <perf_aux_output_begin+0x1a5>
c55: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # c5c
<perf_aux_output_begin+0x1ac>
I think you enabled some unusual config options?
Thank you.
On 2016/3/25 20:26, Wangnan (F) wrote:
>
>
> On 2016/3/23 17:50, Peter Zijlstra wrote:
>> On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
>>> Convert perf_output_begin to __perf_output_begin and make the later
>>> function able to write records from the end of the ring buffer.
>>> Following commits will utilize the 'backward' flag.
>>>
>>> This patch doesn't introduce any extra performance overhead since we
>>> use always_inline.
>> So while I agree that with __always_inline and constant propagation we
>> _should_ end up with the same code, we have:
>>
>> $ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
>> text data bss dec hex filename
>> 3785 2 0 3787 ecb
>> defconfig-build/kernel/events/ring_buffer.o.pre
>> 3673 2 0 3675 e5b
>> defconfig-build/kernel/events/ring_buffer.o.post
>>
>> The patch actually makes the file shrink.
>>
>> So I think we still want to have some actual performance numbers.
>
> In my environment the two objects are nearly identical:
>
>
> $ objdump -d kernel/events/ring_buffer.o.new > ./out.new.S
> $ objdump -d kernel/events/ring_buffer.o.old > ./out.old.S
>
> --- ./out.old.S 2016-03-25 12:18:52.060656423 +0000
> +++ ./out.new.S 2016-03-25 12:18:45.376630269 +0000
> @@ -1,5 +1,5 @@
>
> -kernel/events/ring_buffer.o.old: file format elf64-x86-64
> +kernel/events/ring_buffer.o.new: file format elf64-x86-64
>
>
> Disassembly of section .text:
> @@ -320,7 +320,7 @@
> 402: 4d 8d 04 0f lea (%r15,%rcx,1),%r8
> 406: 48 89 c8 mov %rcx,%rax
> 409: 4c 0f b1 43 40 cmpxchg %r8,0x40(%rbx)
> - 40e: 48 39 c8 cmp %rcx,%rax
> + 40e: 48 39 c1 cmp %rax,%rcx
> 411: 75 b4 jne 3c7 <perf_output_begin+0xc7>
> 413: 48 8b 73 58 mov 0x58(%rbx),%rsi
> 417: 48 8b 43 68 mov 0x68(%rbx),%rax
> @@ -357,7 +357,7 @@
> 480: 85 c0 test %eax,%eax
> 482: 0f 85 02 ff ff ff jne 38a <perf_output_begin+0x8a>
> 488: 48 c7 c2 00 00 00 00 mov $0x0,%rdx
> - 48f: be 7c 00 00 00 mov $0x7c,%esi
> + 48f: be 89 00 00 00 mov $0x89,%esi
> 494: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
> 49b: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # 4a2
> <perf_output_begin+0x1a2>
> 4a2: e8 00 00 00 00 callq 4a7 <perf_output_begin+0x1a7>
> @@ -874,7 +874,7 @@
> c39: eb e7 jmp c22
> <perf_aux_output_begin+0x172>
> c3b: 80 3d 00 00 00 00 00 cmpb $0x0,0x0(%rip) # c42
> <perf_aux_output_begin+0x192>
> c42: 75 93 jne bd7
> <perf_aux_output_begin+0x127>
> - c44: be 2b 01 00 00 mov $0x12b,%esi
> + c44: be 49 01 00 00 mov $0x149,%esi
> c49: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
> c50: e8 00 00 00 00 callq c55
> <perf_aux_output_begin+0x1a5>
> c55: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # c5c
> <perf_aux_output_begin+0x1ac>
>
>
> I think you enabled some unusual config options?
>
You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
$ size kernel/events/ring_buffer.o*
text data bss dec hex filename
4545 4 8 4557 11cd
kernel/events/ring_buffer.o.new
4641 4 8 4653 122d
kernel/events/ring_buffer.o.old
Thank you.
On 2016/3/25 20:36, Wangnan (F) wrote:
>
>
> On 2016/3/25 20:26, Wangnan (F) wrote:
>>
>>
>> On 2016/3/23 17:50, Peter Zijlstra wrote:
>>> On Mon, Mar 14, 2016 at 09:59:43AM +0000, Wang Nan wrote:
>>>> Convert perf_output_begin to __perf_output_begin and make the later
>>>> function able to write records from the end of the ring buffer.
>>>> Following commits will utilize the 'backward' flag.
>>>>
>>>> This patch doesn't introduce any extra performance overhead since we
>>>> use always_inline.
>>> So while I agree that with __always_inline and constant propagation we
>>> _should_ end up with the same code, we have:
>>>
>>> $ size defconfig-build/kernel/events/ring_buffer.o.{pre,post}
>>> text data bss dec hex filename
>>> 3785 2 0 3787 ecb
>>> defconfig-build/kernel/events/ring_buffer.o.pre
>>> 3673 2 0 3675 e5b
>>> defconfig-build/kernel/events/ring_buffer.o.post
>>>
>>> The patch actually makes the file shrink.
>>>
>>> So I think we still want to have some actual performance numbers.
>>
>> In my environment the two objects are nearly identical:
>>
>>
>> $ objdump -d kernel/events/ring_buffer.o.new > ./out.new.S
>> $ objdump -d kernel/events/ring_buffer.o.old > ./out.old.S
>>
>> --- ./out.old.S 2016-03-25 12:18:52.060656423 +0000
>> +++ ./out.new.S 2016-03-25 12:18:45.376630269 +0000
>> @@ -1,5 +1,5 @@
>>
>> -kernel/events/ring_buffer.o.old: file format elf64-x86-64
>> +kernel/events/ring_buffer.o.new: file format elf64-x86-64
>>
>>
>> Disassembly of section .text:
>> @@ -320,7 +320,7 @@
>> 402: 4d 8d 04 0f lea (%r15,%rcx,1),%r8
>> 406: 48 89 c8 mov %rcx,%rax
>> 409: 4c 0f b1 43 40 cmpxchg %r8,0x40(%rbx)
>> - 40e: 48 39 c8 cmp %rcx,%rax
>> + 40e: 48 39 c1 cmp %rax,%rcx
>> 411: 75 b4 jne 3c7 <perf_output_begin+0xc7>
>> 413: 48 8b 73 58 mov 0x58(%rbx),%rsi
>> 417: 48 8b 43 68 mov 0x68(%rbx),%rax
>> @@ -357,7 +357,7 @@
>> 480: 85 c0 test %eax,%eax
>> 482: 0f 85 02 ff ff ff jne 38a <perf_output_begin+0x8a>
>> 488: 48 c7 c2 00 00 00 00 mov $0x0,%rdx
>> - 48f: be 7c 00 00 00 mov $0x7c,%esi
>> + 48f: be 89 00 00 00 mov $0x89,%esi
>> 494: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
>> 49b: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # 4a2
>> <perf_output_begin+0x1a2>
>> 4a2: e8 00 00 00 00 callq 4a7 <perf_output_begin+0x1a7>
>> @@ -874,7 +874,7 @@
>> c39: eb e7 jmp c22
>> <perf_aux_output_begin+0x172>
>> c3b: 80 3d 00 00 00 00 00 cmpb $0x0,0x0(%rip) # c42
>> <perf_aux_output_begin+0x192>
>> c42: 75 93 jne bd7
>> <perf_aux_output_begin+0x127>
>> - c44: be 2b 01 00 00 mov $0x12b,%esi
>> + c44: be 49 01 00 00 mov $0x149,%esi
>> c49: 48 c7 c7 00 00 00 00 mov $0x0,%rdi
>> c50: e8 00 00 00 00 callq c55
>> <perf_aux_output_begin+0x1a5>
>> c55: c6 05 00 00 00 00 01 movb $0x1,0x0(%rip) # c5c
>> <perf_aux_output_begin+0x1ac>
>>
>>
>> I think you enabled some unusual config options?
>>
>
> You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
>
> $ size kernel/events/ring_buffer.o*
> text data bss dec hex filename
> 4545 4 8 4557 11cd
> kernel/events/ring_buffer.o.new
> 4641 4 8 4653 122d
> kernel/events/ring_buffer.o.old
>
> Thank you.
>
After enabling CONFIG_OPTIMIZE_INLINING:
Test its performance by calling 'close(-1)' for 3000000 times and
use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
capture system calls:
MEAN STDVAR
BASE 800077.1 23448.13
RAWPERF.PRE 2465858.0 603473.70
RAWPERF.POST 2471925.0 609437.60
Considering the high stdvar, after applying this patch the performance
does not change.
I'll put this number in my commit message.
Thank you.
On Fri, Mar 25, 2016 at 10:14:36PM +0800, Wangnan (F) wrote:
> >>I think you enabled some unusual config options?
x86_64-defconfig
> >You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
It has that indeed.
> After enabling CONFIG_OPTIMIZE_INLINING:
>
> Test its performance by calling 'close(-1)' for 3000000 times and
> use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
> capture system calls:
>
> MEAN STDVAR
> BASE 800077.1 23448.13
> RAWPERF.PRE 2465858.0 603473.70
> RAWPERF.POST 2471925.0 609437.60
>
> Considering the high stdvar, after applying this patch the performance
> is not change.
Why is your variance so immense? And doesn't that render the
measurements pointless?
On 2016/3/27 23:30, pi3orama wrote:
>
> 发自我的 iPhone
>
>> 在 2016年3月27日,下午11:20,Peter Zijlstra <[email protected]> 写道:
>>
>> On Fri, Mar 25, 2016 at 10:14:36PM +0800, Wangnan (F) wrote:
>>>>> I think you enabled some unusual config options?
>> x86_64-defconfig
>>
>>>> You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
>> It has that indeed.
>>
>>> After enabling CONFIG_OPTIMIZE_INLINING:
>>>
>>> Test its performance by calling 'close(-1)' for 3000000 times and
>>> use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
>>> capture system calls:
>>>
>>> MEAN STDVAR
>>> BASE 800077.1 23448.13
>>> RAWPERF.PRE 2465858.0 603473.70
>>> RAWPERF.POST 2471925.0 609437.60
>>>
>>> Considering the high stdvar, after applying this patch the performance
>>> is not change.
>> Why is your variance so immense? And doesn't that render the
>> measurements pointless?
>>
> For some unknown reason, about
> 10% of these results raises 2 times of normal
> results. Say, "normal results" are about
> 2200000, but those "outliers" are about
> 4400000 (I can't access raw data now).
> Variance becomes much smaller if I remove
> those outliers.
After manually removing outliers (removing 10 outliers from 100 raw
data points in each data set):
MEAN STDVAR
BASE 800077.1 23448.13
RAWPERF.PRE 2265741.0 10421.35
RAWPERF.POST 2269826.0 10507.45
Thank you.
> I guess the outliers is caused by some type
> of lock stepping? No clue about it.
>
> Thank you.
>
On 2016/3/28 9:07, Wangnan (F) wrote:
>
>
> On 2016/3/27 23:30, pi3orama wrote:
>>
>> 发自我的 iPhone
>>
>>> 在 2016年3月27日,下午11:20,Peter Zijlstra <[email protected]>
>>> 写道:
>>>
>>> On Fri, Mar 25, 2016 at 10:14:36PM +0800, Wangnan (F) wrote:
>>>>>> I think you enabled some unusual config options?
>>> x86_64-defconfig
>>>
>>>>> You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
>>> It has that indeed.
>>>
>>>> After enabling CONFIG_OPTIMIZE_INLINING:
>>>>
>>>> Test its performance by calling 'close(-1)' for 3000000 times and
>>>> use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
>>>> capture system calls:
>>>>
>>>> MEAN STDVAR
>>>> BASE 800077.1 23448.13
>>>> RAWPERF.PRE 2465858.0 603473.70
>>>> RAWPERF.POST 2471925.0 609437.60
>>>>
>>>> Considering the high stdvar, after applying this patch the performance
>>>> is not change.
>>> Why is your variance so immense? And doesn't that render the
>>> measurements pointless?
>>>
>> For some unknown reason, about
>> 10% of these results raises 2 times of normal
>> results. Say, "normal results" are about
>> 2200000, but those "outliers" are about
>> 4400000 (I can't access raw data now).
>> Variance becomes much smaller if I remove
>> those outliers.
>
Found the reason for these outliers.
If perf and 'test-ring-buffer' are scheduled on different processors,
the performance is bad. I think cache is the main reason.
I will redo the test, bind them to cores on same CPU.
Thank you.
On 2016/3/28 9:58, Wangnan (F) wrote:
>
>
> On 2016/3/28 9:07, Wangnan (F) wrote:
>>
>>
>> On 2016/3/27 23:30, pi3orama wrote:
>>>
>>> 发自我的 iPhone
>>>
>>>> 在 2016年3月27日,下午11:20,Peter Zijlstra <[email protected]>
>>>> 写道:
>>>>
>>>> On Fri, Mar 25, 2016 at 10:14:36PM +0800, Wangnan (F) wrote:
>>>>>>> I think you enabled some unusual config options?
>>>> x86_64-defconfig
>>>>
>>>>>> You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
>>>> It has that indeed.
>>>>
>>>>> After enabling CONFIG_OPTIMIZE_INLINING:
>>>>>
>>>>> Test its performance by calling 'close(-1)' for 3000000 times and
>>>>> use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
>>>>> capture system calls:
>>>>>
>>>>> MEAN STDVAR
>>>>> BASE 800077.1 23448.13
>>>>> RAWPERF.PRE 2465858.0 603473.70
>>>>> RAWPERF.POST 2471925.0 609437.60
>>>>>
>>>>> Considering the high stdvar, after applying this patch the
>>>>> performance
>>>>> is not change.
>>>> Why is your variance so immense? And doesn't that render the
>>>> measurements pointless?
>>>>
>>> For some unknown reason, about
>>> 10% of these results raises 2 times of normal
>>> results. Say, "normal results" are about
>>> 2200000, but those "outliers" are about
>>> 4400000 (I can't access raw data now).
>>> Variance becomes much smaller if I remove
>>> those outliers.
>>
>
> Found the reason for these outliers.
>
> If perf and 'test-ring-buffer' are scheduled on different processors,
> the performance is bad. I think cache is the main reason.
>
> I will redo the test, bind them to cores on same CPU.
>
> Thank you.
Test method improvements:
1. Set CPU freq:
# for f in /sys/devices/system/cpu/cpufreq/policy*/scaling_governor ;
do echo performance > $f ; done
2. Bind core:
Add following code into head of test-ring-buffer:
CPU_ZERO(&mask);
CPU_SET(6, &mask);
pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
pthread_yield();
3. Bind core (perf):
Use following command to start perf:
# taskset -c 7 ./perf record -o /dev/null --no-buildid-cache -e
raw_syscalls:* test-ring-buffer
New result of 100 test data in both cases:
MEAN STDVAR
BASE 800214.950 2853.083
RAWPERF.PRE 2253846.700 9997.014
RAWPERF.POST 2257495.540 8516.293
Thank you.
发自我的 iPhone
> 在 2016年3月27日，下午11:20，Peter Zijlstra <[email protected]> 写道：
>
> On Fri, Mar 25, 2016 at 10:14:36PM +0800, Wangnan (F) wrote:
>>>> I think you enabled some unusual config options?
>
> x86_64-defconfig
>
>>> You must enabled CONFIG_OPTIMIZE_INLINING. Now I get similar result:
>
> It has that indeed.
>
>> After enabling CONFIG_OPTIMIZE_INLINING:
>>
>> Test its performance by calling 'close(-1)' for 3000000 times and
>> use 'perf record -o /dev/null -e raw_syscalls:* test-ring-buffer' to
>> capture system calls:
>>
>> MEAN STDVAR
>> BASE 800077.1 23448.13
>> RAWPERF.PRE 2465858.0 603473.70
>> RAWPERF.POST 2471925.0 609437.60
>>
>> Considering the high stdvar, after applying this patch the performance
>> is not change.
>
> Why is your variance so immense? And doesn't that render the
> measurements pointless?
>
For some unknown reason, about
10% of these results raises 2 times of normal
results. Say, "normal results" are about
2200000, but those "outliers" are about
4400000 (I can't access raw data now).
Variance becomes much smaller if I remove
those outliers.
I guess the outliers is caused by some type
of lock stepping? No clue about it.
Thank you.
Hello Wangnan,
The patch below seems to have landed in Linux 4.7,
commit 86e7972f690c1017fd086cdfe53d8524e68c661c
Could you draft a man-pages patch for this interface
change, please? Or, failing that, a plain-text
description that we can integrate into the man-page.
Thanks,
Michael
On 03/23/2016 10:33 AM, Wangnan (F) wrote:
>
>
> On 2016/3/23 17:16, Peter Zijlstra wrote:
>> On Mon, Mar 14, 2016 at 09:59:41AM +0000, Wang Nan wrote:
>>> Add new ioctl() to pause/resume ring-buffer output.
>>>
>>> In some situations we want to read from ring buffer only when we
>>> ensure nothing can write to the ring buffer during reading. Without
>>> this patch we have to turn off all events attached to this ring buffer
>>> to achieve this.
>>>
>>> This patch is for supporting overwrite ring buffer. Following
>>> commits will introduce new methods support reading from overwrite ring
>>> buffer. Before reading caller must ensure the ring buffer is frozen, or
>>> the reading is unreliable.
>>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>>> index 1afe962..a3c1903 100644
>>> --- a/include/uapi/linux/perf_event.h
>>> +++ b/include/uapi/linux/perf_event.h
>>> @@ -401,6 +401,7 @@ struct perf_event_attr {
>>> #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
>>> #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
>>> #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
>>> +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
>
> Glad to see you start to look at this patchset.
>
>
>
>> Can you also do a patch to the man-pages?
>>
>> http://man7.org/linux/man-pages/man2/perf_event_open.2.html
>
> Sure.
>
> I think I need to provide a patch for:
>
> http://git.kernel.org/cgit/docs/man-pages/man-pages.git
>
> But which one should be the first? Shall we update man pages before
> this patch be merged by upstream? Or Michael and Vince will consider
> this problem?
>
> Thank you.
>
>
>
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
On 2016/10/21 15:06, Michael Kerrisk (man-pages) wrote:
> Hello Wangnan,
>
> The patch below seems to have landed in Linux 4.7,
> commit 86e7972f690c1017fd086cdfe53d8524e68c661c
>
> Could you draft a man-pages patch for this interface
> change, please? Or, failing that, a plain-text
> description that we can integrate into the man-page.
I sent man-pages patches at March:
https://patchwork.kernel.org/patch/8678861/
http://www.spinics.net/lists/linux-man/msg10177.html
Let me resend them again.
Thank you.
Hi Wangnan
On 10/21/2016 09:13 AM, Wangnan (F) wrote:
>
>
> On 2016/10/21 15:06, Michael Kerrisk (man-pages) wrote:
>> Hello Wangnan,
>>
>> The patch below seems to have landed in Linux 4.7,
>> commit 86e7972f690c1017fd086cdfe53d8524e68c661c
>>
>> Could you draft a man-pages patch for this interface
>> change, please? Or, failing that, a plain-text
>> description that we can integrate into the man-page.
>
> I sent man-pages patches at March:
>
> https://patchwork.kernel.org/patch/8678861/
> http://www.spinics.net/lists/linux-man/msg10177.html
>
> Let me resend them again.
Ahhh -- my apologies. I found these now.
Cheers,
Michael
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/