The layout of perf_sample_data is designed to minimize cache-line
access. perf_sample_data_init() used to initialize a couple of
fields unconditionally, so they were placed together at the head.
But it has since been changed to set the fields according to the
actual sample_type flags. The main user (the perf tools) always sets
IP, TID, TIME and PERIOD. Also group related fields like addr,
phys_addr and data_page_size together.
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
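For reference, a minimal sketch of what perf_sample_data_init() looks
like after the conversion to sample_type-based initialization (based on
the code around this series, so details may differ):

  static inline void perf_sample_data_init(struct perf_sample_data *data,
                                           u64 addr, u64 period)
  {
          /* the remaining members are set up in perf_prepare_sample() */
          data->sample_flags = PERF_SAMPLE_PERIOD;
          data->period = period;

          if (addr) {
                  data->addr = addr;
                  data->sample_flags |= PERF_SAMPLE_ADDR;
          }
  }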
include/linux/perf_event.h | 34 +++++++++++++++++++---------------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c6a3bac76966..dd565306f479 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1098,47 +1098,51 @@ extern u64 perf_event_read_value(struct perf_event *event,
struct perf_sample_data {
/*
- * Fields set by perf_sample_data_init(), group so as to
- * minimize the cachelines touched.
+ * Fields set by perf_sample_data_init() unconditionally,
+ * group so as to minimize the cachelines touched.
*/
u64 sample_flags;
u64 period;
/*
- * The other fields, optionally {set,used} by
- * perf_{prepare,output}_sample().
+ * Fields commonly set by __perf_event_header__init_id(),
+ * group so as to minimize the cachelines touched.
*/
- struct perf_branch_stack *br_stack;
- union perf_sample_weight weight;
- union perf_mem_data_src data_src;
- u64 txn;
- u64 addr;
- struct perf_raw_record *raw;
-
u64 type;
- u64 ip;
struct {
u32 pid;
u32 tid;
} tid_entry;
u64 time;
u64 id;
- u64 stream_id;
struct {
u32 cpu;
u32 reserved;
} cpu_entry;
+
+ /*
+ * The other fields, optionally {set,used} by
+ * perf_{prepare,output}_sample().
+ */
+ u64 ip;
struct perf_callchain_entry *callchain;
- u64 aux_size;
+ struct perf_raw_record *raw;
+ struct perf_branch_stack *br_stack;
+ union perf_sample_weight weight;
+ union perf_mem_data_src data_src;
+ u64 txn;
struct perf_regs regs_user;
struct perf_regs regs_intr;
u64 stack_user_size;
- u64 phys_addr;
+ u64 stream_id;
u64 cgroup;
+ u64 addr;
+ u64 phys_addr;
u64 data_page_size;
u64 code_page_size;
+ u64 aux_size;
} ____cacheline_aligned;
/* default value for data source */
--
2.39.0.314.g84b9a713c41-goog
perf_prepare_sample() sets the perf_sample_data according to the
attr->sample_type before copying it to the ring buffer. But BPF also
wants to access the sample data, so it needs to prepare the sample even
before the regular path.
That means perf_prepare_sample() can be called more than once. Set
the data->sample_flags consistently so that it can indicate which fields
are already set, and skip them if they are.
Mostly it's just a matter of checking filtered_sample_type, which is a
bitmask of the unset bits in the attr->sample_type. But some sample
data is implied by other fields even if it's not in the attr->sample_type
(like PERF_SAMPLE_ADDR for PERF_SAMPLE_PHYS_ADDR), so those need to
check data->sample_flags separately.
Also, some fields like the callchain, user regs/stack and aux data
require more calculation. Protect them using data->sample_flags to
avoid the duplicate work.
Signed-off-by: Namhyung Kim <[email protected]>
---
Maybe we don't need this change to prevent duplication, in favor of the
next patch using data->saved_size. But I think it's still useful to
set data->sample_flags consistently. Anyway, it's up to you.
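To illustrate the double preparation this guards against, a rough
sketch of the flow (simplified, the real call sites differ):

  /* first call, e.g. from the BPF overflow path: */
  perf_prepare_sample(&header, data, event, regs);
  /* data->sample_flags now records every field that was computed */

  /* second call, from the regular output path: */
  perf_prepare_sample(&header, data, event, regs);
  /*
   * filtered_sample_type = sample_type & ~data->sample_flags is now
   * clear for the already-prepared fields, so the expensive work
   * (callchain, user regs/stack, aux) is skipped.
   */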
kernel/events/core.c | 86 ++++++++++++++++++++++++++++++++------------
1 file changed, 63 insertions(+), 23 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eacc3702654d..70bff8a04583 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7582,14 +7582,21 @@ void perf_prepare_sample(struct perf_event_header *header,
filtered_sample_type = sample_type & ~data->sample_flags;
__perf_event_header__init_id(header, data, event, filtered_sample_type);
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
+ if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
+ /* attr.sample_type may not have PERF_SAMPLE_IP */
+ if (!(data->sample_flags & PERF_SAMPLE_IP)) {
+ data->ip = perf_instruction_pointer(regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
+ }
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, regs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
size += data->callchain->nr;
@@ -7634,8 +7641,13 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs);
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
+ /* attr.sample_type may not have PERF_SAMPLE_REGS_USER */
+ if (!(data->sample_flags & PERF_SAMPLE_REGS_USER)) {
+ perf_sample_regs_user(&data->regs_user, regs);
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
+ }
+ }
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -7656,11 +7668,18 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
+ u16 stack_size;
+
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
+ stack_size = event->attr.sample_stack_user;
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ data->regs_user.regs);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
- data->regs_user.regs);
+ data->stack_user_size = stack_size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
+ }
+ stack_size = data->stack_user_size;
/*
* If there is something to dump, add space for the dump
@@ -7670,29 +7689,40 @@ void perf_prepare_sample(struct perf_event_header *header,
if (stack_size)
size += sizeof(u64) + stack_size;
- data->stack_user_size = stack_size;
header->size += size;
}
- if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
- if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
- if (filtered_sample_type & PERF_SAMPLE_ADDR)
+ /* attr.sample_type may not have PERF_SAMPLE_ADDR */
+ if (!(data->sample_flags & PERF_SAMPLE_ADDR)) {
data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
}
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_intr(&data->regs_intr, regs);
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
+ perf_sample_regs_intr(&data->regs_intr, regs);
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
+ }
if (data->regs_intr.regs) {
u64 mask = event->attr.sample_regs_intr;
@@ -7703,17 +7733,19 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
- filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7722,11 +7754,15 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
@@ -7739,10 +7775,14 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
- event->attr.aux_sample_size);
- size = rounddown(size, 8);
- size = perf_prepare_sample_aux(event, data, size);
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ perf_prepare_sample_aux(event, data, size);
+ data->sample_flags |= PERF_SAMPLE_AUX;
+ }
+ size = data->aux_size;
WARN_ON_ONCE(size + header->size > U16_MAX);
header->size += size;
--
2.39.0.314.g84b9a713c41-goog
On Thu, Dec 29, 2022 at 12:41:00PM -0800, Namhyung Kim wrote:
So I like the general idea; I just think it's turned into a bit of a
mess. That is, the code is already overly branchy, which is known to
hurt performance; we should really try to not make it worse than
absolutely needed.
> kernel/events/core.c | 86 ++++++++++++++++++++++++++++++++------------
> 1 file changed, 63 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index eacc3702654d..70bff8a04583 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7582,14 +7582,21 @@ void perf_prepare_sample(struct perf_event_header *header,
> filtered_sample_type = sample_type & ~data->sample_flags;
> __perf_event_header__init_id(header, data, event, filtered_sample_type);
>
> - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
> - data->ip = perf_instruction_pointer(regs);
> + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
> + /* attr.sample_type may not have PERF_SAMPLE_IP */
Right, but that shouldn't matter, IIRC its OK to have more bits set in
data->sample_flags than we have set in attr.sample_type. It just means
we have data available for sample types we're (possibly) not using.
That is, I think you can simply write this like:
> + if (!(data->sample_flags & PERF_SAMPLE_IP)) {
> + data->ip = perf_instruction_pointer(regs);
> + data->sample_flags |= PERF_SAMPLE_IP;
> + }
> + }
if (filtered_sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
data->ip = perf_instruction_pointer(regs);
data->sample_flags |= PERF_SAMPLE_IP;
}
...
if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
}
Then after a single perf_prepare_sample() run we have:
pre | post
----------------------------------------
0 | 0
IP | IP
CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
IP|CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
So while data->sample_flags will have an extra bit set in the 3rd case,
that will not affect perf_output_sample() which only looks at data->type
(== attr.sample_type).
And since data->sample_flags will have both bits set, a second run will
filter out both and avoid the extra work (except doing that will mess up
the branch predictors).
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> int size = 1;
>
> - if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
> + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> data->callchain = perf_callchain(event, regs);
> + data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
> + }
>
> size += data->callchain->nr;
>
This, why can't this be:
if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, regs);
data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
header->size += (1 + data->callchain->nr) * sizeof(u64);
}
I suppose this is because perf_event_header lives on the stack of the
overflow handler and all that isn't available / relevant for the BPF
thing.
And we can't pull that out into another function without adding yet
another branch fest.
However; inspired by your next patch; we can do something like so:
if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, regs);
data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
data->size += (1 + data->callchain->nr) * sizeof(u64);
}
And then have __perf_event_output() (or something thereabout) do:
perf_prepare_sample(data, event, regs);
perf_prepare_header(&header, data, event);
err = output_begin(&handle, data, event, header.size);
if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
perf_output_end(&handle);
With perf_prepare_header() being something like:
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size + data->size;
header->misc = perf_misc_flags(regs);
...
Hmm ?
(same for all the other sites)
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 965547f0a2f3cd0786d4faee2db008bb281d4922
Gitweb: https://git.kernel.org/tip/965547f0a2f3cd0786d4faee2db008bb281d4922
Author: Namhyung Kim <[email protected]>
AuthorDate: Thu, 29 Dec 2022 12:41:00 -08:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 09 Jan 2023 12:22:09 +01:00
perf/core: Set data->sample_flags in perf_prepare_sample()
perf_prepare_sample() sets the perf_sample_data according to the
attr->sample_type before copying it to the ring buffer. But BPF also
wants to access the sample data, so it needs to prepare the sample even
before the regular path.
That means perf_prepare_sample() can be called more than once. Set
the data->sample_flags consistently so that it can indicate which fields
are already set, and skip them if they are.
Mostly it's just a matter of checking filtered_sample_type, which is a
bitmask of the unset bits in the attr->sample_type. But some sample
data is implied by other fields even if it's not in the attr->sample_type
(like PERF_SAMPLE_ADDR for PERF_SAMPLE_PHYS_ADDR), so those need to
check data->sample_flags separately.
Also, some fields like the callchain, user regs/stack and aux data
require more calculation. Protect them using data->sample_flags to
avoid the duplicate work.
Signed-off-by: Namhyung Kim <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/events/core.c | 86 +++++++++++++++++++++++++++++++------------
1 file changed, 63 insertions(+), 23 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eacc370..70bff8a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7582,14 +7582,21 @@ void perf_prepare_sample(struct perf_event_header *header,
filtered_sample_type = sample_type & ~data->sample_flags;
__perf_event_header__init_id(header, data, event, filtered_sample_type);
- if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
- data->ip = perf_instruction_pointer(regs);
+ if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
+ /* attr.sample_type may not have PERF_SAMPLE_IP */
+ if (!(data->sample_flags & PERF_SAMPLE_IP)) {
+ data->ip = perf_instruction_pointer(regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
+ }
+ }
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, regs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
size += data->callchain->nr;
@@ -7634,8 +7641,13 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs);
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) {
+ /* attr.sample_type may not have PERF_SAMPLE_REGS_USER */
+ if (!(data->sample_flags & PERF_SAMPLE_REGS_USER)) {
+ perf_sample_regs_user(&data->regs_user, regs);
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
+ }
+ }
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -7656,11 +7668,18 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
+ u16 stack_size;
+
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
+ stack_size = event->attr.sample_stack_user;
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ data->regs_user.regs);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
- data->regs_user.regs);
+ data->stack_user_size = stack_size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
+ }
+ stack_size = data->stack_user_size;
/*
* If there is something to dump, add space for the dump
@@ -7670,29 +7689,40 @@ void perf_prepare_sample(struct perf_event_header *header,
if (stack_size)
size += sizeof(u64) + stack_size;
- data->stack_user_size = stack_size;
header->size += size;
}
- if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
- if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
- if (filtered_sample_type & PERF_SAMPLE_ADDR)
+ /* attr.sample_type may not have PERF_SAMPLE_ADDR */
+ if (!(data->sample_flags & PERF_SAMPLE_ADDR)) {
data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
}
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_intr(&data->regs_intr, regs);
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
+ perf_sample_regs_intr(&data->regs_intr, regs);
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
+ }
if (data->regs_intr.regs) {
u64 mask = event->attr.sample_regs_intr;
@@ -7703,17 +7733,19 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
- filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
#ifdef CONFIG_CGROUP_PERF
- if (sample_type & PERF_SAMPLE_CGROUP) {
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
struct cgroup *cgrp;
/* protected by RCU */
cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
}
#endif
@@ -7722,11 +7754,15 @@ void perf_prepare_sample(struct perf_event_header *header,
* require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
* but the value will not dump to the userspace.
*/
- if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
- if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
@@ -7739,10 +7775,14 @@ void perf_prepare_sample(struct perf_event_header *header,
* Make sure this doesn't happen by using up to U16_MAX bytes
* per sample in total (rounded down to 8 byte boundary).
*/
- size = min_t(size_t, U16_MAX - header->size,
- event->attr.aux_sample_size);
- size = rounddown(size, 8);
- size = perf_prepare_sample_aux(event, data, size);
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ perf_prepare_sample_aux(event, data, size);
+ data->sample_flags |= PERF_SAMPLE_AUX;
+ }
+ size = data->aux_size;
WARN_ON_ONCE(size + header->size > U16_MAX);
header->size += size;
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 7bdb1767bf011c7f6065ac483ad2f00e434c3979
Gitweb: https://git.kernel.org/tip/7bdb1767bf011c7f6065ac483ad2f00e434c3979
Author: Namhyung Kim <[email protected]>
AuthorDate: Thu, 29 Dec 2022 12:40:59 -08:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 09 Jan 2023 12:22:09 +01:00
perf/core: Change the layout of perf_sample_data
The layout of perf_sample_data is designed to minimize cache-line
access. perf_sample_data_init() used to initialize a couple of
fields unconditionally, so they were placed together at the head.
But it has since been changed to set the fields according to the
actual sample_type flags. The main user (the perf tools) always sets
IP, TID, TIME and PERIOD. Also group related fields like addr,
phys_addr and data_page_size together.
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
include/linux/perf_event.h | 34 +++++++++++++++++++---------------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ad92ad3..03949d0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1098,47 +1098,51 @@ extern u64 perf_event_read_value(struct perf_event *event,
struct perf_sample_data {
/*
- * Fields set by perf_sample_data_init(), group so as to
- * minimize the cachelines touched.
+ * Fields set by perf_sample_data_init() unconditionally,
+ * group so as to minimize the cachelines touched.
*/
u64 sample_flags;
u64 period;
/*
- * The other fields, optionally {set,used} by
- * perf_{prepare,output}_sample().
+ * Fields commonly set by __perf_event_header__init_id(),
+ * group so as to minimize the cachelines touched.
*/
- struct perf_branch_stack *br_stack;
- union perf_sample_weight weight;
- union perf_mem_data_src data_src;
- u64 txn;
- u64 addr;
- struct perf_raw_record *raw;
-
u64 type;
- u64 ip;
struct {
u32 pid;
u32 tid;
} tid_entry;
u64 time;
u64 id;
- u64 stream_id;
struct {
u32 cpu;
u32 reserved;
} cpu_entry;
+
+ /*
+ * The other fields, optionally {set,used} by
+ * perf_{prepare,output}_sample().
+ */
+ u64 ip;
struct perf_callchain_entry *callchain;
- u64 aux_size;
+ struct perf_raw_record *raw;
+ struct perf_branch_stack *br_stack;
+ union perf_sample_weight weight;
+ union perf_mem_data_src data_src;
+ u64 txn;
struct perf_regs regs_user;
struct perf_regs regs_intr;
u64 stack_user_size;
- u64 phys_addr;
+ u64 stream_id;
u64 cgroup;
+ u64 addr;
+ u64 phys_addr;
u64 data_page_size;
u64 code_page_size;
+ u64 aux_size;
} ____cacheline_aligned;
/* default value for data source */
Hi Peter,
On Mon, Jan 09, 2023 at 01:14:31PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 29, 2022 at 12:41:00PM -0800, Namhyung Kim wrote:
>
> So I like the general idea; I just think it's turned into a bit of a
> mess. That is code is already overly branchy which is known to hurt
> performance, we should really try and not make it worse than absolutely
> needed.
Agreed.
>
> > kernel/events/core.c | 86 ++++++++++++++++++++++++++++++++------------
> > 1 file changed, 63 insertions(+), 23 deletions(-)
> >
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index eacc3702654d..70bff8a04583 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -7582,14 +7582,21 @@ void perf_prepare_sample(struct perf_event_header *header,
> > filtered_sample_type = sample_type & ~data->sample_flags;
> > __perf_event_header__init_id(header, data, event, filtered_sample_type);
> >
> > - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
> > - data->ip = perf_instruction_pointer(regs);
> > + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
> > + /* attr.sample_type may not have PERF_SAMPLE_IP */
>
> Right, but that shouldn't matter, IIRC its OK to have more bits set in
> data->sample_flags than we have set in attr.sample_type. It just means
> we have data available for sample types we're (possibly) not using.
>
> That is, I think you can simply write this like:
>
> > + if (!(data->sample_flags & PERF_SAMPLE_IP)) {
> > + data->ip = perf_instruction_pointer(regs);
> > + data->sample_flags |= PERF_SAMPLE_IP;
> > + }
> > + }
>
> if (filtered_sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
> data->ip = perf_instruction_pointer(regs);
> data->sample_flags |= PERF_SAMPLE_IP);
> }
>
> ...
>
> if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
> data->code_page_size = perf_get_page_size(data->ip);
> data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
> }
>
> Then after a single perf_prepare_sample() run we have:
>
> pre | post
> ----------------------------------------
> 0 | 0
> IP | IP
> CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
> IP|CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
>
> So while data->sample_flags will have an extra bit set in the 3rd case,
> that will not affect perf_sample_outout() which only looks at data->type
> (== attr.sample_type).
>
> And since data->sample_flags will have both bits set, a second run will
> filter out both and avoid the extra work (except doing that will mess up
> the branch predictors).
Yeah, it'd be better to check filtered_sample_type in the first place.
Btw, I was thinking about a hypothetical scenario where the IP is set
by a PMU driver, not taken from the regs. In this case, having
CODE_PAGE_SIZE would overwrite the IP. I don't think we need to worry
about that for now since PMU drivers update the regs (using
set_linear_ip). But it seems like a possible scenario for something
like PEBS or IBS.
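Something like this, purely hypothetical (the driver-side name is made
up):

  /* a PMU driver provides a precise IP without touching regs: */
  data->ip = precise_ip_from_hw;
  data->sample_flags |= PERF_SAMPLE_IP;

  /* later, with PERF_SAMPLE_CODE_PAGE_SIZE in attr.sample_type: */
  if (filtered_sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
          /* fires because CODE_PAGE_SIZE is still unset in sample_flags */
          data->ip = perf_instruction_pointer(regs);  /* clobbers the IP */
  }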
>
>
> > if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> > int size = 1;
> >
> > - if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
> > + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> > data->callchain = perf_callchain(event, regs);
> > + data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
> > + }
> >
> > size += data->callchain->nr;
> >
>
> This, why can't this be:
>
> if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> data->callchain = perf_callchain(event, regs);
> data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
>
> header->size += (1 + data->callchain->nr) * sizeof(u64);
> }
>
> I suppose this is because perf_event_header lives on the stack of the
> overflow handler and all that isn't available / relevant for the BPF
> thing.
Right, it needs to calculate the data size for each sample.
>
> And we can't pull that out into anther function without adding yet
> another branch fest.
>
> However; inspired by your next patch; we can do something like so:
>
> if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> data->callchain = perf_callchain(event, regs);
> data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
>
> data->size += (1 + data->callchain->nr) * sizeof(u64);
> }
This is fine as long as all the other places (like in PMU drivers) that
set the callchain also update the sample data size accordingly. If not,
we can get the callchain but the data size will be wrong.
>
> And then have __perf_event_output() (or something thereabout) do:
>
> perf_prepare_sample(data, event, regs);
> perf_prepare_header(&header, data, event);
> err = output_begin(&handle, data, event, header.size);
> if (err)
> goto exit;
> perf_output_sample(&handle, &header, data, event);
> perf_output_end(&handle);
>
> With perf_prepare_header() being something like:
>
> header->type = PERF_RECORD_SAMPLE;
> header->size = sizeof(*header) + event->header_size + data->size;
> header->misc = perf_misc_flags(regs);
> ...
>
> Hmm ?
>
> (same for all the other sites)
Looks good. But I'm confused by the tip-bot2 messages saying it's
merged. Do you want me to work on it as a follow up?
Thanks,
Namhyung
On Mon, Jan 09, 2023 at 12:21:25PM -0800, Namhyung Kim wrote:
> Looks good. But I'm confused by the tip-bot2 messages saying it's
> merged. Do you want me to work on it as a follow up?
Ingo and I talked past one another; I agreed with 1/3 and he applied
the whole series. Just talked to him again and he's just zapped these
last two patches.
Sorry for the confusion.
On Mon, Jan 09, 2023 at 12:21:25PM -0800, Namhyung Kim wrote:
> > However; inspired by your next patch; we can do something like so:
> >
> > if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> > data->callchain = perf_callchain(event, regs);
> > data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
> >
> > data->size += (1 + data->callchain->nr) * sizeof(u64);
> > }
>
> This is fine as long as all other places (like in PMU drivers) set the
> callchain update the sample data size accordingly. If not, we can get
> the callchain but the data size will be wrong.
Good point, maybe add a helper there to ensure that code doesn't
duplicate/diverge?
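A sketch of what such a helper could look like (the name is just for
illustration):

  static void perf_sample_save_callchain(struct perf_sample_data *data,
                                         struct perf_event *event,
                                         struct pt_regs *regs)
  {
          data->callchain = perf_callchain(event, regs);
          data->size += (1 + data->callchain->nr) * sizeof(u64);
          data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
  }

Then perf_prepare_sample() and the PMU drivers all call it, so the size
accounting can't diverge.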
* Peter Zijlstra <[email protected]> wrote:
> On Mon, Jan 09, 2023 at 12:21:25PM -0800, Namhyung Kim wrote:
>
> > Looks good. But I'm confused by the tip-bot2 messages saying it's
> > merged. Do you want me to work on it as a follow up?
>
> Ingo and me talked past one another, I agreed with 1/3 and he applied
> the whole series. Just talked to him again and he's just zapped these
> last two patches.
Yeah - perf/core is now:
7bdb1767bf01 ("perf/core: Change the layout of perf_sample_data")
which can be used for further work. Sorry about the confusion ...
Thanks,
Ingo
On Tue, Jan 10, 2023 at 2:55 AM Peter Zijlstra <[email protected]> wrote:
>
> On Mon, Jan 09, 2023 at 12:21:25PM -0800, Namhyung Kim wrote:
>
> > > However; inspired by your next patch; we can do something like so:
> > >
> > > if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) {
> > > data->callchain = perf_callchain(event, regs);
> > > data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
> > >
> > > data->size += (1 + data->callchain->nr) * sizeof(u64);
> > > }
> >
> > This is fine as long as all other places (like in PMU drivers) set the
> > callchain update the sample data size accordingly. If not, we can get
> > the callchain but the data size will be wrong.
>
> Good point, maybe add a helper there to ensure that code doesn't
> duplicate/diverge?
Sure, will do.
Thanks,
Namhyung
Hi Ingo,
On Tue, Jan 10, 2023 at 3:10 AM Ingo Molnar <[email protected]> wrote:
>
>
> * Peter Zijlstra <[email protected]> wrote:
>
> > On Mon, Jan 09, 2023 at 12:21:25PM -0800, Namhyung Kim wrote:
> >
> > > Looks good. But I'm confused by the tip-bot2 messages saying it's
> > > merged. Do you want me to work on it as a follow up?
> >
> > Ingo and me talked past one another, I agreed with 1/3 and he applied
> > the whole series. Just talked to him again and he's just zapped these
> > last two patches.
>
> Yeah - perf/core is now:
>
> 7bdb1767bf01 ("perf/core: Change the layout of perf_sample_data")
>
> which can be used for further work. Sorry about the confusion ...
No problem, thanks for your work!
Thanks,
Namhyung
On Mon, Jan 9, 2023 at 12:21 PM Namhyung Kim <[email protected]> wrote:
>
> Hi Peter,
>
> On Mon, Jan 09, 2023 at 01:14:31PM +0100, Peter Zijlstra wrote:
> > On Thu, Dec 29, 2022 at 12:41:00PM -0800, Namhyung Kim wrote:
> >
> > So I like the general idea; I just think it's turned into a bit of a
> > mess. That is code is already overly branchy which is known to hurt
> > performance, we should really try and not make it worse than absolutely
> > needed.
>
> Agreed.
>
> >
> > > kernel/events/core.c | 86 ++++++++++++++++++++++++++++++++------------
> > > 1 file changed, 63 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > > index eacc3702654d..70bff8a04583 100644
> > > --- a/kernel/events/core.c
> > > +++ b/kernel/events/core.c
> > > @@ -7582,14 +7582,21 @@ void perf_prepare_sample(struct perf_event_header *header,
> > > filtered_sample_type = sample_type & ~data->sample_flags;
> > > __perf_event_header__init_id(header, data, event, filtered_sample_type);
> > >
> > > - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
> > > - data->ip = perf_instruction_pointer(regs);
> > > + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
> > > + /* attr.sample_type may not have PERF_SAMPLE_IP */
> >
> > Right, but that shouldn't matter, IIRC its OK to have more bits set in
> > data->sample_flags than we have set in attr.sample_type. It just means
> > we have data available for sample types we're (possibly) not using.
> >
> > That is, I think you can simply write this like:
> >
> > > + if (!(data->sample_flags & PERF_SAMPLE_IP)) {
> > > + data->ip = perf_instruction_pointer(regs);
> > > + data->sample_flags |= PERF_SAMPLE_IP;
> > > + }
> > > + }
> >
> > if (filtered_sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) {
> > data->ip = perf_instruction_pointer(regs);
> > data->sample_flags |= PERF_SAMPLE_IP);
> > }
> >
> > ...
> >
> > if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
> > data->code_page_size = perf_get_page_size(data->ip);
> > data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
> > }
> >
> > Then after a single perf_prepare_sample() run we have:
> >
> > pre | post
> > ----------------------------------------
> > 0 | 0
> > IP | IP
> > CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
> > IP|CODE_PAGE_SIZE | IP|CODE_PAGE_SIZE
> >
> > So while data->sample_flags will have an extra bit set in the 3rd case,
> > that will not affect perf_sample_outout() which only looks at data->type
> > (== attr.sample_type).
> >
> > And since data->sample_flags will have both bits set, a second run will
> > filter out both and avoid the extra work (except doing that will mess up
> > the branch predictors).
>
> Yeah, it'd be better to check filtered_sample_type in the first place.
>
> Btw, I was thinking about a hypothetical scenario that IP set by a PMU
> driver not from the regs. In this case, having CODE_PAGE_SIZE will
> overwrite the IP. I don't think we need to worry about that for now
> since PMU drivers updates the regs (using set_linear_ip). But it seems
> like a possible scenario for something like PEBS or IBS.
Another example, but in this case a real one, is ADDR. We cannot
update data->addr just because filtered_sample_type has PHYS_ADDR or
DATA_PAGE_SIZE, as that would lose the original value.
Other than that, I'll update the other paths to minimize the branches.
Thanks,
Namhyung
On Tue, Jan 10, 2023 at 12:06:00PM -0800, Namhyung Kim wrote:
> Another example, but in this case it's real, is ADDR. We cannot update
> the data->addr just because filtered_sample_type has PHYS_ADDR or
> DATA_PAGE_SIZE as it'd lose the original value.
Hmm, how about something like so?
/*
* if (flags & s) flags |= d; // without branches
*/
static __always_inline unsigned long
__cond_set(unsigned long flags, unsigned long s, unsigned long d)
{
return flags | (d * !!(flags & s));
}
Then:
fst = sample_type;
fst = __cond_set(fst, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP);
fst = __cond_set(fst, PERF_SAMPLE_DATA_PAGE_SIZE |
PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
fst = __cond_set(fst, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER);
fst &= ~data->sample_flags;
This way we express the implicit conditions by setting the required
sample data flags, then we mask those we already have set.
After the above something like:
if (fst & PERF_SAMPLE_ADDR) {
data->addr = 0;
data->sample_flags |= PERF_SAMPLE_ADDR;
}
if (fst & PERF_SAMPLE_PHYS_ADDR) {
data->phys_addr = perf_virt_to_phys(data->addr);
data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
}
if (fst & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
}
And maybe something like:
#define __IF_SAMPLE_DATA(f_) ({ \
bool __f = fst & PERF_SAMPLE_##f_; \
if (__f) data->sample_flags |= PERF_SAMPLE_##f_;\
__f; })
#define IF_SAMPLE_DATA(f_) if (__IF_SAMPLE_DATA(f_))
Then we can write:
IF_SAMPLE_DATA(ADDR)
data->addr = 0;
IF_SAMPLE_DATA(PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
IF_SAMPLE_DATA(DATA_PAGE_SIZE)
data->data_page_size = perf_get_page_size(data->addr);
But I didn't check code-gen for this last suggestion.
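For reference, since __IF_SAMPLE_DATA() is a GNU statement expression,
IF_SAMPLE_DATA(ADDR) expands to roughly:

  if (({ bool __f = fst & PERF_SAMPLE_ADDR;
         if (__f)
                 data->sample_flags |= PERF_SAMPLE_ADDR;
         __f; }))
          data->addr = 0;

i.e. the flag update happens as a side effect of evaluating the
condition.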
On Wed, Jan 11, 2023 at 01:54:54PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 10, 2023 at 12:06:00PM -0800, Namhyung Kim wrote:
>
> > Another example, but in this case it's real, is ADDR. We cannot update
> > the data->addr just because filtered_sample_type has PHYS_ADDR or
> > DATA_PAGE_SIZE as it'd lose the original value.
>
> Hmm, how about something like so?
>
> /*
> * if (flags & s) flags |= d; // without branches
> */
> static __always_inline unsigned long
> __cond_set(unsigned long flags, unsigned long s, unsigned long d)
> {
> return flags | (d * !!(flags & s));
> }
>
> Then:
>
> fst = sample_type;
> fst = __cond_set(fst, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP);
> fst = __cond_set(fst, PERF_SAMPLE_DATA_PAGE_SIZE |
> PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
> fst = __cond_set(fst, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER);
> fst &= ~data->sample_flags;
>
Hmm, I think it's better to write this like:
static __always_inline unsigned long
__cond_set(unsigned long flags, unsigned long s, unsigned long d)
{
return d * !!(flags & s);
}
fst = sample_type;
fst |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP);
fst |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
fst |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER);
fst &= ~data->sample_flags;
Which should be identical but has fewer data dependencies and thus gives
an OoO CPU more leeway to parallelize things.
On Wed, Jan 11, 2023 at 8:45 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Jan 11, 2023 at 01:54:54PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 10, 2023 at 12:06:00PM -0800, Namhyung Kim wrote:
> >
> > > Another example, but in this case it's real, is ADDR. We cannot update
> > > the data->addr just because filtered_sample_type has PHYS_ADDR or
> > > DATA_PAGE_SIZE as it'd lose the original value.
> >
> > Hmm, how about something like so?
> >
> > /*
> > * if (flags & s) flags |= d; // without branches
> > */
> > static __always_inline unsigned long
> > __cond_set(unsigned long flags, unsigned long s, unsigned long d)
> > {
> > return flags | (d * !!(flags & s));
> > }
> >
> > Then:
> >
> > fst = sample_type;
> > fst = __cond_set(fst, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP);
> > fst = __cond_set(fst, PERF_SAMPLE_DATA_PAGE_SIZE |
> > PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
> > fst = __cond_set(fst, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER);
> > fst &= ~data->sample_flags;
> >
>
> Hmm, I think it's better to write this like:
>
> static __always_inline unsigned long
> __cond_set(unsigned long flags, unsigned long s, unsigned long d)
> {
> return d * !!(flags & s);
> }
>
> fst = sample_type;
> fst |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, PERF_SAMPLE_IP);
> fst |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
> PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
> fst |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, PERF_SAMPLE_REGS_USER);
> fst &= ~data->sample_flags;
>
> Which should be identical but has less data dependencies and thus gives
> an OoO CPU more leaway to paralleize things.
Looks good. Let me try this.
Thanks,
Namhyung