2021-11-02 18:08:00

by German Gomez

[permalink] [raw]
Subject: [PATCH 0/3] perf arm-spe: Track pid/tid for Arm SPE samples

The following patchset is an iteration on RFC [1] where pid/tid info is
assigned to the Arm SPE synthesized samples. Two methods of tracking
pids are considered: hardware-based (using Arm SPE CONTEXT packets), and
context-switch events (from perf) as fallback.

- Patch #1 enables pid tracking using RECORD_SWITCH* events from perf.
- Patch #2 saves the value of SPE CONTEXT packet to the arm_spe_record
struct (patch from [2]).
- Patch #3 enables hardware-based pid tracking using SPE CONTEXT packets.

[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://www.spinics.net/lists/linux-perf-users/msg12543.html

German Gomez (3):
perf arm-spe: Track task context switch for cpu-mode events
perf arm-spe: Save context ID in record
perf arm-spe: Support hardware-based PID tracing

tools/perf/arch/arm64/util/arm-spe.c | 6 +-
.../util/arm-spe-decoder/arm-spe-decoder.c | 2 +
.../util/arm-spe-decoder/arm-spe-decoder.h | 1 +
tools/perf/util/arm-spe.c | 144 ++++++++++++++----
4 files changed, 123 insertions(+), 30 deletions(-)

--
2.25.1



2021-11-02 18:08:10

by German Gomez

[permalink] [raw]
Subject: [PATCH 2/3] perf arm-spe: Save context ID in record

This patch is to save context ID in record, this will be used to set TID
for samples.

Signed-off-by: Leo Yan <[email protected]>
Signed-off-by: James Clark <[email protected]>
Signed-off-by: German Gomez <[email protected]>
---
tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
2 files changed, 3 insertions(+)

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 32fe41835..1b58859d2 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -151,6 +151,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
u64 payload, ip;

memset(&decoder->record, 0x0, sizeof(decoder->record));
+ decoder->record.context_id = -1;

while (1) {
err = arm_spe_get_next_packet(decoder);
@@ -180,6 +181,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
case ARM_SPE_COUNTER:
break;
case ARM_SPE_CONTEXT:
+ decoder->record.context_id = payload;
break;
case ARM_SPE_OP_TYPE:
if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 59bdb7309..46a8556a9 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -38,6 +38,7 @@ struct arm_spe_record {
u64 timestamp;
u64 virt_addr;
u64 phys_addr;
+ u64 context_id;
};

struct arm_spe_insn;
--
2.25.1


2021-11-02 18:08:16

by German Gomez

[permalink] [raw]
Subject: [PATCH 3/3] perf arm-spe: Support hardware-based PID tracing

If Arm SPE traces contain CONTEXT packets with PID info, use these
values for tracking pid of samples. Otherwise fall back to using context
switch events and display a message warning the user of possible timing
inaccuracies [1].

[1] https://lore.kernel.org/lkml/[email protected]/

Signed-off-by: German Gomez <[email protected]>
---
tools/perf/util/arm-spe.c | 123 ++++++++++++++++++++++++++++----------
1 file changed, 92 insertions(+), 31 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 230bc7ab2..00a409469 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -71,6 +71,7 @@ struct arm_spe {
u64 kernel_start;

unsigned long num_events;
+ u8 use_ctx_pkt_for_pid;
};

struct arm_spe_queue {
@@ -226,6 +227,44 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
PERF_RECORD_MISC_USER;
}

+static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
+ struct auxtrace_queue *queue)
+{
+ struct arm_spe_queue *speq = queue->priv;
+ pid_t tid;
+
+ tid = machine__get_current_tid(spe->machine, speq->cpu);
+ if (tid != -1) {
+ speq->tid = tid;
+ thread__zput(speq->thread);
+ } else
+ speq->tid = queue->tid;
+
+ if ((!speq->thread) && (speq->tid != -1)) {
+ speq->thread = machine__find_thread(spe->machine, -1,
+ speq->tid);
+ }
+
+ if (speq->thread) {
+ speq->pid = speq->thread->pid_;
+ if (queue->cpu == -1)
+ speq->cpu = speq->thread->cpu;
+ }
+}
+
+static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
+{
+ struct arm_spe *spe = speq->spe;
+ int err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
+
+ if (err)
+ return err;
+
+ arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
+
+ return 0;
+}
+
static void arm_spe_prep_sample(struct arm_spe *spe,
struct arm_spe_queue *speq,
union perf_event *event,
@@ -460,6 +499,13 @@ static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
* can correlate samples between Arm SPE trace data and other
* perf events with correct time ordering.
*/
+
+ if (spe->use_ctx_pkt_for_pid) {
+ ret = arm_spe_set_tid(speq, speq->decoder->record.context_id);
+ if (ret)
+ return ret;
+ }
+
ret = arm_spe_sample(speq);
if (ret)
return ret;
@@ -586,31 +632,6 @@ static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
return timeless_decoding;
}

-static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
- struct auxtrace_queue *queue)
-{
- struct arm_spe_queue *speq = queue->priv;
- pid_t tid;
-
- tid = machine__get_current_tid(spe->machine, speq->cpu);
- if (tid != -1) {
- speq->tid = tid;
- thread__zput(speq->thread);
- } else
- speq->tid = queue->tid;
-
- if ((!speq->thread) && (speq->tid != -1)) {
- speq->thread = machine__find_thread(spe->machine, -1,
- speq->tid);
- }
-
- if (speq->thread) {
- speq->pid = speq->thread->pid_;
- if (queue->cpu == -1)
- speq->cpu = speq->thread->cpu;
- }
-}
-
static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
{
unsigned int queue_nr;
@@ -641,7 +662,13 @@ static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
ts = timestamp;
}

- arm_spe_set_pid_tid_cpu(spe, queue);
+ /*
+ * Here we only consider PID tracking based on switch events.
+ * For tracking based on CONTEXT packets, the pid is assigned in the function
+ * arm_spe_run_decoder() in order to support timeless decoding.
+ */
+ if (!spe->use_ctx_pkt_for_pid)
+ arm_spe_set_pid_tid_cpu(spe, queue);

ret = arm_spe_run_decoder(speq, &ts);
if (ret < 0) {
@@ -740,8 +767,9 @@ static int arm_spe_process_event(struct perf_session *session,
if (err)
return err;

- if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
- event->header.type == PERF_RECORD_SWITCH)
+ if (!spe->use_ctx_pkt_for_pid &&
+ (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
+ event->header.type == PERF_RECORD_SWITCH))
err = arm_spe_context_switch(spe, event, sample);
}

@@ -805,10 +833,16 @@ static int arm_spe_flush(struct perf_session *session __maybe_unused,
return ret;

if (spe->timeless_decoding)
- return arm_spe_process_timeless_queues(spe, -1,
+ ret = arm_spe_process_timeless_queues(spe, -1,
MAX_TIMESTAMP - 1);
+ else
+ ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);

- return arm_spe_process_queues(spe, MAX_TIMESTAMP);
+ if (!spe->use_ctx_pkt_for_pid)
+ ui__warning("Arm SPE CONTEXT packets not found in the traces.\n\n"
+ "Matching of TIDs to SPE events could be inaccurate.\n\n");
+
+ return ret;
}

static void arm_spe_free_queue(void *priv)
@@ -1056,6 +1090,22 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
return 0;
}

+static bool arm_spe_is_ctx_pkt_enabled(struct arm_spe *spe)
+{
+ struct auxtrace_queues *queues = &spe->queues;
+ unsigned int i;
+
+ for (i = 0; i < queues->nr_queues; i++) {
+ struct auxtrace_queue *queue = &spe->queues.queue_array[i];
+ struct arm_spe_queue *speq = queue->priv;
+
+ if (speq)
+ return speq->decoder->record.context_id != (u64) -1;
+ }
+
+ return false;
+}
+
int arm_spe_process_auxtrace_info(union perf_event *event,
struct perf_session *session)
{
@@ -1131,9 +1181,20 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
if (err)
goto err_free_queues;

- if (spe->queues.populated)
+ if (spe->queues.populated) {
spe->data_queued = true;

+ /*
+ * Ensure the first record of every queue can be read in the function
+ * arm_spe_is_ctx_pkt_enabled()
+ */
+ err = arm_spe__update_queues(spe);
+ if (err)
+ goto err_free_queues;
+
+ spe->use_ctx_pkt_for_pid = arm_spe_is_ctx_pkt_enabled(spe);
+ }
+
return 0;

err_free_queues:
--
2.25.1


2021-11-06 03:32:23

by Leo Yan

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf arm-spe: Save context ID in record

On Tue, Nov 02, 2021 at 06:07:38PM +0000, German Gomez wrote:
> This patch is to save context ID in record, this will be used to set TID
> for samples.
>
> Signed-off-by: Leo Yan <[email protected]>
> Signed-off-by: James Clark <[email protected]>
> Signed-off-by: German Gomez <[email protected]>

Reviewed-by: Leo Yan <[email protected]>

> ---
> tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
> tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
> 2 files changed, 3 insertions(+)
>
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> index 32fe41835..1b58859d2 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> @@ -151,6 +151,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
> u64 payload, ip;
>
> memset(&decoder->record, 0x0, sizeof(decoder->record));
> + decoder->record.context_id = -1;
>
> while (1) {
> err = arm_spe_get_next_packet(decoder);
> @@ -180,6 +181,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
> case ARM_SPE_COUNTER:
> break;
> case ARM_SPE_CONTEXT:
> + decoder->record.context_id = payload;
> break;
> case ARM_SPE_OP_TYPE:
> if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> index 59bdb7309..46a8556a9 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> @@ -38,6 +38,7 @@ struct arm_spe_record {
> u64 timestamp;
> u64 virt_addr;
> u64 phys_addr;
> + u64 context_id;
> };
>
> struct arm_spe_insn;
> --
> 2.25.1
>

2021-11-06 13:47:41

by Leo Yan

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf arm-spe: Save context ID in record

On Tue, Nov 02, 2021 at 06:07:38PM +0000, German Gomez wrote:
> This patch is to save context ID in record, this will be used to set TID
> for samples.
>
> Signed-off-by: Leo Yan <[email protected]>
> Signed-off-by: James Clark <[email protected]>
> Signed-off-by: German Gomez <[email protected]>
> ---
> tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
> tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
> 2 files changed, 3 insertions(+)
>
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> index 32fe41835..1b58859d2 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
> @@ -151,6 +151,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
> u64 payload, ip;
>
> memset(&decoder->record, 0x0, sizeof(decoder->record));
> + decoder->record.context_id = -1;

Since 'context_id' is type u64, here it's good to assign '(u64)-1'.

> while (1) {
> err = arm_spe_get_next_packet(decoder);
> @@ -180,6 +181,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
> case ARM_SPE_COUNTER:
> break;
> case ARM_SPE_CONTEXT:
> + decoder->record.context_id = payload;
> break;
> case ARM_SPE_OP_TYPE:
> if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> index 59bdb7309..46a8556a9 100644
> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
> @@ -38,6 +38,7 @@ struct arm_spe_record {
> u64 timestamp;
> u64 virt_addr;
> u64 phys_addr;
> + u64 context_id;
> };
>
> struct arm_spe_insn;
> --
> 2.25.1
>

2021-11-06 14:58:11

by Leo Yan

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf arm-spe: Support hardware-based PID tracing

Hi German,

On Tue, Nov 02, 2021 at 06:07:39PM +0000, German Gomez wrote:
> If Arm SPE traces contain CONTEXT packets with PID info, use these
> values for tracking pid of samples. Otherwise fall back to using context
> switch events and display a message warning the user of possible timing
> inaccuracies [1].
>
> [1] https://lore.kernel.org/lkml/[email protected]/

First of all, I'd like to clarify one thing:

The pid/tid has been supported for 'per-thread' mode, the function
arm_spe_process_timeless_queues() invokes arm_spe_set_pid_tid_cpu() to
initialize 'speq->tid' and assign auxtrace_queue's tid to it.

Thus, in this patch set we only need to consider support context packet
for CPU wide tracing and system wide tracing. The following comments
are heavily based on this assumption.

> Signed-off-by: German Gomez <[email protected]>
> ---
> tools/perf/util/arm-spe.c | 123 ++++++++++++++++++++++++++++----------
> 1 file changed, 92 insertions(+), 31 deletions(-)
>
> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
> index 230bc7ab2..00a409469 100644
> --- a/tools/perf/util/arm-spe.c
> +++ b/tools/perf/util/arm-spe.c
> @@ -71,6 +71,7 @@ struct arm_spe {
> u64 kernel_start;
>
> unsigned long num_events;
> + u8 use_ctx_pkt_for_pid;
> };
>
> struct arm_spe_queue {
> @@ -226,6 +227,44 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
> PERF_RECORD_MISC_USER;
> }
>
> +static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
> + struct auxtrace_queue *queue)
> +{
> + struct arm_spe_queue *speq = queue->priv;
> + pid_t tid;
> +
> + tid = machine__get_current_tid(spe->machine, speq->cpu);
> + if (tid != -1) {
> + speq->tid = tid;
> + thread__zput(speq->thread);
> + } else
> + speq->tid = queue->tid;
> +
> + if ((!speq->thread) && (speq->tid != -1)) {
> + speq->thread = machine__find_thread(spe->machine, -1,
> + speq->tid);
> + }
> +
> + if (speq->thread) {
> + speq->pid = speq->thread->pid_;
> + if (queue->cpu == -1)
> + speq->cpu = speq->thread->cpu;
> + }
> +}
> +
> +static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
> +{
> + struct arm_spe *spe = speq->spe;
> + int err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
> +
> + if (err)
> + return err;
> +
> + arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
> +
> + return 0;
> +}
> +
> static void arm_spe_prep_sample(struct arm_spe *spe,
> struct arm_spe_queue *speq,
> union perf_event *event,
> @@ -460,6 +499,13 @@ static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
> * can correlate samples between Arm SPE trace data and other
> * perf events with correct time ordering.
> */
> +
> + if (spe->use_ctx_pkt_for_pid) {
> + ret = arm_spe_set_tid(speq, speq->decoder->record.context_id);
> + if (ret)
> + return ret;
> + }
> +

If trace contains context packet, we can give it the priority. So at
here we can always update tid based on the latest context_id from the
context packet.

And for 'per thread' mode, we should not set pid/tid for it. The
reason is arm_spe_set_tid() will return error for 'per thread' mode,
because the 'speq->cpu' is -1, so it cannot set tid/pid for CPU '-1'.

And if we detect there have context packet is incoming, we should set
the flag 'spe->use_ctx_pkt_for_pid' true.

How about below code:

/* Update pid/tid info */
record = &speq->decoder->record;
if (!spe->timeless_decoding && record->context_id != (u64)-1) {
ret = arm_spe_set_tid(speq, record->context_id);
if (ret)
return ret;

spe->use_ctx_pkt_for_pid = 1;
}

> ret = arm_spe_sample(speq);
> if (ret)
> return ret;
> @@ -586,31 +632,6 @@ static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
> return timeless_decoding;
> }
>
> -static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
> - struct auxtrace_queue *queue)
> -{
> - struct arm_spe_queue *speq = queue->priv;
> - pid_t tid;
> -
> - tid = machine__get_current_tid(spe->machine, speq->cpu);
> - if (tid != -1) {
> - speq->tid = tid;
> - thread__zput(speq->thread);
> - } else
> - speq->tid = queue->tid;
> -
> - if ((!speq->thread) && (speq->tid != -1)) {
> - speq->thread = machine__find_thread(spe->machine, -1,
> - speq->tid);
> - }
> -
> - if (speq->thread) {
> - speq->pid = speq->thread->pid_;
> - if (queue->cpu == -1)
> - speq->cpu = speq->thread->cpu;
> - }
> -}
> -
> static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
> {
> unsigned int queue_nr;
> @@ -641,7 +662,13 @@ static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
> ts = timestamp;
> }
>
> - arm_spe_set_pid_tid_cpu(spe, queue);
> + /*
> + * Here we only consider PID tracking based on switch events.
> + * For tracking based on CONTEXT packets, the pid is assigned in the function
> + * arm_spe_run_decoder() in order to support timeless decoding.
> + */
> + if (!spe->use_ctx_pkt_for_pid)
> + arm_spe_set_pid_tid_cpu(spe, queue);

Yeah, this is right; if without context packet, we need to update thread
context at this point.

Could you refine the comment like something:

"The switch events has set pid/tid in the machine's context, here we
update the thread and pid/tid info for spe queue."

> ret = arm_spe_run_decoder(speq, &ts);
> if (ret < 0) {
> @@ -740,8 +767,9 @@ static int arm_spe_process_event(struct perf_session *session,
> if (err)
> return err;
>
> - if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
> - event->header.type == PERF_RECORD_SWITCH)
> + if (!spe->use_ctx_pkt_for_pid &&
> + (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
> + event->header.type == PERF_RECORD_SWITCH))
>
> err = arm_spe_context_switch(spe, event, sample);
> }
>
> @@ -805,10 +833,16 @@ static int arm_spe_flush(struct perf_session *session __maybe_unused,
> return ret;
>
> if (spe->timeless_decoding)
> - return arm_spe_process_timeless_queues(spe, -1,
> + ret = arm_spe_process_timeless_queues(spe, -1,
> MAX_TIMESTAMP - 1);
> + else
> + ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
>
> - return arm_spe_process_queues(spe, MAX_TIMESTAMP);
> + if (!spe->use_ctx_pkt_for_pid)
> + ui__warning("Arm SPE CONTEXT packets not found in the traces.\n\n"
> + "Matching of TIDs to SPE events could be inaccurate.\n\n");

I think we only need to report the warning for no timeless case and
it's not necessary to change code for timeless decoding, thus the
change could be:

ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
if (!spe->use_ctx_pkt_for_pid)
ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
"Matching of TIDs to SPE events could be inaccurate.\n");

> +
> + return ret;
> }
>
> static void arm_spe_free_queue(void *priv)
> @@ -1056,6 +1090,22 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
> return 0;
> }
>
> +static bool arm_spe_is_ctx_pkt_enabled(struct arm_spe *spe)
> +{
> + struct auxtrace_queues *queues = &spe->queues;
> + unsigned int i;
> +
> + for (i = 0; i < queues->nr_queues; i++) {
> + struct auxtrace_queue *queue = &spe->queues.queue_array[i];
> + struct arm_spe_queue *speq = queue->priv;
> +
> + if (speq)
> + return speq->decoder->record.context_id != (u64) -1;
> + }
> +
> + return false;
> +}
> +
> int arm_spe_process_auxtrace_info(union perf_event *event,
> struct perf_session *session)
> {
> @@ -1131,9 +1181,20 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
> if (err)
> goto err_free_queues;
>
> - if (spe->queues.populated)
> + if (spe->queues.populated) {
> spe->data_queued = true;
>
> + /*
> + * Ensure the first record of every queue can be read in the function
> + * arm_spe_is_ctx_pkt_enabled()
> + */
> + err = arm_spe__update_queues(spe);
> + if (err)
> + goto err_free_queues;
> +
> + spe->use_ctx_pkt_for_pid = arm_spe_is_ctx_pkt_enabled(spe);

I don't think this change is needed.

arm_spe__setup_queue() will start to decode and it returns back the
first record; then function arm_spe_run_decoder() will check
'record->context_id', if detect 'context_id' is not '-1' it will set
'spe->use_ctx_pkt_for_pid' as true.

So here we don't need to invoke arm_spe__update_queues(). And I have
some concern that this might introduce other potential issue,
especially the callback process_auxtrace_info() usually is only used
for early initializatoin rather than trace data decoding.

Thanks,
Leo

> + }
> +
> return 0;
>
> err_free_queues:
> --
> 2.25.1
>

2021-11-09 10:41:47

by German Gomez

[permalink] [raw]
Subject: Re: [PATCH 2/3] perf arm-spe: Save context ID in record


On 06/11/2021 13:47, Leo Yan wrote:
> On Tue, Nov 02, 2021 at 06:07:38PM +0000, German Gomez wrote:
>> This patch is to save context ID in record, this will be used to set TID
>> for samples.
>>
>> Signed-off-by: Leo Yan <[email protected]>
>> Signed-off-by: James Clark <[email protected]>
>> Signed-off-by: German Gomez <[email protected]>
>> ---
>> tools/perf/util/arm-spe-decoder/arm-spe-decoder.c | 2 ++
>> tools/perf/util/arm-spe-decoder/arm-spe-decoder.h | 1 +
>> 2 files changed, 3 insertions(+)
>>
>> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
>> index 32fe41835..1b58859d2 100644
>> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
>> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
>> @@ -151,6 +151,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
>> u64 payload, ip;
>>
>> memset(&decoder->record, 0x0, sizeof(decoder->record));
>> + decoder->record.context_id = -1;
> Since 'context_id' is type u64, here it's good to assign '(u64)-1'.

Ack.

Thanks,
German

>
>> while (1) {
>> err = arm_spe_get_next_packet(decoder);
>> @@ -180,6 +181,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
>> case ARM_SPE_COUNTER:
>> break;
>> case ARM_SPE_CONTEXT:
>> + decoder->record.context_id = payload;
>> break;
>> case ARM_SPE_OP_TYPE:
>> if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) {
>> diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> index 59bdb7309..46a8556a9 100644
>> --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
>> @@ -38,6 +38,7 @@ struct arm_spe_record {
>> u64 timestamp;
>> u64 virt_addr;
>> u64 phys_addr;
>> + u64 context_id;
>> };
>>
>> struct arm_spe_insn;
>> --
>> 2.25.1
>>

2021-11-09 11:15:10

by German Gomez

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf arm-spe: Support hardware-based PID tracing

Hi Leo,

Thanks for the review.

On 06/11/2021 14:57, Leo Yan wrote:
> Hi German,
>
> On Tue, Nov 02, 2021 at 06:07:39PM +0000, German Gomez wrote:
>> If Arm SPE traces contain CONTEXT packets with PID info, use these
>> values for tracking pid of samples. Otherwise fall back to using context
>> switch events and display a message warning the user of possible timing
>> inaccuracies [1].
>>
>> [1] https://lore.kernel.org/lkml/[email protected]/
> First of all, I'd like to clarify one thing:
>
> The pid/tid has been supported for 'per-thread' mode, the function
> arm_spe_process_timeless_queues() invokes arm_spe_set_pid_tid_cpu() to
> initialize 'speq->tid' and assign auxtrace_queue's tid to it.

Ack.

I forgot to validate per-thread mode and didn't realize it was using the
timeless flow.

>
> Thus, in this patch set we only need to consider support context packet
> for CPU wide tracing and system wide tracing. The following comments
> are heavily based on this assumption.
>
>> Signed-off-by: German Gomez <[email protected]>
>> ---
>> tools/perf/util/arm-spe.c | 123 ++++++++++++++++++++++++++++----------
>> 1 file changed, 92 insertions(+), 31 deletions(-)
>>
>> diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
>> index 230bc7ab2..00a409469 100644
>> --- a/tools/perf/util/arm-spe.c
>> +++ b/tools/perf/util/arm-spe.c
>> @@ -71,6 +71,7 @@ struct arm_spe {
>> u64 kernel_start;
>>
>> unsigned long num_events;
>> + u8 use_ctx_pkt_for_pid;
>> };
>>
>> struct arm_spe_queue {
>> @@ -226,6 +227,44 @@ static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
>> PERF_RECORD_MISC_USER;
>> }
>>
>> +static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
>> + struct auxtrace_queue *queue)
>> +{
>> + struct arm_spe_queue *speq = queue->priv;
>> + pid_t tid;
>> +
>> + tid = machine__get_current_tid(spe->machine, speq->cpu);
>> + if (tid != -1) {
>> + speq->tid = tid;
>> + thread__zput(speq->thread);
>> + } else
>> + speq->tid = queue->tid;
>> +
>> + if ((!speq->thread) && (speq->tid != -1)) {
>> + speq->thread = machine__find_thread(spe->machine, -1,
>> + speq->tid);
>> + }
>> +
>> + if (speq->thread) {
>> + speq->pid = speq->thread->pid_;
>> + if (queue->cpu == -1)
>> + speq->cpu = speq->thread->cpu;
>> + }
>> +}
>> +
>> +static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
>> +{
>> + struct arm_spe *spe = speq->spe;
>> + int err = machine__set_current_tid(spe->machine, speq->cpu, tid, tid);
>> +
>> + if (err)
>> + return err;
>> +
>> + arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
>> +
>> + return 0;
>> +}
>> +
>> static void arm_spe_prep_sample(struct arm_spe *spe,
>> struct arm_spe_queue *speq,
>> union perf_event *event,
>> @@ -460,6 +499,13 @@ static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
>> * can correlate samples between Arm SPE trace data and other
>> * perf events with correct time ordering.
>> */
>> +
>> + if (spe->use_ctx_pkt_for_pid) {
>> + ret = arm_spe_set_tid(speq, speq->decoder->record.context_id);
>> + if (ret)
>> + return ret;
>> + }
>> +
> If trace contains context packet, we can give it the priority. So at
> here we can always update tid based on the latest context_id from the
> context packet.
>
> And for 'per thread' mode, we should not set pid/tid for it. The
> reason is arm_spe_set_tid() will return error for 'per thread' mode,
> because the 'speq->cpu' is -1, so it cannot set tid/pid for CPU '-1'.
>
> And if we detect there have context packet is incoming, we should set
> the flag 'spe->use_ctx_pkt_for_pid' true.
>
> How about below code:
>
> /* Update pid/tid info */
> record = &speq->decoder->record;
> if (!spe->timeless_decoding && record->context_id != (u64)-1) {
> ret = arm_spe_set_tid(speq, record->context_id);
> if (ret)
> return ret;
>
> spe->use_ctx_pkt_for_pid = 1;
> }

Yeah it looks good. The patch defined the two method as strictly
mutually exclusive, but I prefer this way better which is also more in
line with the previous RFC. Thanks for the suggestion.

>> ret = arm_spe_sample(speq);
>> if (ret)
>> return ret;
>> @@ -586,31 +632,6 @@ static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
>> return timeless_decoding;
>> }
>>
>> -static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
>> - struct auxtrace_queue *queue)
>> -{
>> - struct arm_spe_queue *speq = queue->priv;
>> - pid_t tid;
>> -
>> - tid = machine__get_current_tid(spe->machine, speq->cpu);
>> - if (tid != -1) {
>> - speq->tid = tid;
>> - thread__zput(speq->thread);
>> - } else
>> - speq->tid = queue->tid;
>> -
>> - if ((!speq->thread) && (speq->tid != -1)) {
>> - speq->thread = machine__find_thread(spe->machine, -1,
>> - speq->tid);
>> - }
>> -
>> - if (speq->thread) {
>> - speq->pid = speq->thread->pid_;
>> - if (queue->cpu == -1)
>> - speq->cpu = speq->thread->cpu;
>> - }
>> -}
>> -
>> static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
>> {
>> unsigned int queue_nr;
>> @@ -641,7 +662,13 @@ static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
>> ts = timestamp;
>> }
>>
>> - arm_spe_set_pid_tid_cpu(spe, queue);
>> + /*
>> + * Here we only consider PID tracking based on switch events.
>> + * For tracking based on CONTEXT packets, the pid is assigned in the function
>> + * arm_spe_run_decoder() in order to support timeless decoding.
>> + */
>> + if (!spe->use_ctx_pkt_for_pid)
>> + arm_spe_set_pid_tid_cpu(spe, queue);
> Yeah, this is right; if without context packet, we need to update thread
> context at this point.
>
> Could you refine the comment like something:
>
> "The switch events has set pid/tid in the machine's context, here we
> update the thread and pid/tid info for spe queue."

Ack.

>
>> ret = arm_spe_run_decoder(speq, &ts);
>> if (ret < 0) {
>> @@ -740,8 +767,9 @@ static int arm_spe_process_event(struct perf_session *session,
>> if (err)
>> return err;
>>
>> - if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
>> - event->header.type == PERF_RECORD_SWITCH)
>> + if (!spe->use_ctx_pkt_for_pid &&
>> + (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
>> + event->header.type == PERF_RECORD_SWITCH))
>>
>> err = arm_spe_context_switch(spe, event, sample);
>> }
>>
>> @@ -805,10 +833,16 @@ static int arm_spe_flush(struct perf_session *session __maybe_unused,
>> return ret;
>>
>> if (spe->timeless_decoding)
>> - return arm_spe_process_timeless_queues(spe, -1,
>> + ret = arm_spe_process_timeless_queues(spe, -1,
>> MAX_TIMESTAMP - 1);
>> + else
>> + ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
>>
>> - return arm_spe_process_queues(spe, MAX_TIMESTAMP);
>> + if (!spe->use_ctx_pkt_for_pid)
>> + ui__warning("Arm SPE CONTEXT packets not found in the traces.\n\n"
>> + "Matching of TIDs to SPE events could be inaccurate.\n\n");
> I think we only need to report the warning for no timeless case and
> it's not necessary to change code for timeless decoding, thus the
> change could be:
>
> ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
> if (!spe->use_ctx_pkt_for_pid)
> ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
> "Matching of TIDs to SPE events could be inaccurate.\n");

Ack (I will also return early if ret != 0).

>
>> +
>> + return ret;
>> }
>>
>> static void arm_spe_free_queue(void *priv)
>> @@ -1056,6 +1090,22 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
>> return 0;
>> }
>>
>> +static bool arm_spe_is_ctx_pkt_enabled(struct arm_spe *spe)
>> +{
>> + struct auxtrace_queues *queues = &spe->queues;
>> + unsigned int i;
>> +
>> + for (i = 0; i < queues->nr_queues; i++) {
>> + struct auxtrace_queue *queue = &spe->queues.queue_array[i];
>> + struct arm_spe_queue *speq = queue->priv;
>> +
>> + if (speq)
>> + return speq->decoder->record.context_id != (u64) -1;
>> + }
>> +
>> + return false;
>> +}
>> +
>> int arm_spe_process_auxtrace_info(union perf_event *event,
>> struct perf_session *session)
>> {
>> @@ -1131,9 +1181,20 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
>> if (err)
>> goto err_free_queues;
>>
>> - if (spe->queues.populated)
>> + if (spe->queues.populated) {
>> spe->data_queued = true;
>>
>> + /*
>> + * Ensure the first record of every queue can be read in the function
>> + * arm_spe_is_ctx_pkt_enabled()
>> + */
>> + err = arm_spe__update_queues(spe);
>> + if (err)
>> + goto err_free_queues;
>> +
>> + spe->use_ctx_pkt_for_pid = arm_spe_is_ctx_pkt_enabled(spe);
> I don't think this change is needed.
>
> arm_spe__setup_queue() will start to decode and it returns back the
> first record; then function arm_spe_run_decoder() will check
> 'record->context_id', if detect 'context_id' is not '-1' it will set
> 'spe->use_ctx_pkt_for_pid' as true.
>
> So here we don't need to invoke arm_spe__update_queues(). And I have
> some concern that this might introduce other potential issue,
> especially the callback process_auxtrace_info() usually is only used
> for early initializatoin rather than trace data decoding.

Ack, with the above change in arm_spe_run_decoder, this is no longer
needed.

Thanks,
German

>
> Thanks,
> Leo
>
>> + }
>> +
>> return 0;
>>
>> err_free_queues:
>> --
>> 2.25.1
>>