LinuxLists.cc - [PATCH 0/4] perf cs-etm: Track exception level

[permalink] [raw]

Subject: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

Branch samples currently use the IP of the previous packet as the from
IP, and the IP of the current packet as the to IP. But it incorrectly
uses the current thread. In some cases like a jump into a different
exception level this will attribute to the incorrect process.

Fix it by tracking the previous thread in the same way the previous
packet is tracked.

Signed-off-by: James Clark <[email protected]>
---
tools/perf/util/cs-etm.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index ebffc9052561..a997fe79d458 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -86,6 +86,7 @@ struct cs_etm_traceid_queue {
size_t last_branch_pos;
union perf_event *event_buf;
struct thread *thread;
+ struct thread *prev_thread;
struct branch_stack *last_branch;
struct branch_stack *last_branch_rb;
struct cs_etm_packet *prev_packet;
@@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
tidq->trace_chan_id = trace_chan_id;
tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
queue->tid);
+ tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);

tidq->packet = zalloc(sizeof(struct cs_etm_packet));
if (!tidq->packet)
@@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
tmp = tidq->packet;
tidq->packet = tidq->prev_packet;
tidq->prev_packet = tmp;
+ thread__put(tidq->prev_thread);
+ tidq->prev_thread = thread__get(tidq->thread);
}
}

@@ -791,6 +795,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
/* Free this traceid_queue from the array */
tidq = etmq->traceid_queues[idx];
thread__zput(tidq->thread);
+ thread__zput(tidq->prev_thread);
zfree(&tidq->event_buf);
zfree(&tidq->last_branch);
zfree(&tidq->last_branch_rb);
@@ -1450,8 +1455,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
sample.time = cs_etm__resolve_sample_time(etmq, tidq);

sample.ip = ip;
- sample.pid = tidq->thread->pid_;
- sample.tid = tidq->thread->tid;
+ sample.pid = tidq->prev_thread->pid_;
+ sample.tid = tidq->prev_thread->tid;
sample.addr = cs_etm__first_executed_instr(tidq->packet);
sample.id = etmq->etm->branches_id;
sample.stream_id = etmq->etm->branches_id;
--
2.34.1

2023-05-24 13:24:44

[permalink] [raw]

Subject: [PATCH 3/4] perf cs-etm: Track exception level

Currently we assume all trace belongs to the host machine so when
the decoder should be looking at the guest kernel maps it can crash
because it looks at the host ones instead.

Avoid one scenario (guest kernel running at EL1) by assigning the
default guest machine to this trace. For userspace trace it's still not
possible to determine guest vs host, but the PIDs should help in this
case.

Signed-off-by: James Clark <[email protected]>
---
.../perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 +-
tools/perf/util/cs-etm.c | 64 ++++++++++++++-----
tools/perf/util/cs-etm.h | 5 +-
3 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 82a27ab90c8b..ac227cd03eb0 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -573,12 +573,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
break;
}

+ if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id,
+ elem->context.exception_level))
+ return OCSD_RESP_FATAL_SYS_ERR;
+
if (tid == -1)
return OCSD_RESP_CONT;

- if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
- return OCSD_RESP_FATAL_SYS_ERR;
-
/*
* A timestamp is generated after a PE_CONTEXT element so make sure
* to rely on that coming one.
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a997fe79d458..b9ba19327f26 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -14,7 +14,6 @@
#include <linux/types.h>
#include <linux/zalloc.h>

-#include <opencsd/ocsd_if_types.h>
#include <stdlib.h>

#include "auxtrace.h"
@@ -87,6 +86,8 @@ struct cs_etm_traceid_queue {
union perf_event *event_buf;
struct thread *thread;
struct thread *prev_thread;
+ ocsd_ex_level prev_el;
+ ocsd_ex_level el;
struct branch_stack *last_branch;
struct branch_stack *last_branch_rb;
struct cs_etm_packet *prev_packet;
@@ -479,6 +480,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,

queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
tidq->trace_chan_id = trace_chan_id;
+ tidq->el = tidq->prev_el = ocsd_EL_unknown;
tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
queue->tid);
tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
@@ -618,6 +620,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
tmp = tidq->packet;
tidq->packet = tidq->prev_packet;
tidq->prev_packet = tmp;
+ tidq->prev_el = tidq->el;
thread__put(tidq->prev_thread);
tidq->prev_thread = thread__get(tidq->thread);
}
@@ -879,11 +882,34 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session,
return evsel->core.attr.type == aux->pmu_type;
}

-static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
+static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
+ ocsd_ex_level el)
{
- struct machine *machine;
+ /*
+ * Not perfect, but assume anything in EL1 is the default guest, and
+ * everything else is the host. nHVE and pKVM may not work with this
+ * assumption. And distinguishing between guest and host userspaces
+ * isn't currently supported either. Neither is multiple guest support.
+ * All this does is reduce the likeliness of decode errors where we look
+ * into the host kernel maps when it should have been the guest maps.
+ */
+ switch (el) {
+ case ocsd_EL1:
+ return machines__find_guest(&etm->session->machines,
+ DEFAULT_GUEST_KERNEL_ID);
+ case ocsd_EL3:
+ case ocsd_EL2:
+ case ocsd_EL0:
+ case ocsd_EL_unknown:
+ default:
+ return &etm->session->machines.host;
+ }
+}

- machine = &etmq->etm->session->machines.host;
+static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
+ ocsd_ex_level el)
+{
+ struct machine *machine = cs_etm__get_machine(etmq->etm, el);

if (address >= machine__kernel_start(machine)) {
if (machine__is_host(machine))
@@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
} else {
if (machine__is_host(machine))
return PERF_RECORD_MISC_USER;
- else if (perf_guest)
+ else {
+ /*
+ * Can't really happen at the moment because
+ * cs_etm__get_machine() will always return
+ * machines.host for any non EL1 trace.
+ */
return PERF_RECORD_MISC_GUEST_USER;
- else
- return PERF_RECORD_MISC_HYPERVISOR;
+ }
}
}

@@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
if (!etmq)
return 0;

- cpumode = cs_etm__cpu_mode(etmq, address);
tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
if (!tidq)
return 0;

+ cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
+
if (!thread__find_map(tidq->thread, cpumode, address, &al))
return 0;

@@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
}

static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
- struct cs_etm_traceid_queue *tidq, pid_t tid)
+ struct cs_etm_traceid_queue *tidq, pid_t tid,
+ ocsd_ex_level el)
{
- struct machine *machine = &etm->session->machines.host;
+ struct machine *machine = cs_etm__get_machine(etm, el);

if (tid != -1) {
thread__zput(tidq->thread);
@@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
/* Couldn't find a known thread */
if (!tidq->thread)
tidq->thread = machine__idle_thread(machine);
+
+ tidq->el = el;
}

-int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
- pid_t tid, u8 trace_chan_id)
+int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
+ ocsd_ex_level el)
{
struct cs_etm_traceid_queue *tidq;

@@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
if (!tidq)
return -EINVAL;

- cs_etm__set_thread(etmq->etm, tidq, tid);
+ cs_etm__set_thread(etmq->etm, tidq, tid, el);
return 0;
}

@@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
struct perf_sample sample = {.ip = 0,};

event->sample.header.type = PERF_RECORD_SAMPLE;
- event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
+ event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
event->sample.header.size = sizeof(struct perf_event_header);

/* Set time field based on etm auxtrace config. */
@@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
ip = cs_etm__last_executed_instr(tidq->prev_packet);

event->sample.header.type = PERF_RECORD_SAMPLE;
- event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
+ event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
event->sample.header.size = sizeof(struct perf_event_header);

/* Set time field based on etm auxtrace config. */
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 70cac0375b34..88e9b25a8a9f 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);

#ifdef HAVE_CSTRACE_SUPPORT
+#include <opencsd/ocsd_if_types.h>
int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
-int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
- pid_t tid, u8 trace_chan_id);
+int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
+ ocsd_ex_level el);
bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
u8 trace_chan_id);
--
2.34.1

2023-05-24 13:27:53

[permalink] [raw]

Subject: [PATCH 1/4] perf cs-etm: Only track threads instead of PID and TIDs

PIDs and TIDs are already contained within the thread struct, so to
avoid inconsistencies drop the extra members on the etm queue and only
use the thread struct.

At the same time stop using the 'unknown' thread. In a later commit
we will be making samples from multiple machines so it will be better
to use the idle thread of each machine rather than overlapping unknown
threads. Using the idle thread is also better because kernel addresses
with a previously unknown thread will now be assigned to a real kernel
thread.

Signed-off-by: James Clark <[email protected]>
---
tools/perf/util/cs-etm.c | 124 ++++++++++++---------------------------
1 file changed, 38 insertions(+), 86 deletions(-)

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 91299cc56bf7..ebffc9052561 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -46,8 +46,6 @@ struct cs_etm_auxtrace {
struct auxtrace_heap heap;
struct itrace_synth_opts synth_opts;
struct perf_session *session;
- struct machine *machine;
- struct thread *unknown_thread;
struct perf_tsc_conversion tc;

/*
@@ -84,7 +82,6 @@ struct cs_etm_auxtrace {

struct cs_etm_traceid_queue {
u8 trace_chan_id;
- pid_t pid, tid;
u64 period_instructions;
size_t last_branch_pos;
union perf_event *event_buf;
@@ -480,9 +477,9 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
cs_etm__clear_packet_queue(&tidq->packet_queue);

queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
- tidq->tid = queue->tid;
- tidq->pid = -1;
tidq->trace_chan_id = trace_chan_id;
+ tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
+ queue->tid);

tidq->packet = zalloc(sizeof(struct cs_etm_packet));
if (!tidq->packet)
@@ -863,7 +860,6 @@ static void cs_etm__free(struct perf_session *session)
for (i = 0; i < aux->num_cpu; i++)
zfree(&aux->metadata[i]);

- thread__zput(aux->unknown_thread);
zfree(&aux->metadata);
zfree(&aux);
}
@@ -882,7 +878,7 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
{
struct machine *machine;

- machine = etmq->etm->machine;
+ machine = &etmq->etm->session->machines.host;

if (address >= machine__kernel_start(machine)) {
if (machine__is_host(machine))
@@ -905,8 +901,6 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
u8 cpumode;
u64 offset;
int len;
- struct thread *thread;
- struct machine *machine;
struct addr_location al;
struct dso *dso;
struct cs_etm_traceid_queue *tidq;
@@ -914,20 +908,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
if (!etmq)
return 0;

- machine = etmq->etm->machine;
cpumode = cs_etm__cpu_mode(etmq, address);
tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
if (!tidq)
return 0;

- thread = tidq->thread;
- if (!thread) {
- if (cpumode != PERF_RECORD_MISC_KERNEL)
- return 0;
- thread = etmq->etm->unknown_thread;
- }
-
- if (!thread__find_map(thread, cpumode, address, &al))
+ if (!thread__find_map(tidq->thread, cpumode, address, &al))
return 0;

dso = map__dso(al.map);
@@ -942,7 +928,8 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,

map__load(al.map);

- len = dso__data_read_offset(dso, machine, offset, buffer, size);
+ len = dso__data_read_offset(dso, maps__machine(tidq->thread->maps),
+ offset, buffer, size);

if (len <= 0) {
ui__warning_once("CS ETM Trace: Missing DSO. Use 'perf archive' or debuginfod to export data from the traced system.\n"
@@ -1303,39 +1290,31 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
return etmq->buf_len;
}

-static void cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm,
- struct cs_etm_traceid_queue *tidq)
+static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
+ struct cs_etm_traceid_queue *tidq, pid_t tid)
{
- if ((!tidq->thread) && (tidq->tid != -1))
- tidq->thread = machine__find_thread(etm->machine, -1,
- tidq->tid);
+ struct machine *machine = &etm->session->machines.host;
+
+ if (tid != -1) {
+ thread__zput(tidq->thread);
+ tidq->thread = machine__find_thread(machine, -1, tid);
+ }

- if (tidq->thread)
- tidq->pid = tidq->thread->pid_;
+ /* Couldn't find a known thread */
+ if (!tidq->thread)
+ tidq->thread = machine__idle_thread(machine);
}

int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
pid_t tid, u8 trace_chan_id)
{
- int cpu, err = -EINVAL;
- struct cs_etm_auxtrace *etm = etmq->etm;
struct cs_etm_traceid_queue *tidq;

tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
if (!tidq)
- return err;
-
- if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0)
- return err;
-
- err = machine__set_current_tid(etm->machine, cpu, tid, tid);
- if (err)
- return err;
-
- tidq->tid = tid;
- thread__zput(tidq->thread);
+ return -EINVAL;

- cs_etm__set_pid_tid_cpu(etm, tidq);
+ cs_etm__set_thread(etmq->etm, tidq, tid);
return 0;
}

@@ -1412,8 +1391,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
sample.time = cs_etm__resolve_sample_time(etmq, tidq);

sample.ip = addr;
- sample.pid = tidq->pid;
- sample.tid = tidq->tid;
+ sample.pid = tidq->thread->pid_;
+ sample.tid = tidq->thread->tid;
sample.id = etmq->etm->instructions_id;
sample.stream_id = etmq->etm->instructions_id;
sample.period = period;
@@ -1471,8 +1450,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
sample.time = cs_etm__resolve_sample_time(etmq, tidq);

sample.ip = ip;
- sample.pid = tidq->pid;
- sample.tid = tidq->tid;
+ sample.pid = tidq->thread->pid_;
+ sample.tid = tidq->thread->tid;
sample.addr = cs_etm__first_executed_instr(tidq->packet);
sample.id = etmq->etm->branches_id;
sample.stream_id = etmq->etm->branches_id;
@@ -2466,11 +2445,6 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
if (!etmq)
continue;

- /*
- * Per-cpu mode has contextIDs in the trace and the decoder
- * calls cs_etm__set_pid_tid_cpu() automatically so no need
- * to do this here
- */
if (etm->per_thread_decoding) {
tidq = cs_etm__etmq_get_traceid_queue(
etmq, CS_ETM_PER_THREAD_TRACEID);
@@ -2478,10 +2452,8 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
if (!tidq)
continue;

- if ((tid == -1) || (tidq->tid == tid)) {
- cs_etm__set_pid_tid_cpu(etm, tidq);
+ if (tid == -1 || tidq->thread->tid == tid)
cs_etm__run_per_thread_timeless_decoder(etmq);
- }
} else
cs_etm__run_per_cpu_timeless_decoder(etmq);
}
@@ -2611,10 +2583,12 @@ static int cs_etm__process_itrace_start(struct cs_etm_auxtrace *etm,
return 0;

/*
- * Add the tid/pid to the log so that we can get a match when
- * we get a contextID from the decoder.
+ * Add the tid/pid to the log so that we can get a match when we get a
+ * contextID from the decoder. Only track for the host: only kernel
+ * trace is supported for guests which wouldn't need pids so this should
+ * be fine.
*/
- th = machine__findnew_thread(etm->machine,
+ th = machine__findnew_thread(&etm->session->machines.host,
event->itrace_start.pid,
event->itrace_start.tid);
if (!th)
@@ -2647,10 +2621,12 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm,
return 0;

/*
- * Add the tid/pid to the log so that we can get a match when
- * we get a contextID from the decoder.
+ * Add the tid/pid to the log so that we can get a match when we get a
+ * contextID from the decoder. Only track for the host: only kernel
+ * trace is supported for guests which wouldn't need pids so this should
+ * be fine.
*/
- th = machine__findnew_thread(etm->machine,
+ th = machine__findnew_thread(&etm->session->machines.host,
event->context_switch.next_prev_pid,
event->context_switch.next_prev_tid);
if (!th)
@@ -3259,7 +3235,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
}

etm->session = session;
- etm->machine = &session->machines.host;

etm->num_cpu = num_cpu;
etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff);
@@ -3286,27 +3261,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
if (err)
return err;

- etm->unknown_thread = thread__new(999999999, 999999999);
- if (!etm->unknown_thread) {
- err = -ENOMEM;
- goto err_free_queues;
- }
-
- /*
- * Initialize list node so that at thread__zput() we can avoid
- * segmentation fault at list_del_init().
- */
- INIT_LIST_HEAD(&etm->unknown_thread->node);
-
- err = thread__set_comm(etm->unknown_thread, "unknown", 0);
- if (err)
- goto err_delete_thread;
-
- if (thread__init_maps(etm->unknown_thread, etm->machine)) {
- err = -ENOMEM;
- goto err_delete_thread;
- }
-
etm->tc.time_shift = tc->time_shift;
etm->tc.time_mult = tc->time_mult;
etm->tc.time_zero = tc->time_zero;
@@ -3318,7 +3272,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
}
err = cs_etm__synth_events(etm, session);
if (err)
- goto err_delete_thread;
+ goto err_free_queues;

/*
* Map Trace ID values to CPU metadata.
@@ -3348,7 +3302,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
session->header.data_size,
cs_etm__process_aux_hw_id_cb, &aux_hw_id_found);
if (err)
- goto err_delete_thread;
+ goto err_free_queues;

/* if HW ID found then clear any unused metadata ID values */
if (aux_hw_id_found)
@@ -3358,17 +3312,15 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
err = cs_etm__map_trace_ids_metadata(num_cpu, metadata);

if (err)
- goto err_delete_thread;
+ goto err_free_queues;

err = cs_etm__queue_aux_records(session);
if (err)
- goto err_delete_thread;
+ goto err_free_queues;

etm->data_queued = etm->queues.populated;
return 0;

-err_delete_thread:
- thread__zput(etm->unknown_thread);
err_free_queues:
auxtrace_queues__free(&etm->queues);
session->auxtrace = NULL;
--
2.34.1

2023-05-24 13:29:50

[permalink] [raw]

Subject: [PATCH 4/4] perf cs-etm: Add exception level consistency check

Assert that our own tracking of the exception level matches what
OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the
memory access callback so the extra tracking was required. But a rough
assert can still be done.

Signed-off-by: James Clark <[email protected]>
---
.../perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 +--
.../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
tools/perf/util/cs-etm.c | 37 +++++++++++++------
3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index ac227cd03eb0..50b3c248d1e5 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -52,15 +52,15 @@ struct cs_etm_decoder {
static u32
cs_etm_decoder__mem_access(const void *context,
const ocsd_vaddr_t address,
- const ocsd_mem_space_acc_t mem_space __maybe_unused,
+ const ocsd_mem_space_acc_t mem_space,
const u8 trace_chan_id,
const u32 req_size,
u8 *buffer)
{
struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;

- return decoder->mem_access(decoder->data, trace_chan_id,
- address, req_size, buffer);
+ return decoder->mem_access(decoder->data, trace_chan_id, address,
+ req_size, buffer, mem_space);
}

int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder,
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
index 21d403f55d96..272c2efe78ee 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
@@ -11,6 +11,7 @@
#define INCLUDE__CS_ETM_DECODER_H__

#include <linux/types.h>
+#include <opencsd/ocsd_if_types.h>
#include <stdio.h>

struct cs_etm_decoder;
@@ -19,7 +20,8 @@ struct cs_etm_packet_queue;

struct cs_etm_queue;

-typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *);
+typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *,
+ const ocsd_mem_space_acc_t);

struct cs_etmv3_trace_params {
u32 reg_ctrl;
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index b9ba19327f26..ccf34ed8ddf2 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -931,7 +931,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
}

static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
- u64 address, size_t size, u8 *buffer)
+ u64 address, size_t size, u8 *buffer,
+ const ocsd_mem_space_acc_t mem_space)
{
u8 cpumode;
u64 offset;
@@ -947,6 +948,20 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
if (!tidq)
return 0;

+ /*
+ * We've already tracked EL along side the PID in cs_etm__set_thread()
+ * so double check that it matches what OpenCSD thinks as well. It
+ * doesn't distinguish between EL0 and EL1 for this mem access callback
+ * so we had to do the extra tracking.
+ */
+ if (mem_space & OCSD_MEM_SPACE_EL1N) {
+ /* Includes both non secure EL1 and EL0 */
+ assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
+ } else if (mem_space & OCSD_MEM_SPACE_EL2)
+ assert(tidq->el == ocsd_EL2);
+ else if (mem_space & OCSD_MEM_SPACE_EL3)
+ assert(tidq->el == ocsd_EL3);
+
cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);

if (!thread__find_map(tidq->thread, cpumode, address, &al))
@@ -1195,8 +1210,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
{
u8 instrBytes[2];

- cs_etm__mem_access(etmq, trace_chan_id, addr,
- ARRAY_SIZE(instrBytes), instrBytes);
+ cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes),
+ instrBytes, 0);
/*
* T32 instruction size is indicated by bits[15:11] of the first
* 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111
@@ -1387,8 +1402,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq,
else
sample->insn_len = 4;

- cs_etm__mem_access(etmq, trace_chan_id, sample->ip,
- sample->insn_len, (void *)sample->insn);
+ cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len,
+ (void *)sample->insn, 0);
}

u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
@@ -1940,8 +1955,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
* so below only read 2 bytes as instruction size for T32.
*/
addr = end_addr - 2;
- cs_etm__mem_access(etmq, trace_chan_id, addr,
- sizeof(instr16), (u8 *)&instr16);
+ cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16),
+ (u8 *)&instr16, 0);
if ((instr16 & 0xFF00) == 0xDF00)
return true;

@@ -1956,8 +1971,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
* +---------+---------+-------------------------+
*/
addr = end_addr - 4;
- cs_etm__mem_access(etmq, trace_chan_id, addr,
- sizeof(instr32), (u8 *)&instr32);
+ cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
+ (u8 *)&instr32, 0);
if ((instr32 & 0x0F000000) == 0x0F000000 &&
(instr32 & 0xF0000000) != 0xF0000000)
return true;
@@ -1973,8 +1988,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
* +-----------------------+---------+-----------+
*/
addr = end_addr - 4;
- cs_etm__mem_access(etmq, trace_chan_id, addr,
- sizeof(instr32), (u8 *)&instr32);
+ cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
+ (u8 *)&instr32, 0);
if ((instr32 & 0xFFE0001F) == 0xd4000001)
return true;

--
2.34.1

2023-05-25 10:45:51

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf cs-etm: Only track threads instead of PID and TIDs

On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>
> PIDs and TIDs are already contained within the thread struct, so to
> avoid inconsistencies drop the extra members on the etm queue and only
> use the thread struct.
>
> At the same time stop using the 'unknown' thread. In a later commit
> we will be making samples from multiple machines so it will be better
> to use the idle thread of each machine rather than overlapping unknown
> threads. Using the idle thread is also better because kernel addresses
> with a previously unknown thread will now be assigned to a real kernel
> thread.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> tools/perf/util/cs-etm.c | 124 ++++++++++++---------------------------
> 1 file changed, 38 insertions(+), 86 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 91299cc56bf7..ebffc9052561 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -46,8 +46,6 @@ struct cs_etm_auxtrace {
> struct auxtrace_heap heap;
> struct itrace_synth_opts synth_opts;
> struct perf_session *session;
> - struct machine *machine;
> - struct thread *unknown_thread;
> struct perf_tsc_conversion tc;
>
> /*
> @@ -84,7 +82,6 @@ struct cs_etm_auxtrace {
>
> struct cs_etm_traceid_queue {
> u8 trace_chan_id;
> - pid_t pid, tid;
> u64 period_instructions;
> size_t last_branch_pos;
> union perf_event *event_buf;
> @@ -480,9 +477,9 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> cs_etm__clear_packet_queue(&tidq->packet_queue);
>
> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
> - tidq->tid = queue->tid;
> - tidq->pid = -1;
> tidq->trace_chan_id = trace_chan_id;
> + tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> + queue->tid);
>
> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
> if (!tidq->packet)
> @@ -863,7 +860,6 @@ static void cs_etm__free(struct perf_session *session)
> for (i = 0; i < aux->num_cpu; i++)
> zfree(&aux->metadata[i]);
>
> - thread__zput(aux->unknown_thread);
> zfree(&aux->metadata);
> zfree(&aux);
> }
> @@ -882,7 +878,7 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> {
> struct machine *machine;
>
> - machine = etmq->etm->machine;
> + machine = &etmq->etm->session->machines.host;
>
> if (address >= machine__kernel_start(machine)) {
> if (machine__is_host(machine))
> @@ -905,8 +901,6 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> u8 cpumode;
> u64 offset;
> int len;
> - struct thread *thread;
> - struct machine *machine;
> struct addr_location al;
> struct dso *dso;
> struct cs_etm_traceid_queue *tidq;
> @@ -914,20 +908,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> if (!etmq)
> return 0;
>
> - machine = etmq->etm->machine;
> cpumode = cs_etm__cpu_mode(etmq, address);
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> return 0;
>
> - thread = tidq->thread;
> - if (!thread) {
> - if (cpumode != PERF_RECORD_MISC_KERNEL)
> - return 0;
> - thread = etmq->etm->unknown_thread;
> - }
> -
> - if (!thread__find_map(thread, cpumode, address, &al))
> + if (!thread__find_map(tidq->thread, cpumode, address, &al))
> return 0;
>
> dso = map__dso(al.map);
> @@ -942,7 +928,8 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>
> map__load(al.map);
>
> - len = dso__data_read_offset(dso, machine, offset, buffer, size);
> + len = dso__data_read_offset(dso, maps__machine(tidq->thread->maps),
> + offset, buffer, size);
>
> if (len <= 0) {
> ui__warning_once("CS ETM Trace: Missing DSO. Use 'perf archive' or debuginfod to export data from the traced system.\n"
> @@ -1303,39 +1290,31 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
> return etmq->buf_len;
> }
>
> -static void cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm,
> - struct cs_etm_traceid_queue *tidq)
> +static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> + struct cs_etm_traceid_queue *tidq, pid_t tid)
> {
> - if ((!tidq->thread) && (tidq->tid != -1))
> - tidq->thread = machine__find_thread(etm->machine, -1,
> - tidq->tid);
> + struct machine *machine = &etm->session->machines.host;
> +
> + if (tid != -1) {
> + thread__zput(tidq->thread);
> + tidq->thread = machine__find_thread(machine, -1, tid);
> + }
>
> - if (tidq->thread)
> - tidq->pid = tidq->thread->pid_;
> + /* Couldn't find a known thread */
> + if (!tidq->thread)
> + tidq->thread = machine__idle_thread(machine);
> }
>
> int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> pid_t tid, u8 trace_chan_id)
> {
> - int cpu, err = -EINVAL;
> - struct cs_etm_auxtrace *etm = etmq->etm;
> struct cs_etm_traceid_queue *tidq;
>
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> - return err;
> -
> - if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0)
> - return err;
> -
> - err = machine__set_current_tid(etm->machine, cpu, tid, tid);
> - if (err)
> - return err;
> -
> - tidq->tid = tid;
> - thread__zput(tidq->thread);
> + return -EINVAL;
>
> - cs_etm__set_pid_tid_cpu(etm, tidq);
> + cs_etm__set_thread(etmq->etm, tidq, tid);
> return 0;
> }
>
> @@ -1412,8 +1391,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = addr;
> - sample.pid = tidq->pid;
> - sample.tid = tidq->tid;
> + sample.pid = tidq->thread->pid_;
> + sample.tid = tidq->thread->tid;
> sample.id = etmq->etm->instructions_id;
> sample.stream_id = etmq->etm->instructions_id;
> sample.period = period;
> @@ -1471,8 +1450,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = ip;
> - sample.pid = tidq->pid;
> - sample.tid = tidq->tid;
> + sample.pid = tidq->thread->pid_;
> + sample.tid = tidq->thread->tid;
> sample.addr = cs_etm__first_executed_instr(tidq->packet);
> sample.id = etmq->etm->branches_id;
> sample.stream_id = etmq->etm->branches_id;
> @@ -2466,11 +2445,6 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
> if (!etmq)
> continue;
>
> - /*
> - * Per-cpu mode has contextIDs in the trace and the decoder
> - * calls cs_etm__set_pid_tid_cpu() automatically so no need
> - * to do this here
> - */
> if (etm->per_thread_decoding) {
> tidq = cs_etm__etmq_get_traceid_queue(
> etmq, CS_ETM_PER_THREAD_TRACEID);
> @@ -2478,10 +2452,8 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
> if (!tidq)
> continue;
>
> - if ((tid == -1) || (tidq->tid == tid)) {
> - cs_etm__set_pid_tid_cpu(etm, tidq);
> + if (tid == -1 || tidq->thread->tid == tid)
> cs_etm__run_per_thread_timeless_decoder(etmq);
> - }
> } else
> cs_etm__run_per_cpu_timeless_decoder(etmq);
> }
> @@ -2611,10 +2583,12 @@ static int cs_etm__process_itrace_start(struct cs_etm_auxtrace *etm,
> return 0;
>
> /*
> - * Add the tid/pid to the log so that we can get a match when
> - * we get a contextID from the decoder.
> + * Add the tid/pid to the log so that we can get a match when we get a
> + * contextID from the decoder. Only track for the host: only kernel
> + * trace is supported for guests which wouldn't need pids so this should
> + * be fine.
> */
> - th = machine__findnew_thread(etm->machine,
> + th = machine__findnew_thread(&etm->session->machines.host,
> event->itrace_start.pid,
> event->itrace_start.tid);
> if (!th)
> @@ -2647,10 +2621,12 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm,
> return 0;
>
> /*
> - * Add the tid/pid to the log so that we can get a match when
> - * we get a contextID from the decoder.
> + * Add the tid/pid to the log so that we can get a match when we get a
> + * contextID from the decoder. Only track for the host: only kernel
> + * trace is supported for guests which wouldn't need pids so this should
> + * be fine.
> */
> - th = machine__findnew_thread(etm->machine,
> + th = machine__findnew_thread(&etm->session->machines.host,
> event->context_switch.next_prev_pid,
> event->context_switch.next_prev_tid);
> if (!th)
> @@ -3259,7 +3235,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> }
>
> etm->session = session;
> - etm->machine = &session->machines.host;
>
> etm->num_cpu = num_cpu;
> etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff);
> @@ -3286,27 +3261,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> if (err)
> return err;
>
> - etm->unknown_thread = thread__new(999999999, 999999999);
> - if (!etm->unknown_thread) {
> - err = -ENOMEM;
> - goto err_free_queues;
> - }
> -
> - /*
> - * Initialize list node so that at thread__zput() we can avoid
> - * segmentation fault at list_del_init().
> - */
> - INIT_LIST_HEAD(&etm->unknown_thread->node);
> -
> - err = thread__set_comm(etm->unknown_thread, "unknown", 0);
> - if (err)
> - goto err_delete_thread;
> -
> - if (thread__init_maps(etm->unknown_thread, etm->machine)) {
> - err = -ENOMEM;
> - goto err_delete_thread;
> - }
> -
> etm->tc.time_shift = tc->time_shift;
> etm->tc.time_mult = tc->time_mult;
> etm->tc.time_zero = tc->time_zero;
> @@ -3318,7 +3272,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> }
> err = cs_etm__synth_events(etm, session);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> /*
> * Map Trace ID values to CPU metadata.
> @@ -3348,7 +3302,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> session->header.data_size,
> cs_etm__process_aux_hw_id_cb, &aux_hw_id_found);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> /* if HW ID found then clear any unused metadata ID values */
> if (aux_hw_id_found)
> @@ -3358,17 +3312,15 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> err = cs_etm__map_trace_ids_metadata(num_cpu, metadata);
>
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> err = cs_etm__queue_aux_records(session);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> etm->data_queued = etm->queues.populated;
> return 0;
>
> -err_delete_thread:
> - thread__zput(etm->unknown_thread);
> err_free_queues:
> auxtrace_queues__free(&etm->queues);
> session->auxtrace = NULL;
> --
> 2.34.1
>
Reviewed-by: Mike Leach <[email protected]>

--
Mike Leach
Principal Engineer, ARM Ltd.
Manchester Design Centre. UK

2023-05-25 11:08:36

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>
> Branch samples currently use the IP of the previous packet as the from
> IP, and the IP of the current packet as the to IP. But it incorrectly
> uses the current thread. In some cases like a jump into a different
> exception level this will attribute to the incorrect process.
>
> Fix it by tracking the previous thread in the same way the previous
> packet is tracked.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> tools/perf/util/cs-etm.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index ebffc9052561..a997fe79d458 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -86,6 +86,7 @@ struct cs_etm_traceid_queue {
> size_t last_branch_pos;
> union perf_event *event_buf;
> struct thread *thread;
> + struct thread *prev_thread;
> struct branch_stack *last_branch;
> struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> tidq->trace_chan_id = trace_chan_id;
> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> queue->tid);
> + tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>
> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
> if (!tidq->packet)
> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> tmp = tidq->packet;
> tidq->packet = tidq->prev_packet;
> tidq->prev_packet = tmp;
> + thread__put(tidq->prev_thread);
> + tidq->prev_thread = thread__get(tidq->thread);
> }
> }
>
> @@ -791,6 +795,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
> /* Free this traceid_queue from the array */
> tidq = etmq->traceid_queues[idx];
> thread__zput(tidq->thread);
> + thread__zput(tidq->prev_thread);
> zfree(&tidq->event_buf);
> zfree(&tidq->last_branch);
> zfree(&tidq->last_branch_rb);
> @@ -1450,8 +1455,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = ip;
> - sample.pid = tidq->thread->pid_;
> - sample.tid = tidq->thread->tid;
> + sample.pid = tidq->prev_thread->pid_;
> + sample.tid = tidq->prev_thread->tid;
> sample.addr = cs_etm__first_executed_instr(tidq->packet);
> sample.id = etmq->etm->branches_id;
> sample.stream_id = etmq->etm->branches_id;
> --
> 2.34.1
>

Reviewed-by: Mike Leach <[email protected]>

--
Mike Leach
Principal Engineer, ARM Ltd.
Manchester Design Centre. UK

2023-05-25 11:31:45

[permalink] [raw]

Subject: Re: [PATCH 3/4] perf cs-etm: Track exception level

Hi James,

On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>
> Currently we assume all trace belongs to the host machine so when
> the decoder should be looking at the guest kernel maps it can crash
> because it looks at the host ones instead.
>
> Avoid one scenario (guest kernel running at EL1) by assigning the
> default guest machine to this trace. For userspace trace it's still not
> possible to determine guest vs host, but the PIDs should help in this
> case.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 +-
> tools/perf/util/cs-etm.c | 64 ++++++++++++++-----
> tools/perf/util/cs-etm.h | 5 +-
> 3 files changed, 56 insertions(+), 20 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> index 82a27ab90c8b..ac227cd03eb0 100644
> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> @@ -573,12 +573,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
> break;
> }
>
> + if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id,
> + elem->context.exception_level))
> + return OCSD_RESP_FATAL_SYS_ERR;
> +
> if (tid == -1)
> return OCSD_RESP_CONT;
>
> - if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
> - return OCSD_RESP_FATAL_SYS_ERR;
> -
> /*
> * A timestamp is generated after a PE_CONTEXT element so make sure
> * to rely on that coming one.
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index a997fe79d458..b9ba19327f26 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -14,7 +14,6 @@
> #include <linux/types.h>
> #include <linux/zalloc.h>
>
> -#include <opencsd/ocsd_if_types.h>
> #include <stdlib.h>
>
> #include "auxtrace.h"
> @@ -87,6 +86,8 @@ struct cs_etm_traceid_queue {
> union perf_event *event_buf;
> struct thread *thread;
> struct thread *prev_thread;
> + ocsd_ex_level prev_el;
> + ocsd_ex_level el;
> struct branch_stack *last_branch;
> struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> @@ -479,6 +480,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>
> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
> tidq->trace_chan_id = trace_chan_id;
> + tidq->el = tidq->prev_el = ocsd_EL_unknown;
> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> queue->tid);
> tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
> @@ -618,6 +620,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> tmp = tidq->packet;
> tidq->packet = tidq->prev_packet;
> tidq->prev_packet = tmp;
> + tidq->prev_el = tidq->el;
> thread__put(tidq->prev_thread);
> tidq->prev_thread = thread__get(tidq->thread);
> }
> @@ -879,11 +882,34 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session,
> return evsel->core.attr.type == aux->pmu_type;
> }
>
> -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> +static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
> + ocsd_ex_level el)
> {
> - struct machine *machine;
> + /*
> + * Not perfect, but assume anything in EL1 is the default guest, and
> + * everything else is the host. nHVE and pKVM may not work with this
> + * assumption. And distinguishing between guest and host userspaces
> + * isn't currently supported either. Neither is multiple guest support.
> + * All this does is reduce the likeliness of decode errors where we look
> + * into the host kernel maps when it should have been the guest maps.
> + */

What effect does this have if I am running with host only, kernel at
EL1, e.g. any platform that is not running an EL2 kernel?

Mike

> + switch (el) {
> + case ocsd_EL1:
> + return machines__find_guest(&etm->session->machines,
> + DEFAULT_GUEST_KERNEL_ID);
> + case ocsd_EL3:
> + case ocsd_EL2:
> + case ocsd_EL0:
> + case ocsd_EL_unknown:
> + default:
> + return &etm->session->machines.host;
> + }
> +}
>
> - machine = &etmq->etm->session->machines.host;
> +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
> + ocsd_ex_level el)
> +{
> + struct machine *machine = cs_etm__get_machine(etmq->etm, el);
>
> if (address >= machine__kernel_start(machine)) {
> if (machine__is_host(machine))
> @@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> } else {
> if (machine__is_host(machine))
> return PERF_RECORD_MISC_USER;
> - else if (perf_guest)
> + else {
> + /*
> + * Can't really happen at the moment because
> + * cs_etm__get_machine() will always return
> + * machines.host for any non EL1 trace.
> + */
> return PERF_RECORD_MISC_GUEST_USER;
> - else
> - return PERF_RECORD_MISC_HYPERVISOR;
> + }
> }
> }
>
> @@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> if (!etmq)
> return 0;
>
> - cpumode = cs_etm__cpu_mode(etmq, address);
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> return 0;
>
> + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
> +
> if (!thread__find_map(tidq->thread, cpumode, address, &al))
> return 0;
>
> @@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
> }
>
> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> - struct cs_etm_traceid_queue *tidq, pid_t tid)
> + struct cs_etm_traceid_queue *tidq, pid_t tid,
> + ocsd_ex_level el)
> {
> - struct machine *machine = &etm->session->machines.host;
> + struct machine *machine = cs_etm__get_machine(etm, el);
>
> if (tid != -1) {
> thread__zput(tidq->thread);
> @@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> /* Couldn't find a known thread */
> if (!tidq->thread)
> tidq->thread = machine__idle_thread(machine);
> +
> + tidq->el = el;
> }
>
> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> - pid_t tid, u8 trace_chan_id)
> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> + ocsd_ex_level el)
> {
> struct cs_etm_traceid_queue *tidq;
>
> @@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> if (!tidq)
> return -EINVAL;
>
> - cs_etm__set_thread(etmq->etm, tidq, tid);
> + cs_etm__set_thread(etmq->etm, tidq, tid, el);
> return 0;
> }
>
> @@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> struct perf_sample sample = {.ip = 0,};
>
> event->sample.header.type = PERF_RECORD_SAMPLE;
> - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
> + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
> event->sample.header.size = sizeof(struct perf_event_header);
>
> /* Set time field based on etm auxtrace config. */
> @@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> ip = cs_etm__last_executed_instr(tidq->prev_packet);
>
> event->sample.header.type = PERF_RECORD_SAMPLE;
> - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
> + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
> event->sample.header.size = sizeof(struct perf_event_header);
>
> /* Set time field based on etm auxtrace config. */
> diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
> index 70cac0375b34..88e9b25a8a9f 100644
> --- a/tools/perf/util/cs-etm.h
> +++ b/tools/perf/util/cs-etm.h
> @@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
> struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);
>
> #ifdef HAVE_CSTRACE_SUPPORT
> +#include <opencsd/ocsd_if_types.h>
> int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
> int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> - pid_t tid, u8 trace_chan_id);
> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> + ocsd_ex_level el);
> bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
> void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
> u8 trace_chan_id);
> --
> 2.34.1
>

--
Mike Leach
Principal Engineer, ARM Ltd.
Manchester Design Centre. UK

2023-05-25 11:43:39

[permalink] [raw]

Subject: Re: [PATCH 4/4] perf cs-etm: Add exception level consistency check

Hi James,

My concern here is that for etmv3 trace, OpenCSD will only provide
memory spaces as either secure or non-secure, The ETMv3 does not
trace, and hence OpenCSD cannot provide the different ELs.
The memory callback will be either OCSD_MEM_SPACE_S or OCSD_MEM_SPACE_N.

Can this patch - and the set handle this. (assuming perf supports our
ETMv3 coresight kernel driver)

Regards

Mike

On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>
> Assert that our own tracking of the exception level matches what
> OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the
> memory access callback so the extra tracking was required. But a rough
> assert can still be done.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 +--
> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
> tools/perf/util/cs-etm.c | 37 +++++++++++++------
> 3 files changed, 32 insertions(+), 15 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> index ac227cd03eb0..50b3c248d1e5 100644
> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> @@ -52,15 +52,15 @@ struct cs_etm_decoder {
> static u32
> cs_etm_decoder__mem_access(const void *context,
> const ocsd_vaddr_t address,
> - const ocsd_mem_space_acc_t mem_space __maybe_unused,
> + const ocsd_mem_space_acc_t mem_space,
> const u8 trace_chan_id,
> const u32 req_size,
> u8 *buffer)
> {
> struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
>
> - return decoder->mem_access(decoder->data, trace_chan_id,
> - address, req_size, buffer);
> + return decoder->mem_access(decoder->data, trace_chan_id, address,
> + req_size, buffer, mem_space);
> }
>
> int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder,
> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> index 21d403f55d96..272c2efe78ee 100644
> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> @@ -11,6 +11,7 @@
> #define INCLUDE__CS_ETM_DECODER_H__
>
> #include <linux/types.h>
> +#include <opencsd/ocsd_if_types.h>
> #include <stdio.h>
>
> struct cs_etm_decoder;
> @@ -19,7 +20,8 @@ struct cs_etm_packet_queue;
>
> struct cs_etm_queue;
>
> -typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *);
> +typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *,
> + const ocsd_mem_space_acc_t);
>
> struct cs_etmv3_trace_params {
> u32 reg_ctrl;
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index b9ba19327f26..ccf34ed8ddf2 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -931,7 +931,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
> }
>
> static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> - u64 address, size_t size, u8 *buffer)
> + u64 address, size_t size, u8 *buffer,
> + const ocsd_mem_space_acc_t mem_space)
> {
> u8 cpumode;
> u64 offset;
> @@ -947,6 +948,20 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> if (!tidq)
> return 0;
>
> + /*
> + * We've already tracked EL along side the PID in cs_etm__set_thread()
> + * so double check that it matches what OpenCSD thinks as well. It
> + * doesn't distinguish between EL0 and EL1 for this mem access callback
> + * so we had to do the extra tracking.
> + */
> + if (mem_space & OCSD_MEM_SPACE_EL1N) {
> + /* Includes both non secure EL1 and EL0 */
> + assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
> + } else if (mem_space & OCSD_MEM_SPACE_EL2)
> + assert(tidq->el == ocsd_EL2);
> + else if (mem_space & OCSD_MEM_SPACE_EL3)
> + assert(tidq->el == ocsd_EL3);
> +
> cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
>
> if (!thread__find_map(tidq->thread, cpumode, address, &al))
> @@ -1195,8 +1210,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
> {
> u8 instrBytes[2];
>
> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> - ARRAY_SIZE(instrBytes), instrBytes);
> + cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes),
> + instrBytes, 0);
> /*
> * T32 instruction size is indicated by bits[15:11] of the first
> * 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111
> @@ -1387,8 +1402,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq,
> else
> sample->insn_len = 4;
>
> - cs_etm__mem_access(etmq, trace_chan_id, sample->ip,
> - sample->insn_len, (void *)sample->insn);
> + cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len,
> + (void *)sample->insn, 0);
> }
>
> u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
> @@ -1940,8 +1955,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> * so below only read 2 bytes as instruction size for T32.
> */
> addr = end_addr - 2;
> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> - sizeof(instr16), (u8 *)&instr16);
> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16),
> + (u8 *)&instr16, 0);
> if ((instr16 & 0xFF00) == 0xDF00)
> return true;
>
> @@ -1956,8 +1971,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> * +---------+---------+-------------------------+
> */
> addr = end_addr - 4;
> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> - sizeof(instr32), (u8 *)&instr32);
> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
> + (u8 *)&instr32, 0);
> if ((instr32 & 0x0F000000) == 0x0F000000 &&
> (instr32 & 0xF0000000) != 0xF0000000)
> return true;
> @@ -1973,8 +1988,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> * +-----------------------+---------+-----------+
> */
> addr = end_addr - 4;
> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> - sizeof(instr32), (u8 *)&instr32);
> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
> + (u8 *)&instr32, 0);
> if ((instr32 & 0xFFE0001F) == 0xd4000001)
> return true;
>
> --
> 2.34.1
>

--
Mike Leach
Principal Engineer, ARM Ltd.
Manchester Design Centre. UK

2023-05-27 08:53:54

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf cs-etm: Only track threads instead of PID and TIDs

On Wed, May 24, 2023 at 02:19:55PM +0100, James Clark wrote:
> PIDs and TIDs are already contained within the thread struct, so to
> avoid inconsistencies drop the extra members on the etm queue and only
> use the thread struct.
>
> At the same time stop using the 'unknown' thread. In a later commit
> we will be making samples from multiple machines so it will be better
> to use the idle thread of each machine rather than overlapping unknown
> threads. Using the idle thread is also better because kernel addresses
> with a previously unknown thread will now be assigned to a real kernel
> thread.

Using the idle thread to replace the 'unknown' thread is good thing
for me, though we will introduce imprecise statistics for idle thread,
but the inaccuration is not big in this case.

> Signed-off-by: James Clark <[email protected]>
> ---
> tools/perf/util/cs-etm.c | 124 ++++++++++++---------------------------
> 1 file changed, 38 insertions(+), 86 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 91299cc56bf7..ebffc9052561 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -46,8 +46,6 @@ struct cs_etm_auxtrace {
> struct auxtrace_heap heap;
> struct itrace_synth_opts synth_opts;
> struct perf_session *session;
> - struct machine *machine;
> - struct thread *unknown_thread;
> struct perf_tsc_conversion tc;
>
> /*
> @@ -84,7 +82,6 @@ struct cs_etm_auxtrace {
>
> struct cs_etm_traceid_queue {
> u8 trace_chan_id;
> - pid_t pid, tid;
> u64 period_instructions;
> size_t last_branch_pos;
> union perf_event *event_buf;
> @@ -480,9 +477,9 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> cs_etm__clear_packet_queue(&tidq->packet_queue);
>
> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
> - tidq->tid = queue->tid;
> - tidq->pid = -1;
> tidq->trace_chan_id = trace_chan_id;
> + tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> + queue->tid);
>
> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
> if (!tidq->packet)
> @@ -863,7 +860,6 @@ static void cs_etm__free(struct perf_session *session)
> for (i = 0; i < aux->num_cpu; i++)
> zfree(&aux->metadata[i]);
>
> - thread__zput(aux->unknown_thread);
> zfree(&aux->metadata);
> zfree(&aux);
> }
> @@ -882,7 +878,7 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> {
> struct machine *machine;
>
> - machine = etmq->etm->machine;
> + machine = &etmq->etm->session->machines.host;
>
> if (address >= machine__kernel_start(machine)) {
> if (machine__is_host(machine))
> @@ -905,8 +901,6 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> u8 cpumode;
> u64 offset;
> int len;
> - struct thread *thread;
> - struct machine *machine;
> struct addr_location al;
> struct dso *dso;
> struct cs_etm_traceid_queue *tidq;
> @@ -914,20 +908,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> if (!etmq)
> return 0;
>
> - machine = etmq->etm->machine;
> cpumode = cs_etm__cpu_mode(etmq, address);
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> return 0;
>
> - thread = tidq->thread;
> - if (!thread) {
> - if (cpumode != PERF_RECORD_MISC_KERNEL)
> - return 0;
> - thread = etmq->etm->unknown_thread;
> - }
> -
> - if (!thread__find_map(thread, cpumode, address, &al))
> + if (!thread__find_map(tidq->thread, cpumode, address, &al))
> return 0;
>
> dso = map__dso(al.map);
> @@ -942,7 +928,8 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>
> map__load(al.map);
>
> - len = dso__data_read_offset(dso, machine, offset, buffer, size);
> + len = dso__data_read_offset(dso, maps__machine(tidq->thread->maps),
> + offset, buffer, size);
>
> if (len <= 0) {
> ui__warning_once("CS ETM Trace: Missing DSO. Use 'perf archive' or debuginfod to export data from the traced system.\n"
> @@ -1303,39 +1290,31 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
> return etmq->buf_len;
> }
>
> -static void cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm,
> - struct cs_etm_traceid_queue *tidq)
> +static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> + struct cs_etm_traceid_queue *tidq, pid_t tid)
> {
> - if ((!tidq->thread) && (tidq->tid != -1))
> - tidq->thread = machine__find_thread(etm->machine, -1,
> - tidq->tid);
> + struct machine *machine = &etm->session->machines.host;
> +
> + if (tid != -1) {
> + thread__zput(tidq->thread);
> + tidq->thread = machine__find_thread(machine, -1, tid);
> + }
>
> - if (tidq->thread)
> - tidq->pid = tidq->thread->pid_;
> + /* Couldn't find a known thread */
> + if (!tidq->thread)
> + tidq->thread = machine__idle_thread(machine);
> }
>
> int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> pid_t tid, u8 trace_chan_id)
> {
> - int cpu, err = -EINVAL;
> - struct cs_etm_auxtrace *etm = etmq->etm;
> struct cs_etm_traceid_queue *tidq;
>
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> - return err;
> -
> - if (cs_etm__get_cpu(trace_chan_id, &cpu) < 0)
> - return err;
> -
> - err = machine__set_current_tid(etm->machine, cpu, tid, tid);

If we don't invoke machine__set_current_tid(), 'thread->cpu' will not
updated anymore.

Seems this is fine since cs-etm uses 'tidq->packet->cpu' to assign
sample's cpu instead of using 'thread->cpu'.

So the change LGTM:

Reviewed-by: Leo Yan <[email protected]>

> - if (err)
> - return err;
> -
> - tidq->tid = tid;
> - thread__zput(tidq->thread);
> + return -EINVAL;
>
> - cs_etm__set_pid_tid_cpu(etm, tidq);
> + cs_etm__set_thread(etmq->etm, tidq, tid);
> return 0;
> }
>
> @@ -1412,8 +1391,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = addr;
> - sample.pid = tidq->pid;
> - sample.tid = tidq->tid;
> + sample.pid = tidq->thread->pid_;
> + sample.tid = tidq->thread->tid;
> sample.id = etmq->etm->instructions_id;
> sample.stream_id = etmq->etm->instructions_id;
> sample.period = period;
> @@ -1471,8 +1450,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = ip;
> - sample.pid = tidq->pid;
> - sample.tid = tidq->tid;
> + sample.pid = tidq->thread->pid_;
> + sample.tid = tidq->thread->tid;
> sample.addr = cs_etm__first_executed_instr(tidq->packet);
> sample.id = etmq->etm->branches_id;
> sample.stream_id = etmq->etm->branches_id;
> @@ -2466,11 +2445,6 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
> if (!etmq)
> continue;
>
> - /*
> - * Per-cpu mode has contextIDs in the trace and the decoder
> - * calls cs_etm__set_pid_tid_cpu() automatically so no need
> - * to do this here
> - */
> if (etm->per_thread_decoding) {
> tidq = cs_etm__etmq_get_traceid_queue(
> etmq, CS_ETM_PER_THREAD_TRACEID);
> @@ -2478,10 +2452,8 @@ static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
> if (!tidq)
> continue;
>
> - if ((tid == -1) || (tidq->tid == tid)) {
> - cs_etm__set_pid_tid_cpu(etm, tidq);
> + if (tid == -1 || tidq->thread->tid == tid)
> cs_etm__run_per_thread_timeless_decoder(etmq);
> - }
> } else
> cs_etm__run_per_cpu_timeless_decoder(etmq);
> }
> @@ -2611,10 +2583,12 @@ static int cs_etm__process_itrace_start(struct cs_etm_auxtrace *etm,
> return 0;
>
> /*
> - * Add the tid/pid to the log so that we can get a match when
> - * we get a contextID from the decoder.
> + * Add the tid/pid to the log so that we can get a match when we get a
> + * contextID from the decoder. Only track for the host: only kernel
> + * trace is supported for guests which wouldn't need pids so this should
> + * be fine.
> */
> - th = machine__findnew_thread(etm->machine,
> + th = machine__findnew_thread(&etm->session->machines.host,
> event->itrace_start.pid,
> event->itrace_start.tid);
> if (!th)
> @@ -2647,10 +2621,12 @@ static int cs_etm__process_switch_cpu_wide(struct cs_etm_auxtrace *etm,
> return 0;
>
> /*
> - * Add the tid/pid to the log so that we can get a match when
> - * we get a contextID from the decoder.
> + * Add the tid/pid to the log so that we can get a match when we get a
> + * contextID from the decoder. Only track for the host: only kernel
> + * trace is supported for guests which wouldn't need pids so this should
> + * be fine.
> */
> - th = machine__findnew_thread(etm->machine,
> + th = machine__findnew_thread(&etm->session->machines.host,
> event->context_switch.next_prev_pid,
> event->context_switch.next_prev_tid);
> if (!th)
> @@ -3259,7 +3235,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> }
>
> etm->session = session;
> - etm->machine = &session->machines.host;
>
> etm->num_cpu = num_cpu;
> etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff);
> @@ -3286,27 +3261,6 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> if (err)
> return err;
>
> - etm->unknown_thread = thread__new(999999999, 999999999);
> - if (!etm->unknown_thread) {
> - err = -ENOMEM;
> - goto err_free_queues;
> - }
> -
> - /*
> - * Initialize list node so that at thread__zput() we can avoid
> - * segmentation fault at list_del_init().
> - */
> - INIT_LIST_HEAD(&etm->unknown_thread->node);
> -
> - err = thread__set_comm(etm->unknown_thread, "unknown", 0);
> - if (err)
> - goto err_delete_thread;
> -
> - if (thread__init_maps(etm->unknown_thread, etm->machine)) {
> - err = -ENOMEM;
> - goto err_delete_thread;
> - }
> -
> etm->tc.time_shift = tc->time_shift;
> etm->tc.time_mult = tc->time_mult;
> etm->tc.time_zero = tc->time_zero;
> @@ -3318,7 +3272,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> }
> err = cs_etm__synth_events(etm, session);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> /*
> * Map Trace ID values to CPU metadata.
> @@ -3348,7 +3302,7 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> session->header.data_size,
> cs_etm__process_aux_hw_id_cb, &aux_hw_id_found);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> /* if HW ID found then clear any unused metadata ID values */
> if (aux_hw_id_found)
> @@ -3358,17 +3312,15 @@ int cs_etm__process_auxtrace_info_full(union perf_event *event,
> err = cs_etm__map_trace_ids_metadata(num_cpu, metadata);
>
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> err = cs_etm__queue_aux_records(session);
> if (err)
> - goto err_delete_thread;
> + goto err_free_queues;
>
> etm->data_queued = etm->queues.populated;
> return 0;
>
> -err_delete_thread:
> - thread__zput(etm->unknown_thread);
> err_free_queues:
> auxtrace_queues__free(&etm->queues);
> session->auxtrace = NULL;
> --
> 2.34.1
>

2023-05-27 09:22:27

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On Wed, May 24, 2023 at 02:19:56PM +0100, James Clark wrote:
> Branch samples currently use the IP of the previous packet as the from
> IP, and the IP of the current packet as the to IP. But it incorrectly
> uses the current thread. In some cases like a jump into a different
> exception level this will attribute to the incorrect process.

It's about the timing that branch has taken or not taken :)

If we think the branch sample as 'branch has taken', then current code
is doning right thing, otherwise, we need this fix.

> Fix it by tracking the previous thread in the same way the previous
> packet is tracked.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> tools/perf/util/cs-etm.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index ebffc9052561..a997fe79d458 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -86,6 +86,7 @@ struct cs_etm_traceid_queue {
> size_t last_branch_pos;
> union perf_event *event_buf;
> struct thread *thread;
> + struct thread *prev_thread;
> struct branch_stack *last_branch;
> struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> tidq->trace_chan_id = trace_chan_id;
> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> queue->tid);
> + tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>
> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
> if (!tidq->packet)
> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> tmp = tidq->packet;
> tidq->packet = tidq->prev_packet;
> tidq->prev_packet = tmp;
> + thread__put(tidq->prev_thread);
> + tidq->prev_thread = thread__get(tidq->thread);

Maybe cs_etm__packet_swap() is not the best place to update
"tidq->prev_thread", since swapping packet doesn't mean it's necessarily
thread switching; can we move this change into the cs_etm__set_thread()?

Thanks,
Leo

> }
> }
>
> @@ -791,6 +795,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
> /* Free this traceid_queue from the array */
> tidq = etmq->traceid_queues[idx];
> thread__zput(tidq->thread);
> + thread__zput(tidq->prev_thread);
> zfree(&tidq->event_buf);
> zfree(&tidq->last_branch);
> zfree(&tidq->last_branch_rb);
> @@ -1450,8 +1455,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>
> sample.ip = ip;
> - sample.pid = tidq->thread->pid_;
> - sample.tid = tidq->thread->tid;
> + sample.pid = tidq->prev_thread->pid_;
> + sample.tid = tidq->prev_thread->tid;
> sample.addr = cs_etm__first_executed_instr(tidq->packet);
> sample.id = etmq->etm->branches_id;
> sample.stream_id = etmq->etm->branches_id;
> --
> 2.34.1
>

2023-05-28 11:22:54

[permalink] [raw]

Subject: Re: [PATCH 3/4] perf cs-etm: Track exception level

On Wed, May 24, 2023 at 02:19:57PM +0100, James Clark wrote:
> Currently we assume all trace belongs to the host machine so when
> the decoder should be looking at the guest kernel maps it can crash
> because it looks at the host ones instead.
>
> Avoid one scenario (guest kernel running at EL1) by assigning the
> default guest machine to this trace. For userspace trace it's still not
> possible to determine guest vs host, but the PIDs should help in this
> case.
>
> Signed-off-by: James Clark <[email protected]>
> ---
> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 +-
> tools/perf/util/cs-etm.c | 64 ++++++++++++++-----
> tools/perf/util/cs-etm.h | 5 +-
> 3 files changed, 56 insertions(+), 20 deletions(-)
>
> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> index 82a27ab90c8b..ac227cd03eb0 100644
> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> @@ -573,12 +573,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
> break;
> }
>
> + if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id,
> + elem->context.exception_level))
> + return OCSD_RESP_FATAL_SYS_ERR;
> +
> if (tid == -1)
> return OCSD_RESP_CONT;
>
> - if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
> - return OCSD_RESP_FATAL_SYS_ERR;
> -
> /*
> * A timestamp is generated after a PE_CONTEXT element so make sure
> * to rely on that coming one.
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index a997fe79d458..b9ba19327f26 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -14,7 +14,6 @@
> #include <linux/types.h>
> #include <linux/zalloc.h>
>
> -#include <opencsd/ocsd_if_types.h>
> #include <stdlib.h>
>
> #include "auxtrace.h"
> @@ -87,6 +86,8 @@ struct cs_etm_traceid_queue {
> union perf_event *event_buf;
> struct thread *thread;
> struct thread *prev_thread;
> + ocsd_ex_level prev_el;
> + ocsd_ex_level el;
> struct branch_stack *last_branch;
> struct branch_stack *last_branch_rb;
> struct cs_etm_packet *prev_packet;
> @@ -479,6 +480,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>
> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
> tidq->trace_chan_id = trace_chan_id;
> + tidq->el = tidq->prev_el = ocsd_EL_unknown;
> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> queue->tid);
> tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
> @@ -618,6 +620,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> tmp = tidq->packet;
> tidq->packet = tidq->prev_packet;
> tidq->prev_packet = tmp;
> + tidq->prev_el = tidq->el;
> thread__put(tidq->prev_thread);
> tidq->prev_thread = thread__get(tidq->thread);
> }
> @@ -879,11 +882,34 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session,
> return evsel->core.attr.type == aux->pmu_type;
> }
>
> -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> +static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
> + ocsd_ex_level el)
> {
> - struct machine *machine;
> + /*
> + * Not perfect, but assume anything in EL1 is the default guest, and
> + * everything else is the host. nHVE and pKVM may not work with this

s/nHVE/nVHE ?

And it's good to say "if any virtulisation (e.g. pKVM) based on nVHE may
not work with this ....".

> + * assumption. And distinguishing between guest and host userspaces
> + * isn't currently supported either. Neither is multiple guest support.
> + * All this does is reduce the likeliness of decode errors where we look
> + * into the host kernel maps when it should have been the guest maps.
> + */
> + switch (el) {
> + case ocsd_EL1:
> + return machines__find_guest(&etm->session->machines,
> + DEFAULT_GUEST_KERNEL_ID);

I share the same concern with Mike that this will break perf for any
Armv8 CPUs with nVHE (like Cortex-CA53, etc).

You could see CoreSight has trace data "elem->context.vmid", when a
guest kernel runs in EL1, can we use "elem->context.vmid" as a guest
kernel ID and finally retrieve the corresponding machine pointer?

Thanks,
Leo

> + case ocsd_EL3:
> + case ocsd_EL2:
> + case ocsd_EL0:
> + case ocsd_EL_unknown:
> + default:
> + return &etm->session->machines.host;
> + }
> +}
>
> - machine = &etmq->etm->session->machines.host;
> +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
> + ocsd_ex_level el)
> +{
> + struct machine *machine = cs_etm__get_machine(etmq->etm, el);
>
> if (address >= machine__kernel_start(machine)) {
> if (machine__is_host(machine))
> @@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> } else {
> if (machine__is_host(machine))
> return PERF_RECORD_MISC_USER;
> - else if (perf_guest)
> + else {
> + /*
> + * Can't really happen at the moment because
> + * cs_etm__get_machine() will always return
> + * machines.host for any non EL1 trace.
> + */
> return PERF_RECORD_MISC_GUEST_USER;
> - else
> - return PERF_RECORD_MISC_HYPERVISOR;
> + }
> }
> }
>
> @@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> if (!etmq)
> return 0;
>
> - cpumode = cs_etm__cpu_mode(etmq, address);
> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> if (!tidq)
> return 0;
>
> + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
> +
> if (!thread__find_map(tidq->thread, cpumode, address, &al))
> return 0;
>
> @@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
> }
>
> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> - struct cs_etm_traceid_queue *tidq, pid_t tid)
> + struct cs_etm_traceid_queue *tidq, pid_t tid,
> + ocsd_ex_level el)
> {
> - struct machine *machine = &etm->session->machines.host;
> + struct machine *machine = cs_etm__get_machine(etm, el);
>
> if (tid != -1) {
> thread__zput(tidq->thread);
> @@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> /* Couldn't find a known thread */
> if (!tidq->thread)
> tidq->thread = machine__idle_thread(machine);
> +
> + tidq->el = el;
> }
>
> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> - pid_t tid, u8 trace_chan_id)
> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> + ocsd_ex_level el)
> {
> struct cs_etm_traceid_queue *tidq;
>
> @@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> if (!tidq)
> return -EINVAL;
>
> - cs_etm__set_thread(etmq->etm, tidq, tid);
> + cs_etm__set_thread(etmq->etm, tidq, tid, el);
> return 0;
> }
>
> @@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> struct perf_sample sample = {.ip = 0,};
>
> event->sample.header.type = PERF_RECORD_SAMPLE;
> - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
> + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
> event->sample.header.size = sizeof(struct perf_event_header);
>
> /* Set time field based on etm auxtrace config. */
> @@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> ip = cs_etm__last_executed_instr(tidq->prev_packet);
>
> event->sample.header.type = PERF_RECORD_SAMPLE;
> - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
> + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
> event->sample.header.size = sizeof(struct perf_event_header);
>
> /* Set time field based on etm auxtrace config. */
> diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
> index 70cac0375b34..88e9b25a8a9f 100644
> --- a/tools/perf/util/cs-etm.h
> +++ b/tools/perf/util/cs-etm.h
> @@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
> struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);
>
> #ifdef HAVE_CSTRACE_SUPPORT
> +#include <opencsd/ocsd_if_types.h>
> int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
> int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> - pid_t tid, u8 trace_chan_id);
> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> + ocsd_ex_level el);
> bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
> void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
> u8 trace_chan_id);
> --
> 2.34.1
>

2023-05-30 09:25:59

[permalink] [raw]

Subject: Re: [PATCH 4/4] perf cs-etm: Add exception level consistency check

On 25/05/2023 12:39, Mike Leach wrote:
> Hi James,
>
> My concern here is that for etmv3 trace, OpenCSD will only provide
> memory spaces as either secure or non-secure, The ETMv3 does not
> trace, and hence OpenCSD cannot provide the different ELs.
> The memory callback will be either OCSD_MEM_SPACE_S or OCSD_MEM_SPACE_N.
>

As long as none of the bits are set for EL1-EL3 then no validation will
be done so it should be fine. But I will try to test ETMv3.

> Can this patch - and the set handle this. (assuming perf supports our
> ETMv3 coresight kernel driver)

For the whole set, the tracked EL will stay as ocsd_EL_unknown and
cs_etm__get_machine() will return host so the behavior will be unchanged
from before. This is assuming OpenCSD will set the EL to unknown in
elem->context.exception_level in this case.

>
> Regards
>
> Mike
>
> On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>>
>> Assert that our own tracking of the exception level matches what
>> OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the
>> memory access callback so the extra tracking was required. But a rough
>> assert can still be done.
>>
>> Signed-off-by: James Clark <[email protected]>
>> ---
>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 +--
>> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
>> tools/perf/util/cs-etm.c | 37 +++++++++++++------
>> 3 files changed, 32 insertions(+), 15 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> index ac227cd03eb0..50b3c248d1e5 100644
>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> @@ -52,15 +52,15 @@ struct cs_etm_decoder {
>> static u32
>> cs_etm_decoder__mem_access(const void *context,
>> const ocsd_vaddr_t address,
>> - const ocsd_mem_space_acc_t mem_space __maybe_unused,
>> + const ocsd_mem_space_acc_t mem_space,
>> const u8 trace_chan_id,
>> const u32 req_size,
>> u8 *buffer)
>> {
>> struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
>>
>> - return decoder->mem_access(decoder->data, trace_chan_id,
>> - address, req_size, buffer);
>> + return decoder->mem_access(decoder->data, trace_chan_id, address,
>> + req_size, buffer, mem_space);
>> }
>>
>> int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder,
>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>> index 21d403f55d96..272c2efe78ee 100644
>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>> @@ -11,6 +11,7 @@
>> #define INCLUDE__CS_ETM_DECODER_H__
>>
>> #include <linux/types.h>
>> +#include <opencsd/ocsd_if_types.h>
>> #include <stdio.h>
>>
>> struct cs_etm_decoder;
>> @@ -19,7 +20,8 @@ struct cs_etm_packet_queue;
>>
>> struct cs_etm_queue;
>>
>> -typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *);
>> +typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *,
>> + const ocsd_mem_space_acc_t);
>>
>> struct cs_etmv3_trace_params {
>> u32 reg_ctrl;
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index b9ba19327f26..ccf34ed8ddf2 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -931,7 +931,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
>> }
>>
>> static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> - u64 address, size_t size, u8 *buffer)
>> + u64 address, size_t size, u8 *buffer,
>> + const ocsd_mem_space_acc_t mem_space)
>> {
>> u8 cpumode;
>> u64 offset;
>> @@ -947,6 +948,20 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> if (!tidq)
>> return 0;
>>
>> + /*
>> + * We've already tracked EL along side the PID in cs_etm__set_thread()
>> + * so double check that it matches what OpenCSD thinks as well. It
>> + * doesn't distinguish between EL0 and EL1 for this mem access callback
>> + * so we had to do the extra tracking.
>> + */
>> + if (mem_space & OCSD_MEM_SPACE_EL1N) {
>> + /* Includes both non secure EL1 and EL0 */
>> + assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
>> + } else if (mem_space & OCSD_MEM_SPACE_EL2)
>> + assert(tidq->el == ocsd_EL2);
>> + else if (mem_space & OCSD_MEM_SPACE_EL3)
>> + assert(tidq->el == ocsd_EL3);
>> +
>> cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
>>
>> if (!thread__find_map(tidq->thread, cpumode, address, &al))
>> @@ -1195,8 +1210,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
>> {
>> u8 instrBytes[2];
>>
>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>> - ARRAY_SIZE(instrBytes), instrBytes);
>> + cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes),
>> + instrBytes, 0);
>> /*
>> * T32 instruction size is indicated by bits[15:11] of the first
>> * 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111
>> @@ -1387,8 +1402,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq,
>> else
>> sample->insn_len = 4;
>>
>> - cs_etm__mem_access(etmq, trace_chan_id, sample->ip,
>> - sample->insn_len, (void *)sample->insn);
>> + cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len,
>> + (void *)sample->insn, 0);
>> }
>>
>> u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
>> @@ -1940,8 +1955,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> * so below only read 2 bytes as instruction size for T32.
>> */
>> addr = end_addr - 2;
>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>> - sizeof(instr16), (u8 *)&instr16);
>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16),
>> + (u8 *)&instr16, 0);
>> if ((instr16 & 0xFF00) == 0xDF00)
>> return true;
>>
>> @@ -1956,8 +1971,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> * +---------+---------+-------------------------+
>> */
>> addr = end_addr - 4;
>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>> - sizeof(instr32), (u8 *)&instr32);
>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
>> + (u8 *)&instr32, 0);
>> if ((instr32 & 0x0F000000) == 0x0F000000 &&
>> (instr32 & 0xF0000000) != 0xF0000000)
>> return true;
>> @@ -1973,8 +1988,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> * +-----------------------+---------+-----------+
>> */
>> addr = end_addr - 4;
>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>> - sizeof(instr32), (u8 *)&instr32);
>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
>> + (u8 *)&instr32, 0);
>> if ((instr32 & 0xFFE0001F) == 0xd4000001)
>> return true;
>>
>> --
>> 2.34.1
>>
>
>

2023-05-30 09:40:17

[permalink] [raw]

Subject: Re: [PATCH 3/4] perf cs-etm: Track exception level

On 25/05/2023 12:16, Mike Leach wrote:
> Hi James,
>
> On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>>
>> Currently we assume all trace belongs to the host machine so when
>> the decoder should be looking at the guest kernel maps it can crash
>> because it looks at the host ones instead.
>>
>> Avoid one scenario (guest kernel running at EL1) by assigning the
>> default guest machine to this trace. For userspace trace it's still not
>> possible to determine guest vs host, but the PIDs should help in this
>> case.
>>
>> Signed-off-by: James Clark <[email protected]>
>> ---
>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 +-
>> tools/perf/util/cs-etm.c | 64 ++++++++++++++-----
>> tools/perf/util/cs-etm.h | 5 +-
>> 3 files changed, 56 insertions(+), 20 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> index 82a27ab90c8b..ac227cd03eb0 100644
>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> @@ -573,12 +573,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
>> break;
>> }
>>
>> + if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id,
>> + elem->context.exception_level))
>> + return OCSD_RESP_FATAL_SYS_ERR;
>> +
>> if (tid == -1)
>> return OCSD_RESP_CONT;
>>
>> - if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
>> - return OCSD_RESP_FATAL_SYS_ERR;
>> -
>> /*
>> * A timestamp is generated after a PE_CONTEXT element so make sure
>> * to rely on that coming one.
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index a997fe79d458..b9ba19327f26 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -14,7 +14,6 @@
>> #include <linux/types.h>
>> #include <linux/zalloc.h>
>>
>> -#include <opencsd/ocsd_if_types.h>
>> #include <stdlib.h>
>>
>> #include "auxtrace.h"
>> @@ -87,6 +86,8 @@ struct cs_etm_traceid_queue {
>> union perf_event *event_buf;
>> struct thread *thread;
>> struct thread *prev_thread;
>> + ocsd_ex_level prev_el;
>> + ocsd_ex_level el;
>> struct branch_stack *last_branch;
>> struct branch_stack *last_branch_rb;
>> struct cs_etm_packet *prev_packet;
>> @@ -479,6 +480,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>>
>> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
>> tidq->trace_chan_id = trace_chan_id;
>> + tidq->el = tidq->prev_el = ocsd_EL_unknown;
>> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
>> queue->tid);
>> tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>> @@ -618,6 +620,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
>> tmp = tidq->packet;
>> tidq->packet = tidq->prev_packet;
>> tidq->prev_packet = tmp;
>> + tidq->prev_el = tidq->el;
>> thread__put(tidq->prev_thread);
>> tidq->prev_thread = thread__get(tidq->thread);
>> }
>> @@ -879,11 +882,34 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session,
>> return evsel->core.attr.type == aux->pmu_type;
>> }
>>
>> -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
>> +static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
>> + ocsd_ex_level el)
>> {
>> - struct machine *machine;
>> + /*
>> + * Not perfect, but assume anything in EL1 is the default guest, and
>> + * everything else is the host. nHVE and pKVM may not work with this
>> + * assumption. And distinguishing between guest and host userspaces
>> + * isn't currently supported either. Neither is multiple guest support.
>> + * All this does is reduce the likeliness of decode errors where we look
>> + * into the host kernel maps when it should have been the guest maps.
>> + */
>
> What effect does this have if I am running with host only, kernel at
> EL1, e.g. any platform that is not running an EL2 kernel?

I suppose I didn't think about that case. It looks like you'd have to
start using --guest-vmlinux instead of --vmlinux to decode. But that's
probably not what we want.

Is that a standard configuration though? I'm wondering if we need to
support that out of the box, or needing the extra command line argument
is ok? It seems like it's hard to make anything just work without the
user providing extra info.

After this change we also wanted to start setting exclude_guest=1 by
default, so technically this change is only needed when exclude_guest=0.
Otherwise we could assume it's always the host regardless of the EL. It
should be quite easy to check what that setting was in this switch
statement, but it would be temporarily broken in the mean time. And also
we can only make exclude_guest work with FEAT_TRF (v8.4 onwards)

>
> Mike
>
>
>> + switch (el) {
>> + case ocsd_EL1:
>> + return machines__find_guest(&etm->session->machines,
>> + DEFAULT_GUEST_KERNEL_ID);
>> + case ocsd_EL3:
>> + case ocsd_EL2:
>> + case ocsd_EL0:
>> + case ocsd_EL_unknown:
>> + default:
>> + return &etm->session->machines.host;
>> + }
>> +}
>>
>> - machine = &etmq->etm->session->machines.host;
>> +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
>> + ocsd_ex_level el)
>> +{
>> + struct machine *machine = cs_etm__get_machine(etmq->etm, el);
>>
>> if (address >= machine__kernel_start(machine)) {
>> if (machine__is_host(machine))
>> @@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
>> } else {
>> if (machine__is_host(machine))
>> return PERF_RECORD_MISC_USER;
>> - else if (perf_guest)
>> + else {
>> + /*
>> + * Can't really happen at the moment because
>> + * cs_etm__get_machine() will always return
>> + * machines.host for any non EL1 trace.
>> + */
>> return PERF_RECORD_MISC_GUEST_USER;
>> - else
>> - return PERF_RECORD_MISC_HYPERVISOR;
>> + }
>> }
>> }
>>
>> @@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> if (!etmq)
>> return 0;
>>
>> - cpumode = cs_etm__cpu_mode(etmq, address);
>> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
>> if (!tidq)
>> return 0;
>>
>> + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
>> +
>> if (!thread__find_map(tidq->thread, cpumode, address, &al))
>> return 0;
>>
>> @@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
>> }
>>
>> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
>> - struct cs_etm_traceid_queue *tidq, pid_t tid)
>> + struct cs_etm_traceid_queue *tidq, pid_t tid,
>> + ocsd_ex_level el)
>> {
>> - struct machine *machine = &etm->session->machines.host;
>> + struct machine *machine = cs_etm__get_machine(etm, el);
>>
>> if (tid != -1) {
>> thread__zput(tidq->thread);
>> @@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
>> /* Couldn't find a known thread */
>> if (!tidq->thread)
>> tidq->thread = machine__idle_thread(machine);
>> +
>> + tidq->el = el;
>> }
>>
>> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> - pid_t tid, u8 trace_chan_id)
>> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
>> + ocsd_ex_level el)
>> {
>> struct cs_etm_traceid_queue *tidq;
>>
>> @@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> if (!tidq)
>> return -EINVAL;
>>
>> - cs_etm__set_thread(etmq->etm, tidq, tid);
>> + cs_etm__set_thread(etmq->etm, tidq, tid, el);
>> return 0;
>> }
>>
>> @@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>> struct perf_sample sample = {.ip = 0,};
>>
>> event->sample.header.type = PERF_RECORD_SAMPLE;
>> - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
>> + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
>> event->sample.header.size = sizeof(struct perf_event_header);
>>
>> /* Set time field based on etm auxtrace config. */
>> @@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>> ip = cs_etm__last_executed_instr(tidq->prev_packet);
>>
>> event->sample.header.type = PERF_RECORD_SAMPLE;
>> - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
>> + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
>> event->sample.header.size = sizeof(struct perf_event_header);
>>
>> /* Set time field based on etm auxtrace config. */
>> diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
>> index 70cac0375b34..88e9b25a8a9f 100644
>> --- a/tools/perf/util/cs-etm.h
>> +++ b/tools/perf/util/cs-etm.h
>> @@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
>> struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);
>>
>> #ifdef HAVE_CSTRACE_SUPPORT
>> +#include <opencsd/ocsd_if_types.h>
>> int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
>> int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
>> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> - pid_t tid, u8 trace_chan_id);
>> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
>> + ocsd_ex_level el);
>> bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
>> void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
>> u8 trace_chan_id);
>> --
>> 2.34.1
>>
>
>

2023-05-30 09:54:14

[permalink] [raw]

Subject: Re: [PATCH 3/4] perf cs-etm: Track exception level

On 28/05/2023 12:05, Leo Yan wrote:
> On Wed, May 24, 2023 at 02:19:57PM +0100, James Clark wrote:
>> Currently we assume all trace belongs to the host machine so when
>> the decoder should be looking at the guest kernel maps it can crash
>> because it looks at the host ones instead.
>>
>> Avoid one scenario (guest kernel running at EL1) by assigning the
>> default guest machine to this trace. For userspace trace it's still not
>> possible to determine guest vs host, but the PIDs should help in this
>> case.
>>
>> Signed-off-by: James Clark <[email protected]>
>> ---
>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 7 +-
>> tools/perf/util/cs-etm.c | 64 ++++++++++++++-----
>> tools/perf/util/cs-etm.h | 5 +-
>> 3 files changed, 56 insertions(+), 20 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> index 82a27ab90c8b..ac227cd03eb0 100644
>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>> @@ -573,12 +573,13 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
>> break;
>> }
>>
>> + if (cs_etm__etmq_set_tid_el(etmq, tid, trace_chan_id,
>> + elem->context.exception_level))
>> + return OCSD_RESP_FATAL_SYS_ERR;
>> +
>> if (tid == -1)
>> return OCSD_RESP_CONT;
>>
>> - if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
>> - return OCSD_RESP_FATAL_SYS_ERR;
>> -
>> /*
>> * A timestamp is generated after a PE_CONTEXT element so make sure
>> * to rely on that coming one.
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index a997fe79d458..b9ba19327f26 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -14,7 +14,6 @@
>> #include <linux/types.h>
>> #include <linux/zalloc.h>
>>
>> -#include <opencsd/ocsd_if_types.h>
>> #include <stdlib.h>
>>
>> #include "auxtrace.h"
>> @@ -87,6 +86,8 @@ struct cs_etm_traceid_queue {
>> union perf_event *event_buf;
>> struct thread *thread;
>> struct thread *prev_thread;
>> + ocsd_ex_level prev_el;
>> + ocsd_ex_level el;
>> struct branch_stack *last_branch;
>> struct branch_stack *last_branch_rb;
>> struct cs_etm_packet *prev_packet;
>> @@ -479,6 +480,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>>
>> queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
>> tidq->trace_chan_id = trace_chan_id;
>> + tidq->el = tidq->prev_el = ocsd_EL_unknown;
>> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
>> queue->tid);
>> tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>> @@ -618,6 +620,7 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
>> tmp = tidq->packet;
>> tidq->packet = tidq->prev_packet;
>> tidq->prev_packet = tmp;
>> + tidq->prev_el = tidq->el;
>> thread__put(tidq->prev_thread);
>> tidq->prev_thread = thread__get(tidq->thread);
>> }
>> @@ -879,11 +882,34 @@ static bool cs_etm__evsel_is_auxtrace(struct perf_session *session,
>> return evsel->core.attr.type == aux->pmu_type;
>> }
>>
>> -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
>> +static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
>> + ocsd_ex_level el)
>> {
>> - struct machine *machine;
>> + /*
>> + * Not perfect, but assume anything in EL1 is the default guest, and
>> + * everything else is the host. nHVE and pKVM may not work with this
>
> s/nHVE/nVHE ?
>
> And it's good to say "if any virtulisation (e.g. pKVM) based on nVHE may
> not work with this ....".
>
>> + * assumption. And distinguishing between guest and host userspaces
>> + * isn't currently supported either. Neither is multiple guest support.
>> + * All this does is reduce the likeliness of decode errors where we look
>> + * into the host kernel maps when it should have been the guest maps.
>> + */
>> + switch (el) {
>> + case ocsd_EL1:
>> + return machines__find_guest(&etm->session->machines,
>> + DEFAULT_GUEST_KERNEL_ID);
>
> I share the same concern with Mike that this will break perf for any
> Armv8 CPUs with nVHE (like Cortex-CA53, etc).
>

I agree, I replied to Mike's comment with some thoughts.

> You could see CoreSight has trace data "elem->context.vmid", when a
> guest kernel runs in EL1, can we use "elem->context.vmid" as a guest
> kernel ID and finally retrieve the corresponding machine pointer?
>

Do you mean something like this?

static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
ocsd_ex_level el)
{
u64 pid_fmt;

cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt);

if (pid_fmt == BIT(ETM_OPT_CTXTID))
return return &etm->session->machines.host;

switch (el) {
case ocsd_EL1:
return machines__find_guest(&etm->session->machines,
DEFAULT_GUEST_KERNEL_ID);
case ocsd_EL3:
case ocsd_EL2:
case ocsd_EL0:
case ocsd_EL_unknown:
default:
return &etm->session->machines.host;
}
}

I think this might work, maybe it's not as bad as I thought and we can
make it work out of the box.

> Thanks,
> Leo
>
>> + case ocsd_EL3:
>> + case ocsd_EL2:
>> + case ocsd_EL0:
>> + case ocsd_EL_unknown:
>> + default:
>> + return &etm->session->machines.host;
>> + }
>> +}
>>
>> - machine = &etmq->etm->session->machines.host;
>> +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
>> + ocsd_ex_level el)
>> +{
>> + struct machine *machine = cs_etm__get_machine(etmq->etm, el);
>>
>> if (address >= machine__kernel_start(machine)) {
>> if (machine__is_host(machine))
>> @@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
>> } else {
>> if (machine__is_host(machine))
>> return PERF_RECORD_MISC_USER;
>> - else if (perf_guest)
>> + else {
>> + /*
>> + * Can't really happen at the moment because
>> + * cs_etm__get_machine() will always return
>> + * machines.host for any non EL1 trace.
>> + */
>> return PERF_RECORD_MISC_GUEST_USER;
>> - else
>> - return PERF_RECORD_MISC_HYPERVISOR;
>> + }
>> }
>> }
>>
>> @@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>> if (!etmq)
>> return 0;
>>
>> - cpumode = cs_etm__cpu_mode(etmq, address);
>> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
>> if (!tidq)
>> return 0;
>>
>> + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
>> +
>> if (!thread__find_map(tidq->thread, cpumode, address, &al))
>> return 0;
>>
>> @@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
>> }
>>
>> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
>> - struct cs_etm_traceid_queue *tidq, pid_t tid)
>> + struct cs_etm_traceid_queue *tidq, pid_t tid,
>> + ocsd_ex_level el)
>> {
>> - struct machine *machine = &etm->session->machines.host;
>> + struct machine *machine = cs_etm__get_machine(etm, el);
>>
>> if (tid != -1) {
>> thread__zput(tidq->thread);
>> @@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
>> /* Couldn't find a known thread */
>> if (!tidq->thread)
>> tidq->thread = machine__idle_thread(machine);
>> +
>> + tidq->el = el;
>> }
>>
>> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> - pid_t tid, u8 trace_chan_id)
>> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
>> + ocsd_ex_level el)
>> {
>> struct cs_etm_traceid_queue *tidq;
>>
>> @@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> if (!tidq)
>> return -EINVAL;
>>
>> - cs_etm__set_thread(etmq->etm, tidq, tid);
>> + cs_etm__set_thread(etmq->etm, tidq, tid, el);
>> return 0;
>> }
>>
>> @@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>> struct perf_sample sample = {.ip = 0,};
>>
>> event->sample.header.type = PERF_RECORD_SAMPLE;
>> - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
>> + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
>> event->sample.header.size = sizeof(struct perf_event_header);
>>
>> /* Set time field based on etm auxtrace config. */
>> @@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>> ip = cs_etm__last_executed_instr(tidq->prev_packet);
>>
>> event->sample.header.type = PERF_RECORD_SAMPLE;
>> - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
>> + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
>> event->sample.header.size = sizeof(struct perf_event_header);
>>
>> /* Set time field based on etm auxtrace config. */
>> diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
>> index 70cac0375b34..88e9b25a8a9f 100644
>> --- a/tools/perf/util/cs-etm.h
>> +++ b/tools/perf/util/cs-etm.h
>> @@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
>> struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);
>>
>> #ifdef HAVE_CSTRACE_SUPPORT
>> +#include <opencsd/ocsd_if_types.h>
>> int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
>> int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
>> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
>> - pid_t tid, u8 trace_chan_id);
>> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
>> + ocsd_ex_level el);
>> bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
>> void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
>> u8 trace_chan_id);
>> --
>> 2.34.1
>>

2023-05-30 11:07:46

[permalink] [raw]

Subject: Re: [PATCH 4/4] perf cs-etm: Add exception level consistency check

Hi James,

Further thought here - this consistency check may be redundant.

You are setting the EL according to that in the traced context packet.
This is correct and sufficient to match the EL that OpenCSD is
expecting, if added to the security state from the same packet. The EL
memory space values that OpenCSD provides in the memory callback are
based on that same data from this context packet.
Thus it is not necessary to use the EL in the memory callback. This is
primarily used for matching when searching for a suitable memory
source - especially when clients register files etc with the library,
rather than provide a global callback as perf does.

Regards

Mike

On Tue, 30 May 2023 at 10:12, James Clark <[email protected]> wrote:
>
>
>
> On 25/05/2023 12:39, Mike Leach wrote:
> > Hi James,
> >
> > My concern here is that for etmv3 trace, OpenCSD will only provide
> > memory spaces as either secure or non-secure, The ETMv3 does not
> > trace, and hence OpenCSD cannot provide the different ELs.
> > The memory callback will be either OCSD_MEM_SPACE_S or OCSD_MEM_SPACE_N.
> >
>
> As long as none of the bits are set for EL1-EL3 then no validation will
> be done so it should be fine. But I will try to test ETMv3.
>
> > Can this patch - and the set handle this. (assuming perf supports our
> > ETMv3 coresight kernel driver)
>
> For the whole set, the tracked EL will stay as ocsd_EL_unknown and
> cs_etm__get_machine() will return host so the behavior will be unchanged
> from before. This is assuming OpenCSD will set the EL to unknown in
> elem->context.exception_level in this case.
>
> >
> > Regards
> >
> > Mike
> >
> > On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
> >>
> >> Assert that our own tracking of the exception level matches what
> >> OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the
> >> memory access callback so the extra tracking was required. But a rough
> >> assert can still be done.
> >>
> >> Signed-off-by: James Clark <[email protected]>
> >> ---
> >> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 +--
> >> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
> >> tools/perf/util/cs-etm.c | 37 +++++++++++++------
> >> 3 files changed, 32 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> >> index ac227cd03eb0..50b3c248d1e5 100644
> >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
> >> @@ -52,15 +52,15 @@ struct cs_etm_decoder {
> >> static u32
> >> cs_etm_decoder__mem_access(const void *context,
> >> const ocsd_vaddr_t address,
> >> - const ocsd_mem_space_acc_t mem_space __maybe_unused,
> >> + const ocsd_mem_space_acc_t mem_space,
> >> const u8 trace_chan_id,
> >> const u32 req_size,
> >> u8 *buffer)
> >> {
> >> struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
> >>
> >> - return decoder->mem_access(decoder->data, trace_chan_id,
> >> - address, req_size, buffer);
> >> + return decoder->mem_access(decoder->data, trace_chan_id, address,
> >> + req_size, buffer, mem_space);
> >> }
> >>
> >> int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder,
> >> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> >> index 21d403f55d96..272c2efe78ee 100644
> >> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> >> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
> >> @@ -11,6 +11,7 @@
> >> #define INCLUDE__CS_ETM_DECODER_H__
> >>
> >> #include <linux/types.h>
> >> +#include <opencsd/ocsd_if_types.h>
> >> #include <stdio.h>
> >>
> >> struct cs_etm_decoder;
> >> @@ -19,7 +20,8 @@ struct cs_etm_packet_queue;
> >>
> >> struct cs_etm_queue;
> >>
> >> -typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *);
> >> +typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *,
> >> + const ocsd_mem_space_acc_t);
> >>
> >> struct cs_etmv3_trace_params {
> >> u32 reg_ctrl;
> >> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> >> index b9ba19327f26..ccf34ed8ddf2 100644
> >> --- a/tools/perf/util/cs-etm.c
> >> +++ b/tools/perf/util/cs-etm.c
> >> @@ -931,7 +931,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
> >> }
> >>
> >> static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> - u64 address, size_t size, u8 *buffer)
> >> + u64 address, size_t size, u8 *buffer,
> >> + const ocsd_mem_space_acc_t mem_space)
> >> {
> >> u8 cpumode;
> >> u64 offset;
> >> @@ -947,6 +948,20 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> if (!tidq)
> >> return 0;
> >>
> >> + /*
> >> + * We've already tracked EL along side the PID in cs_etm__set_thread()
> >> + * so double check that it matches what OpenCSD thinks as well. It
> >> + * doesn't distinguish between EL0 and EL1 for this mem access callback
> >> + * so we had to do the extra tracking.
> >> + */
> >> + if (mem_space & OCSD_MEM_SPACE_EL1N) {
> >> + /* Includes both non secure EL1 and EL0 */
> >> + assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
> >> + } else if (mem_space & OCSD_MEM_SPACE_EL2)
> >> + assert(tidq->el == ocsd_EL2);
> >> + else if (mem_space & OCSD_MEM_SPACE_EL3)
> >> + assert(tidq->el == ocsd_EL3);
> >> +
> >> cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
> >>
> >> if (!thread__find_map(tidq->thread, cpumode, address, &al))
> >> @@ -1195,8 +1210,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
> >> {
> >> u8 instrBytes[2];
> >>
> >> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> >> - ARRAY_SIZE(instrBytes), instrBytes);
> >> + cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes),
> >> + instrBytes, 0);
> >> /*
> >> * T32 instruction size is indicated by bits[15:11] of the first
> >> * 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111
> >> @@ -1387,8 +1402,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq,
> >> else
> >> sample->insn_len = 4;
> >>
> >> - cs_etm__mem_access(etmq, trace_chan_id, sample->ip,
> >> - sample->insn_len, (void *)sample->insn);
> >> + cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len,
> >> + (void *)sample->insn, 0);
> >> }
> >>
> >> u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
> >> @@ -1940,8 +1955,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> * so below only read 2 bytes as instruction size for T32.
> >> */
> >> addr = end_addr - 2;
> >> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> >> - sizeof(instr16), (u8 *)&instr16);
> >> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16),
> >> + (u8 *)&instr16, 0);
> >> if ((instr16 & 0xFF00) == 0xDF00)
> >> return true;
> >>
> >> @@ -1956,8 +1971,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> * +---------+---------+-------------------------+
> >> */
> >> addr = end_addr - 4;
> >> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> >> - sizeof(instr32), (u8 *)&instr32);
> >> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
> >> + (u8 *)&instr32, 0);
> >> if ((instr32 & 0x0F000000) == 0x0F000000 &&
> >> (instr32 & 0xF0000000) != 0xF0000000)
> >> return true;
> >> @@ -1973,8 +1988,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> * +-----------------------+---------+-----------+
> >> */
> >> addr = end_addr - 4;
> >> - cs_etm__mem_access(etmq, trace_chan_id, addr,
> >> - sizeof(instr32), (u8 *)&instr32);
> >> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
> >> + (u8 *)&instr32, 0);
> >> if ((instr32 & 0xFFE0001F) == 0xd4000001)
> >> return true;
> >>
> >> --
> >> 2.34.1
> >>
> >
> >

--
Mike Leach
Principal Engineer, ARM Ltd.
Manchester Design Centre. UK

2023-05-30 13:35:57

[permalink] [raw]

Subject: Re: [PATCH 3/4] perf cs-etm: Track exception level

On Tue, May 30, 2023 at 10:24:45AM +0100, James Clark wrote:

[...]

> >> -static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> >> +static struct machine *cs_etm__get_machine(struct cs_etm_auxtrace *etm,
> >> + ocsd_ex_level el)
> >> {
> >> - struct machine *machine;
> >> + /*
> >> + * Not perfect, but assume anything in EL1 is the default guest, and
> >> + * everything else is the host. nHVE and pKVM may not work with this
> >> + * assumption. And distinguishing between guest and host userspaces
> >> + * isn't currently supported either. Neither is multiple guest support.
> >> + * All this does is reduce the likeliness of decode errors where we look
> >> + * into the host kernel maps when it should have been the guest maps.
> >> + */
> >
> > What effect does this have if I am running with host only, kernel at
> > EL1, e.g. any platform that is not running an EL2 kernel?
>
> I suppose I didn't think about that case. It looks like you'd have to
> start using --guest-vmlinux instead of --vmlinux to decode. But that's
> probably not what we want.

IIUC, it's not about how to pass kernel image path with options
"--guest-vmlinux" or "--vmlinux".

Let's think a case: Cortex-A53 CPU which doesn't support VHE, or any
Arm CPUs which disable the VHE feature. In this case, the host OS and
guest OS both run in EL1, only the a KVM hypervisor runs in EL2.

You could see with below change, no matter the host OS and guest OS,
both use DEFAULT_GUEST_KERNEL_ID to retrieve the guest machine. At
the end, this change will break perf tool for host OS.

> Is that a standard configuration though? I'm wondering if we need to
> support that out of the box, or needing the extra command line argument
> is ok? It seems like it's hard to make anything just work without the
> user providing extra info.

Essentially, we need more info to make decision if a system runs in host
or guest, if only depend on it's EL1 or other execptions, it's hard to
say it's running in host or guest. We can look into more details:

- If CPU enables VHE, then the host kernel runs in EL2 and the guest
kernel runs in EL1;

- If CPU only enables nVHE (or CPU doesn't support VHE at all), then
both the host kernel and the guest kernel runs in EL1;

I think we need to retrieve more info from /sys/ node or /proc node:

- We need to know if VHE is enabled or not;

- If CPU doesn't enable VHE, then both host and guest kernels run in
EL1, so we need to use extra info to distinguish it's a host kernel
or guest kernel.

To be honest, I don't have answers for above two items. I will think
a bit more and will share back if have any finding.

Thanks,
Leo

> After this change we also wanted to start setting exclude_guest=1 by
> default, so technically this change is only needed when exclude_guest=0.
> Otherwise we could assume it's always the host regardless of the EL. It
> should be quite easy to check what that setting was in this switch
> statement, but it would be temporarily broken in the mean time. And also
> we can only make exclude_guest work with FEAT_TRF (v8.4 onwards)
>
> >
> > Mike
> >
> >
> >> + switch (el) {
> >> + case ocsd_EL1:
> >> + return machines__find_guest(&etm->session->machines,
> >> + DEFAULT_GUEST_KERNEL_ID);
> >> + case ocsd_EL3:
> >> + case ocsd_EL2:
> >> + case ocsd_EL0:
> >> + case ocsd_EL_unknown:
> >> + default:
> >> + return &etm->session->machines.host;
> >> + }
> >> +}
> >>
> >> - machine = &etmq->etm->session->machines.host;
> >> +static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
> >> + ocsd_ex_level el)
> >> +{
> >> + struct machine *machine = cs_etm__get_machine(etmq->etm, el);
> >>
> >> if (address >= machine__kernel_start(machine)) {
> >> if (machine__is_host(machine))
> >> @@ -893,10 +919,14 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address)
> >> } else {
> >> if (machine__is_host(machine))
> >> return PERF_RECORD_MISC_USER;
> >> - else if (perf_guest)
> >> + else {
> >> + /*
> >> + * Can't really happen at the moment because
> >> + * cs_etm__get_machine() will always return
> >> + * machines.host for any non EL1 trace.
> >> + */
> >> return PERF_RECORD_MISC_GUEST_USER;
> >> - else
> >> - return PERF_RECORD_MISC_HYPERVISOR;
> >> + }
> >> }
> >> }
> >>
> >> @@ -913,11 +943,12 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
> >> if (!etmq)
> >> return 0;
> >>
> >> - cpumode = cs_etm__cpu_mode(etmq, address);
> >> tidq = cs_etm__etmq_get_traceid_queue(etmq, trace_chan_id);
> >> if (!tidq)
> >> return 0;
> >>
> >> + cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
> >> +
> >> if (!thread__find_map(tidq->thread, cpumode, address, &al))
> >> return 0;
> >>
> >> @@ -1296,9 +1327,10 @@ cs_etm__get_trace(struct cs_etm_queue *etmq)
> >> }
> >>
> >> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> >> - struct cs_etm_traceid_queue *tidq, pid_t tid)
> >> + struct cs_etm_traceid_queue *tidq, pid_t tid,
> >> + ocsd_ex_level el)
> >> {
> >> - struct machine *machine = &etm->session->machines.host;
> >> + struct machine *machine = cs_etm__get_machine(etm, el);
> >>
> >> if (tid != -1) {
> >> thread__zput(tidq->thread);
> >> @@ -1308,10 +1340,12 @@ static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> >> /* Couldn't find a known thread */
> >> if (!tidq->thread)
> >> tidq->thread = machine__idle_thread(machine);
> >> +
> >> + tidq->el = el;
> >> }
> >>
> >> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> >> - pid_t tid, u8 trace_chan_id)
> >> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> >> + ocsd_ex_level el)
> >> {
> >> struct cs_etm_traceid_queue *tidq;
> >>
> >> @@ -1319,7 +1353,7 @@ int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> >> if (!tidq)
> >> return -EINVAL;
> >>
> >> - cs_etm__set_thread(etmq->etm, tidq, tid);
> >> + cs_etm__set_thread(etmq->etm, tidq, tid, el);
> >> return 0;
> >> }
> >>
> >> @@ -1389,7 +1423,7 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
> >> struct perf_sample sample = {.ip = 0,};
> >>
> >> event->sample.header.type = PERF_RECORD_SAMPLE;
> >> - event->sample.header.misc = cs_etm__cpu_mode(etmq, addr);
> >> + event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
> >> event->sample.header.size = sizeof(struct perf_event_header);
> >>
> >> /* Set time field based on etm auxtrace config. */
> >> @@ -1448,7 +1482,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
> >> ip = cs_etm__last_executed_instr(tidq->prev_packet);
> >>
> >> event->sample.header.type = PERF_RECORD_SAMPLE;
> >> - event->sample.header.misc = cs_etm__cpu_mode(etmq, ip);
> >> + event->sample.header.misc = cs_etm__cpu_mode(etmq, ip, tidq->prev_el);
> >> event->sample.header.size = sizeof(struct perf_event_header);
> >>
> >> /* Set time field based on etm auxtrace config. */
> >> diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
> >> index 70cac0375b34..88e9b25a8a9f 100644
> >> --- a/tools/perf/util/cs-etm.h
> >> +++ b/tools/perf/util/cs-etm.h
> >> @@ -232,10 +232,11 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
> >> struct perf_event_attr *cs_etm_get_default_config(struct perf_pmu *pmu);
> >>
> >> #ifdef HAVE_CSTRACE_SUPPORT
> >> +#include <opencsd/ocsd_if_types.h>
> >> int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
> >> int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
> >> -int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
> >> - pid_t tid, u8 trace_chan_id);
> >> +int cs_etm__etmq_set_tid_el(struct cs_etm_queue *etmq, pid_t tid, u8 trace_chan_id,
> >> + ocsd_ex_level el);
> >> bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
> >> void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
> >> u8 trace_chan_id);
> >> --
> >> 2.34.1
> >>
> >
> >

2023-05-30 14:33:17

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On 27/05/2023 10:06, Leo Yan wrote:
> On Wed, May 24, 2023 at 02:19:56PM +0100, James Clark wrote:
>> Branch samples currently use the IP of the previous packet as the from
>> IP, and the IP of the current packet as the to IP. But it incorrectly
>> uses the current thread. In some cases like a jump into a different
>> exception level this will attribute to the incorrect process.
>
> It's about the timing that branch has taken or not taken :)
>
> If we think the branch sample as 'branch has taken', then current code
> is doning right thing, otherwise, we need this fix.
>

If you diff the outputs side by side you can see it mainly has an effect
where there is a discontinuity. At this point we set either the from or
the to IPs to 0.

For example here is a before and after perf script output. Without the
change it looks like stress was running before it actually was. The
schedule function that was attributed to ls on the first line hasn't
finished running yet. But it's attributed to stress on the second line
even though the destination IP is 0 meaning we don't even know where it
went.

Before:

ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
stress 8357 [006] ... schedule+0x84 => 0 [unknown]
stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130

After:

ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
ls 8357 [006] ... schedule+0x84 => 0 [unknown]
stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130

I didn't see any decode differences that weren't around these
discontinuity points, so it seems like a low risk change.

>> Fix it by tracking the previous thread in the same way the previous
>> packet is tracked.
>>
>> Signed-off-by: James Clark <[email protected]>
>> ---
>> tools/perf/util/cs-etm.c | 9 +++++++--
>> 1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>> index ebffc9052561..a997fe79d458 100644
>> --- a/tools/perf/util/cs-etm.c
>> +++ b/tools/perf/util/cs-etm.c
>> @@ -86,6 +86,7 @@ struct cs_etm_traceid_queue {
>> size_t last_branch_pos;
>> union perf_event *event_buf;
>> struct thread *thread;
>> + struct thread *prev_thread;
>> struct branch_stack *last_branch;
>> struct branch_stack *last_branch_rb;
>> struct cs_etm_packet *prev_packet;
>> @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>> tidq->trace_chan_id = trace_chan_id;
>> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
>> queue->tid);
>> + tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>>
>> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
>> if (!tidq->packet)
>> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
>> tmp = tidq->packet;
>> tidq->packet = tidq->prev_packet;
>> tidq->prev_packet = tmp;
>> + thread__put(tidq->prev_thread);
>> + tidq->prev_thread = thread__get(tidq->thread);
>
> Maybe cs_etm__packet_swap() is not the best place to update
> "tidq->prev_thread", since swapping packet doesn't mean it's necessarily
> thread switching; can we move this change into the cs_etm__set_thread()?
>

Yeah that might make more sense. I can move it there if we decide to
keep this change.

> Thanks,
> Leo
>
>> }
>> }
>>
>> @@ -791,6 +795,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
>> /* Free this traceid_queue from the array */
>> tidq = etmq->traceid_queues[idx];
>> thread__zput(tidq->thread);
>> + thread__zput(tidq->prev_thread);
>> zfree(&tidq->event_buf);
>> zfree(&tidq->last_branch);
>> zfree(&tidq->last_branch_rb);
>> @@ -1450,8 +1455,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>>
>> sample.ip = ip;
>> - sample.pid = tidq->thread->pid_;
>> - sample.tid = tidq->thread->tid;
>> + sample.pid = tidq->prev_thread->pid_;
>> + sample.tid = tidq->prev_thread->tid;
>> sample.addr = cs_etm__first_executed_instr(tidq->packet);
>> sample.id = etmq->etm->branches_id;
>> sample.stream_id = etmq->etm->branches_id;
>> --
>> 2.34.1
>>

2023-06-05 20:06:35

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH 0/4] perf cs-etm: Track exception level

Em Wed, May 24, 2023 at 02:19:54PM +0100, James Clark escreveu:
> Some fixes to support an issue reported by Denis Nikitin where decoding
> trace that contains different EL1 and EL2 kernels can crash or go into
> an infinite loop because the wrong kernel maps are used for the decode.
>
> This still doesn't support distinguishing guest and host userspace,
> we'd still have to fix the timestamps and do a bit more work to
> correlate that. And I've removed PERF_RECORD_MISC_HYPERVISOR as a
> possible outcome of cs_etm__cpu_mode(). As far as I know this could
> never have been returned anyway because machine__is_host(machine) was
> always true due to session.machines.host being hard coded. And I'm not
> sure of the relevance of the difference between PERF_RECORD_MISC_KERNEL
> and PERF_RECORD_MISC_HYPERVISOR in this scenario.
>
> The first commit is a tidy up, second fixes a bug that I found when
> comparing the exception level and thread of branch records, the third
> is the main fix, and the last commit is some extra error checking.
>
> Applies to acme/perf-tools (4e111f0cf0)

So there seems to be agreement the first two patches can be applied? May
I go ahead and do that now?

- Arnaldo

> James Clark (4):
> perf cs-etm: Only track threads instead of PID and TIDs
> perf cs-etm: Use previous thread for branch sample source IP
> perf cs-etm: Track exception level
> perf cs-etm: Add exception level consistency check
>
> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 13 +-
> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
> tools/perf/util/cs-etm.c | 220 +++++++++---------
> tools/perf/util/cs-etm.h | 5 +-
> 4 files changed, 126 insertions(+), 116 deletions(-)
>
> --
> 2.34.1
>

--

- Arnaldo

2023-06-05 22:33:58

by Suzuki K Poulose

[permalink] [raw]

Subject: Re: [PATCH 1/4] perf cs-etm: Only track threads instead of PID and TIDs

On 24/05/2023 14:19, James Clark wrote:
> PIDs and TIDs are already contained within the thread struct, so to
> avoid inconsistencies drop the extra members on the etm queue and only
> use the thread struct.
>
> At the same time stop using the 'unknown' thread. In a later commit
> we will be making samples from multiple machines so it will be better
> to use the idle thread of each machine rather than overlapping unknown
> threads. Using the idle thread is also better because kernel addresses
> with a previously unknown thread will now be assigned to a real kernel
> thread.
>
> Signed-off-by: James Clark <[email protected]>

Acked-by: Suzuki K Poulose <[email protected]>

2023-06-06 01:04:51

[permalink] [raw]

Subject: Re: [PATCH 0/4] perf cs-etm: Track exception level

Hi Arnaldo,

On Mon, Jun 05, 2023 at 04:44:45PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Wed, May 24, 2023 at 02:19:54PM +0100, James Clark escreveu:
> > Some fixes to support an issue reported by Denis Nikitin where decoding
> > trace that contains different EL1 and EL2 kernels can crash or go into
> > an infinite loop because the wrong kernel maps are used for the decode.
> >
> > This still doesn't support distinguishing guest and host userspace,
> > we'd still have to fix the timestamps and do a bit more work to
> > correlate that. And I've removed PERF_RECORD_MISC_HYPERVISOR as a
> > possible outcome of cs_etm__cpu_mode(). As far as I know this could
> > never have been returned anyway because machine__is_host(machine) was
> > always true due to session.machines.host being hard coded. And I'm not
> > sure of the relevance of the difference between PERF_RECORD_MISC_KERNEL
> > and PERF_RECORD_MISC_HYPERVISOR in this scenario.
> >
> > The first commit is a tidy up, second fixes a bug that I found when
> > comparing the exception level and thread of branch records, the third
> > is the main fix, and the last commit is some extra error checking.
> >
> > Applies to acme/perf-tools (4e111f0cf0)
>
> So there seems to be agreement the first two patches can be applied? May
> I go ahead and do that now?

Could you pick up the first patch in this series?

I would like ask James to refine a bit for the second patch.

Thanks,
Leo

2023-06-06 01:11:20

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On Tue, May 30, 2023 at 03:28:09PM +0100, James Clark wrote:

[...]

> > On Wed, May 24, 2023 at 02:19:56PM +0100, James Clark wrote:
> >> Branch samples currently use the IP of the previous packet as the from
> >> IP, and the IP of the current packet as the to IP. But it incorrectly
> >> uses the current thread. In some cases like a jump into a different
> >> exception level this will attribute to the incorrect process.
> >
> > It's about the timing that branch has taken or not taken :)
> >
> > If we think the branch sample as 'branch has taken', then current code
> > is doning right thing, otherwise, we need this fix.
> >
>
> If you diff the outputs side by side you can see it mainly has an effect
> where there is a discontinuity. At this point we set either the from or
> the to IPs to 0.
>
> For example here is a before and after perf script output. Without the
> change it looks like stress was running before it actually was. The
> schedule function that was attributed to ls on the first line hasn't
> finished running yet. But it's attributed to stress on the second line
> even though the destination IP is 0 meaning we don't even know where it
> went.

Yeah, this is a good improvement for me. Thanks for sharing the
detailed comparison result.

> Before:
>
> ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
> stress 8357 [006] ... schedule+0x84 => 0 [unknown]
> stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130
>
> After:
>
> ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
> ls 8357 [006] ... schedule+0x84 => 0 [unknown]
> stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130
>
> I didn't see any decode differences that weren't around these
> discontinuity points, so it seems like a low risk change.

[...]

> >> @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
> >> tidq->trace_chan_id = trace_chan_id;
> >> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
> >> queue->tid);
> >> + tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
> >>
> >> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
> >> if (!tidq->packet)
> >> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> >> tmp = tidq->packet;
> >> tidq->packet = tidq->prev_packet;
> >> tidq->prev_packet = tmp;
> >> + thread__put(tidq->prev_thread);
> >> + tidq->prev_thread = thread__get(tidq->thread);
> >
> > Maybe cs_etm__packet_swap() is not the best place to update
> > "tidq->prev_thread", since swapping packet doesn't mean it's necessarily
> > thread switching; can we move this change into the cs_etm__set_thread()?
> >
>
> Yeah that might make more sense. I can move it there if we decide to
> keep this change.

Please refine the patch for this. Thanks and sorry my late replying.

Leo

2023-06-06 18:16:42

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH 0/4] perf cs-etm: Track exception level

Em Tue, Jun 06, 2023 at 08:46:48AM +0800, Leo Yan escreveu:
> Hi Arnaldo,
>
> On Mon, Jun 05, 2023 at 04:44:45PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Wed, May 24, 2023 at 02:19:54PM +0100, James Clark escreveu:
> > > Some fixes to support an issue reported by Denis Nikitin where decoding
> > > trace that contains different EL1 and EL2 kernels can crash or go into
> > > an infinite loop because the wrong kernel maps are used for the decode.
> > >
> > > This still doesn't support distinguishing guest and host userspace,
> > > we'd still have to fix the timestamps and do a bit more work to
> > > correlate that. And I've removed PERF_RECORD_MISC_HYPERVISOR as a
> > > possible outcome of cs_etm__cpu_mode(). As far as I know this could
> > > never have been returned anyway because machine__is_host(machine) was
> > > always true due to session.machines.host being hard coded. And I'm not
> > > sure of the relevance of the difference between PERF_RECORD_MISC_KERNEL
> > > and PERF_RECORD_MISC_HYPERVISOR in this scenario.
> > >
> > > The first commit is a tidy up, second fixes a bug that I found when
> > > comparing the exception level and thread of branch records, the third
> > > is the main fix, and the last commit is some extra error checking.
> > >
> > > Applies to acme/perf-tools (4e111f0cf0)
> >
> > So there seems to be agreement the first two patches can be applied? May
> > I go ahead and do that now?
>
> Could you pick up the first patch in this series?
>
> I would like ask James to refine a bit for the second patch.

Ok, left just the first patch on my local perf-tools-next branch, will
go public when tested.

- Arnaldo

2023-06-07 09:26:00

[permalink] [raw]

Subject: Re: [PATCH 4/4] perf cs-etm: Add exception level consistency check

On 30/05/2023 11:40, Mike Leach wrote:
> Hi James,
>
> Further thought here - this consistency check may be redundant.
>
> You are setting the EL according to that in the traced context packet.
> This is correct and sufficient to match the EL that OpenCSD is
> expecting, if added to the security state from the same packet. The EL
> memory space values that OpenCSD provides in the memory callback are
> based on that same data from this context packet.
> Thus it is not necessary to use the EL in the memory callback. This is
> primarily used for matching when searching for a suitable memory
> source - especially when clients register files etc with the library,
> rather than provide a global callback as perf does.
>
> Regards
>

It was supposed to be defensive coding against some future refactor
where the Perf side tracking goes wrong. I think it makes sense to keep
it as it could be quite easy to make a mistake there and extra tests
can't be bad.

For ETMv3 I did have to make this change because OpenCSD was returning
the 'any' values so I made it skip the validation in that case:

if (!(mem_space == OCSD_MEM_SPACE_ANY ||
mem_space == OCSD_MEM_SPACE_N ||
mem_space == OCSD_MEM_SPACE_S)) {
if (mem_space & OCSD_MEM_SPACE_EL1N) {
/* Includes both non secure EL1 and EL0 */
assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
} else if (mem_space & OCSD_MEM_SPACE_EL2)
assert(tidq->el == ocsd_EL2);
else if (mem_space & OCSD_MEM_SPACE_EL3)
assert(tidq->el == ocsd_EL3);
}

> Mike
>
>
>
> On Tue, 30 May 2023 at 10:12, James Clark <[email protected]> wrote:
>>
>>
>>
>> On 25/05/2023 12:39, Mike Leach wrote:
>>> Hi James,
>>>
>>> My concern here is that for etmv3 trace, OpenCSD will only provide
>>> memory spaces as either secure or non-secure, The ETMv3 does not
>>> trace, and hence OpenCSD cannot provide the different ELs.
>>> The memory callback will be either OCSD_MEM_SPACE_S or OCSD_MEM_SPACE_N.
>>>
>>
>> As long as none of the bits are set for EL1-EL3 then no validation will
>> be done so it should be fine. But I will try to test ETMv3.
>>
>>> Can this patch - and the set handle this. (assuming perf supports our
>>> ETMv3 coresight kernel driver)
>>
>> For the whole set, the tracked EL will stay as ocsd_EL_unknown and
>> cs_etm__get_machine() will return host so the behavior will be unchanged
>> from before. This is assuming OpenCSD will set the EL to unknown in
>> elem->context.exception_level in this case.
>>
>>>
>>> Regards
>>>
>>> Mike
>>>
>>> On Wed, 24 May 2023 at 14:20, James Clark <[email protected]> wrote:
>>>>
>>>> Assert that our own tracking of the exception level matches what
>>>> OpenCSD provides. OpenCSD doesn't distinguish between EL0 and EL1 in the
>>>> memory access callback so the extra tracking was required. But a rough
>>>> assert can still be done.
>>>>
>>>> Signed-off-by: James Clark <[email protected]>
>>>> ---
>>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 6 +--
>>>> .../perf/util/cs-etm-decoder/cs-etm-decoder.h | 4 +-
>>>> tools/perf/util/cs-etm.c | 37 +++++++++++++------
>>>> 3 files changed, 32 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> index ac227cd03eb0..50b3c248d1e5 100644
>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>>> @@ -52,15 +52,15 @@ struct cs_etm_decoder {
>>>> static u32
>>>> cs_etm_decoder__mem_access(const void *context,
>>>> const ocsd_vaddr_t address,
>>>> - const ocsd_mem_space_acc_t mem_space __maybe_unused,
>>>> + const ocsd_mem_space_acc_t mem_space,
>>>> const u8 trace_chan_id,
>>>> const u32 req_size,
>>>> u8 *buffer)
>>>> {
>>>> struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
>>>>
>>>> - return decoder->mem_access(decoder->data, trace_chan_id,
>>>> - address, req_size, buffer);
>>>> + return decoder->mem_access(decoder->data, trace_chan_id, address,
>>>> + req_size, buffer, mem_space);
>>>> }
>>>>
>>>> int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder,
>>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> index 21d403f55d96..272c2efe78ee 100644
>>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
>>>> @@ -11,6 +11,7 @@
>>>> #define INCLUDE__CS_ETM_DECODER_H__
>>>>
>>>> #include <linux/types.h>
>>>> +#include <opencsd/ocsd_if_types.h>
>>>> #include <stdio.h>
>>>>
>>>> struct cs_etm_decoder;
>>>> @@ -19,7 +20,8 @@ struct cs_etm_packet_queue;
>>>>
>>>> struct cs_etm_queue;
>>>>
>>>> -typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *);
>>>> +typedef u32 (*cs_etm_mem_cb_type)(struct cs_etm_queue *, u8, u64, size_t, u8 *,
>>>> + const ocsd_mem_space_acc_t);
>>>>
>>>> struct cs_etmv3_trace_params {
>>>> u32 reg_ctrl;
>>>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>>>> index b9ba19327f26..ccf34ed8ddf2 100644
>>>> --- a/tools/perf/util/cs-etm.c
>>>> +++ b/tools/perf/util/cs-etm.c
>>>> @@ -931,7 +931,8 @@ static u8 cs_etm__cpu_mode(struct cs_etm_queue *etmq, u64 address,
>>>> }
>>>>
>>>> static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>>>> - u64 address, size_t size, u8 *buffer)
>>>> + u64 address, size_t size, u8 *buffer,
>>>> + const ocsd_mem_space_acc_t mem_space)
>>>> {
>>>> u8 cpumode;
>>>> u64 offset;
>>>> @@ -947,6 +948,20 @@ static u32 cs_etm__mem_access(struct cs_etm_queue *etmq, u8 trace_chan_id,
>>>> if (!tidq)
>>>> return 0;
>>>>
>>>> + /*
>>>> + * We've already tracked EL along side the PID in cs_etm__set_thread()
>>>> + * so double check that it matches what OpenCSD thinks as well. It
>>>> + * doesn't distinguish between EL0 and EL1 for this mem access callback
>>>> + * so we had to do the extra tracking.
>>>> + */
>>>> + if (mem_space & OCSD_MEM_SPACE_EL1N) {
>>>> + /* Includes both non secure EL1 and EL0 */
>>>> + assert(tidq->el == ocsd_EL1 || tidq->el == ocsd_EL0);
>>>> + } else if (mem_space & OCSD_MEM_SPACE_EL2)
>>>> + assert(tidq->el == ocsd_EL2);
>>>> + else if (mem_space & OCSD_MEM_SPACE_EL3)
>>>> + assert(tidq->el == ocsd_EL3);
>>>> +
>>>> cpumode = cs_etm__cpu_mode(etmq, address, tidq->el);
>>>>
>>>> if (!thread__find_map(tidq->thread, cpumode, address, &al))
>>>> @@ -1195,8 +1210,8 @@ static inline int cs_etm__t32_instr_size(struct cs_etm_queue *etmq,
>>>> {
>>>> u8 instrBytes[2];
>>>>
>>>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>>>> - ARRAY_SIZE(instrBytes), instrBytes);
>>>> + cs_etm__mem_access(etmq, trace_chan_id, addr, ARRAY_SIZE(instrBytes),
>>>> + instrBytes, 0);
>>>> /*
>>>> * T32 instruction size is indicated by bits[15:11] of the first
>>>> * 16-bit word of the instruction: 0b11101, 0b11110 and 0b11111
>>>> @@ -1387,8 +1402,8 @@ static void cs_etm__copy_insn(struct cs_etm_queue *etmq,
>>>> else
>>>> sample->insn_len = 4;
>>>>
>>>> - cs_etm__mem_access(etmq, trace_chan_id, sample->ip,
>>>> - sample->insn_len, (void *)sample->insn);
>>>> + cs_etm__mem_access(etmq, trace_chan_id, sample->ip, sample->insn_len,
>>>> + (void *)sample->insn, 0);
>>>> }
>>>>
>>>> u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
>>>> @@ -1940,8 +1955,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>>>> * so below only read 2 bytes as instruction size for T32.
>>>> */
>>>> addr = end_addr - 2;
>>>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>>>> - sizeof(instr16), (u8 *)&instr16);
>>>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr16),
>>>> + (u8 *)&instr16, 0);
>>>> if ((instr16 & 0xFF00) == 0xDF00)
>>>> return true;
>>>>
>>>> @@ -1956,8 +1971,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>>>> * +---------+---------+-------------------------+
>>>> */
>>>> addr = end_addr - 4;
>>>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>>>> - sizeof(instr32), (u8 *)&instr32);
>>>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
>>>> + (u8 *)&instr32, 0);
>>>> if ((instr32 & 0x0F000000) == 0x0F000000 &&
>>>> (instr32 & 0xF0000000) != 0xF0000000)
>>>> return true;
>>>> @@ -1973,8 +1988,8 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
>>>> * +-----------------------+---------+-----------+
>>>> */
>>>> addr = end_addr - 4;
>>>> - cs_etm__mem_access(etmq, trace_chan_id, addr,
>>>> - sizeof(instr32), (u8 *)&instr32);
>>>> + cs_etm__mem_access(etmq, trace_chan_id, addr, sizeof(instr32),
>>>> + (u8 *)&instr32, 0);
>>>> if ((instr32 & 0xFFE0001F) == 0xd4000001)
>>>> return true;
>>>>
>>>> --
>>>> 2.34.1
>>>>
>>>
>>>
>
>
>

2023-06-08 10:01:12

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On 30/05/2023 15:28, James Clark wrote:
>
>
> On 27/05/2023 10:06, Leo Yan wrote:
>> On Wed, May 24, 2023 at 02:19:56PM +0100, James Clark wrote:
>>> Branch samples currently use the IP of the previous packet as the from
>>> IP, and the IP of the current packet as the to IP. But it incorrectly
>>> uses the current thread. In some cases like a jump into a different
>>> exception level this will attribute to the incorrect process.
>>
>> It's about the timing that branch has taken or not taken :)
>>
>> If we think the branch sample as 'branch has taken', then current code
>> is doning right thing, otherwise, we need this fix.
>>
>
> If you diff the outputs side by side you can see it mainly has an effect
> where there is a discontinuity. At this point we set either the from or
> the to IPs to 0.
>
> For example here is a before and after perf script output. Without the
> change it looks like stress was running before it actually was. The
> schedule function that was attributed to ls on the first line hasn't
> finished running yet. But it's attributed to stress on the second line
> even though the destination IP is 0 meaning we don't even know where it
> went.
>
> Before:
>
> ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
> stress 8357 [006] ... schedule+0x84 => 0 [unknown]
> stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130
>
> After:
>
> ls 8350 [006] ... __schedule+0x394 => schedule+0x5c
> ls 8357 [006] ... schedule+0x84 => 0 [unknown]
> stress 8357 [006] ... 0 [unknown] => __unix_dgram_recvmsg+0x130
>
> I didn't see any decode differences that weren't around these
> discontinuity points, so it seems like a low risk change.
>
>>> Fix it by tracking the previous thread in the same way the previous
>>> packet is tracked.
>>>
>>> Signed-off-by: James Clark <[email protected]>
>>> ---
>>> tools/perf/util/cs-etm.c | 9 +++++++--
>>> 1 file changed, 7 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>>> index ebffc9052561..a997fe79d458 100644
>>> --- a/tools/perf/util/cs-etm.c
>>> +++ b/tools/perf/util/cs-etm.c
>>> @@ -86,6 +86,7 @@ struct cs_etm_traceid_queue {
>>> size_t last_branch_pos;
>>> union perf_event *event_buf;
>>> struct thread *thread;
>>> + struct thread *prev_thread;
>>> struct branch_stack *last_branch;
>>> struct branch_stack *last_branch_rb;
>>> struct cs_etm_packet *prev_packet;
>>> @@ -480,6 +481,7 @@ static int cs_etm__init_traceid_queue(struct cs_etm_queue *etmq,
>>> tidq->trace_chan_id = trace_chan_id;
>>> tidq->thread = machine__findnew_thread(&etm->session->machines.host, -1,
>>> queue->tid);
>>> + tidq->prev_thread = machine__idle_thread(&etm->session->machines.host);
>>>
>>> tidq->packet = zalloc(sizeof(struct cs_etm_packet));
>>> if (!tidq->packet)
>>> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
>>> tmp = tidq->packet;
>>> tidq->packet = tidq->prev_packet;
>>> tidq->prev_packet = tmp;
>>> + thread__put(tidq->prev_thread);
>>> + tidq->prev_thread = thread__get(tidq->thread);
>>
>> Maybe cs_etm__packet_swap() is not the best place to update
>> "tidq->prev_thread", since swapping packet doesn't mean it's necessarily
>> thread switching; can we move this change into the cs_etm__set_thread()?
>>
>
> Yeah that might make more sense. I can move it there if we decide to
> keep this change.
>

Unfortunately I don't think I can make this change. It seems like
putting the previous thread swap in cs_etm__set_thread() has different
semantics to keeping all the swaps together in cs_etm__packet_swap().

This is because if you swap the thread in cs_etm__packet_swap() the
previous packet and next packet can have the _same_ thread if there
happened to be no change. However if you only swap previous thread in
cs_etm__set_thread(), that means that the previous thread is always
different to the next one. This has a huge difference on the decoding
because two adjacent packets on the same thread will say they branched
from the previous thread that ran, not the previous thread on the
previous packet.

>> Thanks,
>> Leo
>>
>>> }
>>> }
>>>
>>> @@ -791,6 +795,7 @@ static void cs_etm__free_traceid_queues(struct cs_etm_queue *etmq)
>>> /* Free this traceid_queue from the array */
>>> tidq = etmq->traceid_queues[idx];
>>> thread__zput(tidq->thread);
>>> + thread__zput(tidq->prev_thread);
>>> zfree(&tidq->event_buf);
>>> zfree(&tidq->last_branch);
>>> zfree(&tidq->last_branch_rb);
>>> @@ -1450,8 +1455,8 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>>> sample.time = cs_etm__resolve_sample_time(etmq, tidq);
>>>
>>> sample.ip = ip;
>>> - sample.pid = tidq->thread->pid_;
>>> - sample.tid = tidq->thread->tid;
>>> + sample.pid = tidq->prev_thread->pid_;
>>> + sample.tid = tidq->prev_thread->tid;
>>> sample.addr = cs_etm__first_executed_instr(tidq->packet);
>>> sample.id = etmq->etm->branches_id;
>>> sample.stream_id = etmq->etm->branches_id;
>>> --
>>> 2.34.1
>>>

2023-06-08 10:40:37

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On Thu, Jun 08, 2023 at 10:34:42AM +0100, James Clark wrote:

[...]

> >>> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
> >>> tmp = tidq->packet;
> >>> tidq->packet = tidq->prev_packet;
> >>> tidq->prev_packet = tmp;
> >>> + thread__put(tidq->prev_thread);
> >>> + tidq->prev_thread = thread__get(tidq->thread);
> >>
> >> Maybe cs_etm__packet_swap() is not the best place to update
> >> "tidq->prev_thread", since swapping packet doesn't mean it's necessarily
> >> thread switching; can we move this change into the cs_etm__set_thread()?
> >>
> >
> > Yeah that might make more sense. I can move it there if we decide to
> > keep this change.
> >
>
> Unfortunately I don't think I can make this change. It seems like
> putting the previous thread swap in cs_etm__set_thread() has different
> semantics to keeping all the swaps together in cs_etm__packet_swap().

Thanks for trying this.

> This is because if you swap the thread in cs_etm__packet_swap() the
> previous packet and next packet can have the _same_ thread if there
> happened to be no change. However if you only swap previous thread in
> cs_etm__set_thread(), that means that the previous thread is always
> different to the next one. This has a huge difference on the decoding
> because two adjacent packets on the same thread will say they branched
> from the previous thread that ran, not the previous thread on the
> previous packet.

Seems to me, this is a synchronization issue between the field
'tidq->prev_thread' and 'tidq->prev_packet'.

It's still hard for me to understand "two adjacent packets on the same
thread will say they branched from the previous thread that ran", IIUC,
even we move thread swapping into cs_etm__set_thread(), if the two
adjacent packets are in the same thread context, we can skip to update
fields 'tidq->prev_thread' and 'tidq->prev_packet'.

So I am curious if below cs_etm__set_thread() works or not?

static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
struct cs_etm_traceid_queue *tidq, pid_t tid)
{
struct machine *machine = &etm->session->machines.host;

/* No context switching, bail out */
if ((tidq->thread->tid != tid)
return;

/* If tid is -1, we simply use idle thread context */
if (tid == -1)
goto find_idle_thread;

/*
* The new incoming tid is different from current thread,
* so it's to switch to the next thread context.
*/

/* Swap thread contexts */
thread__put(tidq->prev_thread);
tidq->prev_thread = thread__get(tidq->thread);

/* Find thread context for new tid */
thread__zput(tidq->thread);
tidq->thread = machine__find_thread(machine, -1, tid);

find_idle_thread:
/* Couldn't find a known thread */
if (!tidq->thread)
tidq->thread = machine__idle_thread(machine);
}

Thanks,
Leo

2023-06-08 10:53:10

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On Thu, Jun 08, 2023 at 06:25:55PM +0800, Leo Yan wrote:

[...]

> Seems to me, this is a synchronization issue between the field
> 'tidq->prev_thread' and 'tidq->prev_packet'.
>
> It's still hard for me to understand "two adjacent packets on the same
> thread will say they branched from the previous thread that ran", IIUC,
> even we move thread swapping into cs_etm__set_thread(), if the two
> adjacent packets are in the same thread context, we can skip to update
> fields 'tidq->prev_thread' and 'tidq->prev_packet'.

Sorry for typo, here should be:

... skip to update fields 'tidq->prev_thread' and 'tidq->thread'.

> So I am curious if below cs_etm__set_thread() works or not?
>
> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> struct cs_etm_traceid_queue *tidq, pid_t tid)
> {
> struct machine *machine = &etm->session->machines.host;
>
> /* No context switching, bail out */
> if ((tidq->thread->tid != tid)
> return;
>
> /* If tid is -1, we simply use idle thread context */
> if (tid == -1)
> goto find_idle_thread;
>
> /*
> * The new incoming tid is different from current thread,
> * so it's to switch to the next thread context.
> */
>
> /* Swap thread contexts */
> thread__put(tidq->prev_thread);
> tidq->prev_thread = thread__get(tidq->thread);
>
> /* Find thread context for new tid */
> thread__zput(tidq->thread);
> tidq->thread = machine__find_thread(machine, -1, tid);
>
> find_idle_thread:
> /* Couldn't find a known thread */
> if (!tidq->thread)
> tidq->thread = machine__idle_thread(machine);
> }
>
> Thanks,
> Leo

2023-06-09 11:08:01

[permalink] [raw]

Subject: Re: [PATCH 2/4] perf cs-etm: Use previous thread for branch sample source IP

On 08/06/2023 11:25, Leo Yan wrote:
> On Thu, Jun 08, 2023 at 10:34:42AM +0100, James Clark wrote:
>
> [...]
>
>>>>> @@ -616,6 +618,8 @@ static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
>>>>> tmp = tidq->packet;
>>>>> tidq->packet = tidq->prev_packet;
>>>>> tidq->prev_packet = tmp;
>>>>> + thread__put(tidq->prev_thread);
>>>>> + tidq->prev_thread = thread__get(tidq->thread);
>>>>
>>>> Maybe cs_etm__packet_swap() is not the best place to update
>>>> "tidq->prev_thread", since swapping packet doesn't mean it's necessarily
>>>> thread switching; can we move this change into the cs_etm__set_thread()?
>>>>
>>>
>>> Yeah that might make more sense. I can move it there if we decide to
>>> keep this change.
>>>
>>
>> Unfortunately I don't think I can make this change. It seems like
>> putting the previous thread swap in cs_etm__set_thread() has different
>> semantics to keeping all the swaps together in cs_etm__packet_swap().
>
> Thanks for trying this.
>
>> This is because if you swap the thread in cs_etm__packet_swap() the
>> previous packet and next packet can have the _same_ thread if there
>> happened to be no change. However if you only swap previous thread in
>> cs_etm__set_thread(), that means that the previous thread is always
>> different to the next one. This has a huge difference on the decoding
>> because two adjacent packets on the same thread will say they branched
>> from the previous thread that ran, not the previous thread on the
>> previous packet.
>
> Seems to me, this is a synchronization issue between the field
> 'tidq->prev_thread' and 'tidq->prev_packet'.
>
> It's still hard for me to understand "two adjacent packets on the same
> thread will say they branched from the previous thread that ran", IIUC,
> even we move thread swapping into cs_etm__set_thread(), if the two
> adjacent packets are in the same thread context, we can skip to update
> fields 'tidq->prev_thread' and 'tidq->prev_packet'.
>
> So I am curious if below cs_etm__set_thread() works or not?
>
> static void cs_etm__set_thread(struct cs_etm_auxtrace *etm,
> struct cs_etm_traceid_queue *tidq, pid_t tid)
> {
> struct machine *machine = &etm->session->machines.host;
>
> /* No context switching, bail out */
> if ((tidq->thread->tid != tid)
> return;
>
> /* If tid is -1, we simply use idle thread context */
> if (tid == -1)
> goto find_idle_thread;
>
> /*
> * The new incoming tid is different from current thread,
> * so it's to switch to the next thread context.
> */
>
> /* Swap thread contexts */
> thread__put(tidq->prev_thread);
> tidq->prev_thread = thread__get(tidq->thread);
>
> /* Find thread context for new tid */
> thread__zput(tidq->thread);
> tidq->thread = machine__find_thread(machine, -1, tid);
>
> find_idle_thread:
> /* Couldn't find a known thread */
> if (!tidq->thread)
> tidq->thread = machine__idle_thread(machine);
> }
>

I tried this change but I still don't think it's giving the right
results. Tracking previous thread in cs_etm__set_thread() changes the
semantics of being "the thread for the previous packet" to being "the
previous different thread of an unknown old packet". If you imagine the
packets and thread changes are like this (where <d> is a discontinuity
packet):

<--thread 1--> <--thread 2-------------------> <------thread 3-->
<d> <--packet 1--> <d> <--packet 2--> <packet 3--> <d> <--packet 4-->

Branches are generated using the last IP of the previous packet, and the
first IP of the next packet, ignoring everything in between as they are
just sequential instructions.

So assuming there are discontinuity packets between the thread switches
we should get:

thread 1 branches from packet 1 to 0x0
thread 2 branches from 0x0 to packet 2
thread 2 branches from packet 2 to packet 3
thread 2 branches from packet 3 to 0x0
thread 3 branches from 0x0 to packet 4

By tracking the previous thread for each packet, packet 2 and 3 stay in
thread 2.

If we track the previous thread instead, then both packet 2 and 3 would
continue to look like they branch from thread 1 like this:

thread ? branches from packet 1 to 0x0
thread 1 branches from 0x0 to packet 2
thread 1 branches from packet 2 to packet 3
thread 1 branches from packet 3 to 0x0
thread 2 branches from 0x0 to packet 4

Everything gets shifted back by 1 thread. I don't see the issue of
keeping all the swap stuff together in one place. Maybe there is an
issue with the naming of prev_thread? It's not really the previous
thread, it's the previous _packets_ thread. It might be the same thread
as the current one if there was no switch:

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index ca01109c3fc4..f3c73c86010a 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -86,8 +86,8 @@ struct cs_etm_traceid_queue {
size_t last_branch_pos;
union perf_event *event_buf;
struct thread *thread;
- struct thread *prev_thread;
- ocsd_ex_level prev_el;
+ struct thread *prev_packet_thread;
+ ocsd_ex_level prev_packet_el;
ocsd_ex_level el;
struct branch_stack *last_branch;

> Thanks,
> Leo

2023-06-10 01:35:30