Hello,
This is a mix of bug fixes and optimization in the data type profiling.
Firstly it now caches global variables and looks them up by address later.
This will be good for performance as well as improves the success rates
because some variables are defined in a separate file (compile unit) and
has no info in the call site for some reason.
Also it properly checks instructions that use more than one register for
a memory access like x86 SIB addressing. And check the type of stack
variables correctly and discard constant values (without type info).
Thanks,
Namhyung
Namhyung Kim (6):
perf dwarf-aux: Add die_collect_global_vars()
perf annotate-data: Collect global variables in advance
perf annotate-data: Handle direct global variable access
perf annotate-data: Check memory access with two registers
perf annotate-data: Handle multi regs in find_data_type_block()
perf annotate-data: Check kind of stack variables
tools/perf/util/annotate-data.c | 157 ++++++++++++++++++++++++++------
tools/perf/util/dwarf-aux.c | 62 +++++++++++++
tools/perf/util/dwarf-aux.h | 8 ++
3 files changed, 197 insertions(+), 30 deletions(-)
--
2.45.0.rc1.225.g2a3ae87e7f-goog
This function is to search all global variables in the CU. We want to
have the list of global variables at once and match them later.
Cc: Masami Hiramatsu <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/dwarf-aux.c | 62 +++++++++++++++++++++++++++++++++++++
tools/perf/util/dwarf-aux.h | 8 +++++
2 files changed, 70 insertions(+)
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 40cfbdfe2d75..c0a492e65388 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1630,6 +1630,68 @@ void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types)
die_find_child(sc_die, __die_collect_vars_cb, (void *)var_types, &die_mem);
}
+
+static int __die_collect_global_vars_cb(Dwarf_Die *die_mem, void *arg)
+{
+ struct die_var_type **var_types = arg;
+ Dwarf_Die type_die;
+ int tag = dwarf_tag(die_mem);
+ Dwarf_Attribute attr;
+ Dwarf_Addr base, start, end;
+ Dwarf_Op *ops;
+ size_t nops;
+ struct die_var_type *vt;
+
+ if (tag != DW_TAG_variable)
+ return DIE_FIND_CB_SIBLING;
+
+ if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL)
+ return DIE_FIND_CB_SIBLING;
+
+ /* Only collect the location with an absolute address. */
+ if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0)
+ return DIE_FIND_CB_SIBLING;
+
+ if (ops->atom != DW_OP_addr)
+ return DIE_FIND_CB_SIBLING;
+
+ if (!check_allowed_ops(ops, nops))
+ return DIE_FIND_CB_SIBLING;
+
+ if (die_get_real_type(die_mem, &type_die) == NULL)
+ return DIE_FIND_CB_SIBLING;
+
+ vt = malloc(sizeof(*vt));
+ if (vt == NULL)
+ return DIE_FIND_CB_END;
+
+ vt->die_off = dwarf_dieoffset(&type_die);
+ vt->addr = ops->number;
+ vt->reg = -1;
+ vt->offset = 0;
+ vt->next = *var_types;
+ *var_types = vt;
+
+ return DIE_FIND_CB_SIBLING;
+}
+
+/**
+ * die_collect_global_vars - Save all global variables
+ * @cu_die: a CU DIE
+ * @var_types: a pointer to save the resulting list
+ *
+ * Save all global variables in the @cu_die and save them to @var_types.
+ * The @var_types is a singly-linked list containing type and location info.
+ * Actual type can be retrieved using dwarf_offdie() with 'die_off' later.
+ *
+ * Callers should free @var_types.
+ */
+void die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types)
+{
+ Dwarf_Die die_mem;
+
+ die_find_child(cu_die, __die_collect_global_vars_cb, (void *)var_types, &die_mem);
+}
#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */
#ifdef HAVE_DWARF_CFI_SUPPORT
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index b0f25fbf9668..24446412b869 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -171,6 +171,9 @@ Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr,
/* Save all variables and parameters in this scope */
void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types);
+/* Save all global variables in this CU */
+void die_collect_global_vars(Dwarf_Die *cu_die, struct die_var_type **var_types);
+
#else /* HAVE_DWARF_GETLOCATIONS_SUPPORT */
static inline int die_get_var_range(Dwarf_Die *sp_die __maybe_unused,
@@ -203,6 +206,11 @@ static inline void die_collect_vars(Dwarf_Die *sc_die __maybe_unused,
{
}
+static inline void die_collect_global_vars(Dwarf_Die *cu_die __maybe_unused,
+ struct die_var_type **var_types __maybe_unused)
+{
+}
+
#endif /* HAVE_DWARF_GETLOCATIONS_SUPPORT */
#ifdef HAVE_DWARF_CFI_SUPPORT
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Like per-cpu base offset array, sometimes it accesses the global
variable directly using the offset. Allow this type of instructions as
long as it finds a global variable for the address.
movslq %edi, %rcx
mov -0x7dc94ae0(,%rcx,8), %rcx <<<--- here
As %rcx has a valid type (i.e. array index) from the first instruction,
it will be checked by the first case in check_matching_type(). But as
it's not a pointer type, the match will fail. But in this case, it
should check if it accesses the kernel global array variable.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate-data.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 4dd0911904f2..f1e52a531563 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1256,14 +1256,19 @@ static int check_matching_type(struct type_state *state,
if (state->regs[reg].ok && state->regs[reg].kind == TSR_KIND_TYPE) {
int tag = dwarf_tag(&state->regs[reg].type);
- pr_debug_dtp("\n");
-
/*
* Normal registers should hold a pointer (or array) to
* dereference a memory location.
*/
- if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type)
+ if (tag != DW_TAG_pointer_type && tag != DW_TAG_array_type) {
+ if (dloc->op->offset < 0 && reg != state->stack_reg)
+ goto check_kernel;
+
+ pr_debug_dtp("\n");
return -1;
+ }
+
+ pr_debug_dtp("\n");
/* Remove the pointer and get the target type */
if (die_get_real_type(&state->regs[reg].type, type_die) == NULL)
@@ -1376,12 +1381,14 @@ static int check_matching_type(struct type_state *state,
return -1;
}
- if (map__dso(dloc->ms->map)->kernel && arch__is(dloc->arch, "x86")) {
+check_kernel:
+ if (map__dso(dloc->ms->map)->kernel) {
u64 addr;
int offset;
/* Direct this-cpu access like "%gs:0x34740" */
- if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm) {
+ if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm &&
+ arch__is(dloc->arch, "x86")) {
pr_debug_dtp(" this-cpu var\n");
addr = dloc->op->offset;
@@ -1394,17 +1401,13 @@ static int check_matching_type(struct type_state *state,
return -1;
}
- /* Access to per-cpu base like "-0x7dcf0500(,%rdx,8)" */
+ /* Access to global variable like "-0x7dcf0500(,%rdx,8)" */
if (dloc->op->offset < 0 && reg != state->stack_reg) {
- const char *var_name = NULL;
-
addr = (s64) dloc->op->offset;
- if (get_global_var_info(dloc, addr, &var_name, &offset) &&
- !strcmp(var_name, "__per_cpu_offset") && offset == 0 &&
- get_global_var_type(cu_die, dloc, dloc->ip, addr,
+ if (get_global_var_type(cu_die, dloc, dloc->ip, addr,
&offset, type_die)) {
- pr_debug_dtp(" percpu base\n");
+ pr_debug_dtp(" global var\n");
dloc->type_offset = offset;
return 1;
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Currently it looks up global variables from the current CU using address
and name. But it sometimes fails to find a variable as the variable can
come from a different CU - but it's still strange it failed to find a
declaration for some reason.
Anyway, it can collect all global variables from all CU once and then
lookup them later on. This slightly improves the success rate of my
test data set.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate-data.c | 57 +++++++++++++++++++++++++++++++--
1 file changed, 55 insertions(+), 2 deletions(-)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 12d5faff3b7a..4dd0911904f2 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -28,6 +28,8 @@
/* register number of the stack pointer */
#define X86_REG_SP 7
+static void delete_var_types(struct die_var_type *var_types);
+
enum type_state_kind {
TSR_KIND_INVALID = 0,
TSR_KIND_TYPE,
@@ -557,8 +559,8 @@ static bool global_var__add(struct data_loc_info *dloc, u64 addr,
if (gvar == NULL)
return false;
- gvar->name = strdup(name);
- if (gvar->name == NULL) {
+ gvar->name = name ? strdup(name) : NULL;
+ if (name && gvar->name == NULL) {
free(gvar);
return false;
}
@@ -612,6 +614,53 @@ static bool get_global_var_info(struct data_loc_info *dloc, u64 addr,
return true;
}
+static void global_var__collect(struct data_loc_info *dloc)
+{
+ Dwarf *dwarf = dloc->di->dbg;
+ Dwarf_Off off, next_off;
+ Dwarf_Die cu_die, type_die;
+ size_t header_size;
+
+ /* Iterate all CU and collect global variables that have no location in a register. */
+ off = 0;
+ while (dwarf_nextcu(dwarf, off, &next_off, &header_size,
+ NULL, NULL, NULL) == 0) {
+ struct die_var_type *var_types = NULL;
+ struct die_var_type *pos;
+
+ if (dwarf_offdie(dwarf, off + header_size, &cu_die) == NULL) {
+ off = next_off;
+ continue;
+ }
+
+ die_collect_global_vars(&cu_die, &var_types);
+
+ for (pos = var_types; pos; pos = pos->next) {
+ const char *var_name = NULL;
+ int var_offset = 0;
+
+ if (pos->reg != -1)
+ continue;
+
+ if (!dwarf_offdie(dwarf, pos->die_off, &type_die))
+ continue;
+
+ if (!get_global_var_info(dloc, pos->addr, &var_name,
+ &var_offset))
+ continue;
+
+ if (var_offset != 0)
+ continue;
+
+ global_var__add(dloc, pos->addr, var_name, &type_die);
+ }
+
+ delete_var_types(var_types);
+
+ off = next_off;
+ }
+}
+
static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc,
u64 ip, u64 var_addr, int *var_offset,
Dwarf_Die *type_die)
@@ -620,8 +669,12 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc,
int offset;
const char *var_name = NULL;
struct global_var_entry *gvar;
+ struct dso *dso = map__dso(dloc->ms->map);
Dwarf_Die var_die;
+ if (RB_EMPTY_ROOT(&dso->global_vars))
+ global_var__collect(dloc);
+
gvar = global_var__find(dloc, var_addr);
if (gvar) {
if (!dwarf_offdie(dloc->di->dbg, gvar->die_offset, type_die))
--
2.45.0.rc1.225.g2a3ae87e7f-goog
The following instruction pattern is used to access a global variable.
mov $0x231c0, %rax
movsql %edi, %rcx
mov -0x7dc94ae0(,%rcx,8), %rcx
cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
The first instruction set the address of the per-cpu variable (here, it
is 'runqueus' of struct rq). The second instruction seems like a cpu
number of the per-cpu base. The third instruction get the base offset
of per-cpu area for that cpu. The last instruction compares the value
of the per-cpu variable at the offset of 0xa60.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index f1e52a531563..245e3ef3e2ff 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
else if (has_reg_type(state, sreg) &&
state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
u64 ip = dloc->ms->sym->start + dl->al.offset;
+ u64 var_addr = src->offset;
int offset;
+ if (src->multi_regs) {
+ int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
+
+ if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+ state->regs[reg2].kind == TSR_KIND_CONST)
+ var_addr += state->regs[reg2].imm_value;
+ }
+
/*
* In kernel, %gs points to a per-cpu region for the
* current CPU. Access with a constant offset should
* be treated as a global variable access.
*/
- if (get_global_var_type(cu_die, dloc, ip, src->offset,
+ if (get_global_var_type(cu_die, dloc, ip, var_addr,
&offset, &type_die) &&
die_get_member_type(&type_die, offset, &type_die)) {
tsr->type = type_die;
tsr->kind = TSR_KIND_TYPE;
tsr->ok = true;
- pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
- insn_offset, src->offset, sreg, dst->reg1);
+ if (src->multi_regs) {
+ pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
+ insn_offset, src->offset, src->reg1,
+ src->reg2, dst->reg1);
+ } else {
+ pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
+ insn_offset, src->offset, sreg, dst->reg1);
+ }
pr_debug_type_name(&tsr->type, tsr->kind);
} else {
tsr->ok = false;
@@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
pr_debug_dtp(" percpu var\n");
+ if (dloc->op->multi_regs) {
+ int reg2 = dloc->op->reg2;
+
+ if (dloc->op->reg2 == reg)
+ reg2 = dloc->op->reg1;
+
+ if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+ state->regs[reg2].kind == TSR_KIND_CONST)
+ var_addr += state->regs[reg2].imm_value;
+ }
+
if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
&var_offset, type_die)) {
dloc->type_offset = var_offset;
@@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
cu_die, type_die);
if (found > 0) {
- pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
- dloc->op->offset, reg, dloc->type_offset);
+ char buf[64];
+
+ if (dloc->op->multi_regs)
+ snprintf(buf, sizeof(buf), "reg%d, reg%d",
+ dloc->op->reg1, dloc->op->reg2);
+ else
+ snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
+
+ pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
+ dloc->op->offset, buf, dloc->type_offset);
pr_debug_type_name(type_die, TSR_KIND_TYPE);
ret = 0;
break;
--
2.45.0.rc1.225.g2a3ae87e7f-goog
The instruction tracking should be the same for the both registers.
Just do it once and compare the result with multi regs as with the
previous patches. Then we don't need to call find_data_type_block()
separately for each reg. Let's remove the 'reg' argument from the
relevant functions.
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate-data.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 245e3ef3e2ff..68fe7999f033 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1258,11 +1258,12 @@ static void setup_stack_canary(struct data_loc_info *dloc)
* are similar to global variables and no additional info is needed.
*/
static int check_matching_type(struct type_state *state,
- struct data_loc_info *dloc, int reg,
+ struct data_loc_info *dloc,
Dwarf_Die *cu_die, Dwarf_Die *type_die)
{
Dwarf_Word size;
u32 insn_offset = dloc->ip - dloc->ms->sym->start;
+ int reg = dloc->op->reg1;
pr_debug_dtp("chk [%x] reg%d offset=%#x ok=%d kind=%d",
insn_offset, reg, dloc->op->offset,
@@ -1448,7 +1449,7 @@ static int check_matching_type(struct type_state *state,
}
/* Iterate instructions in basic blocks and update type table */
-static int find_data_type_insn(struct data_loc_info *dloc, int reg,
+static int find_data_type_insn(struct data_loc_info *dloc,
struct list_head *basic_blocks,
struct die_var_type *var_types,
Dwarf_Die *cu_die, Dwarf_Die *type_die)
@@ -1481,7 +1482,7 @@ static int find_data_type_insn(struct data_loc_info *dloc, int reg,
update_var_state(&state, dloc, addr, dl->al.offset, var_types);
if (this_ip == dloc->ip) {
- ret = check_matching_type(&state, dloc, reg,
+ ret = check_matching_type(&state, dloc,
cu_die, type_die);
goto out;
}
@@ -1502,7 +1503,7 @@ static int find_data_type_insn(struct data_loc_info *dloc, int reg,
* Construct a list of basic blocks for each scope with variables and try to find
* the data type by updating a type state table through instructions.
*/
-static int find_data_type_block(struct data_loc_info *dloc, int reg,
+static int find_data_type_block(struct data_loc_info *dloc,
Dwarf_Die *cu_die, Dwarf_Die *scopes,
int nr_scopes, Dwarf_Die *type_die)
{
@@ -1550,7 +1551,7 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
fixup_var_address(var_types, start);
/* Find from start of this scope to the target instruction */
- found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
+ found = find_data_type_insn(dloc, &basic_blocks, var_types,
cu_die, type_die);
if (found > 0) {
char buf[64];
@@ -1716,8 +1717,13 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die)
goto out;
}
+ if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) {
+ reg = loc->reg2;
+ goto retry;
+ }
+
if (reg != DWARF_REG_PC) {
- ret = find_data_type_block(dloc, reg, &cu_die, scopes,
+ ret = find_data_type_block(dloc, &cu_die, scopes,
nr_scopes, type_die);
if (ret == 0) {
ann_data_stat.insn_track++;
@@ -1725,11 +1731,6 @@ static int find_data_type_die(struct data_loc_info *dloc, Dwarf_Die *type_die)
}
}
- if (loc->multi_regs && reg == loc->reg1 && loc->reg1 != loc->reg2) {
- reg = loc->reg2;
- goto retry;
- }
-
if (ret < 0) {
pr_debug_dtp("no variable found\n");
ann_data_stat.no_var++;
--
2.45.0.rc1.225.g2a3ae87e7f-goog
I sometimes see ("unknown type") in the result and it was because it
didn't check the type of stack variables properly during the instruction
tracking. The stack can carry constant values (without type info) and
if the target instruction is accessing the stack location, it resulted
in the "unknown type".
Maybe we could pick one of integer types for the constant, but it
doesn't really mean anything useful. Let's just drop the stack slot if
it doesn't have a valid type info.
Here's an example how it got the unknown type.
Note that 0xffffff48 = -0xb8.
-----------------------------------------------------------
find data type for 0xffffff48(reg6) at ...
CU for ...
frame base: cfa=0 fbreg=6
scope: [2/2] (die:11cb97f)
bb: [37 - 3a]
var [37] reg15 type='int' size=0x4 (die:0x1180633)
bb: [40 - 4b]
mov [40] imm=0x1 -> reg13
var [45] reg8 type='sigset_t*' size=0x8 (die:0x11a39ee)
mov [45] imm=0x1 -> reg2 <--- here reg2 has a constant
bb: [215 - 237]
mov [218] reg2 -> -0xb8(stack) constant <--- and save it to the stack
mov [225] reg13 -> -0xc4(stack) constant
call [22f] find_task_by_vgpid
call [22f] return -> reg0 type='struct task_struct*' size=0x8 (die:0x11881e8)
bb: [5c8 - 5cf]
bb: [2fb - 302]
mov [2fb] -0xc4(stack) -> reg13 constant
bb: [13b - 14d]
mov [143] 0xd50(reg3) -> reg5 type='struct task_struct*' size=0x8 (die:0xa31f3c)
bb: [153 - 153]
chk [153] reg6 offset=0xffffff48 ok=0 kind=0 fbreg <--- access here
found by insn track: 0xffffff48(reg6) type-offset=0
type='G<EF>^K<F6><AF>U' size=0 (die:0xffffffffffffffff)
Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/annotate-data.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 68fe7999f033..2c98813f95cd 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1314,6 +1314,9 @@ static int check_matching_type(struct type_state *state,
return -1;
}
+ if (stack->kind != TSR_KIND_TYPE)
+ return 0;
+
*type_die = stack->type;
/* Update the type offset from the start of slot */
dloc->type_offset -= stack->offset;
@@ -1343,6 +1346,9 @@ static int check_matching_type(struct type_state *state,
return -1;
}
+ if (stack->kind != TSR_KIND_TYPE)
+ return 0;
+
*type_die = stack->type;
/* Update the type offset from the start of slot */
dloc->type_offset -= fboff + stack->offset;
--
2.45.0.rc1.225.g2a3ae87e7f-goog
On Wed, May 01, 2024 at 11:00:07PM -0700, Namhyung Kim wrote:
> Currently it looks up global variables from the current CU using address
> and name. But it sometimes fails to find a variable as the variable can
> come from a different CU - but it's still strange it failed to find a
> declaration for some reason.
>
> Anyway, it can collect all global variables from all CU once and then
> lookup them later on. This slightly improves the success rate of my
> test data set.
It would be interesting you could provide examples from your test data
set, i.e. after this patch these extra global variables were considered
in the test results, with some tool output, etc.
This would help intersested parties to reproduce your results,
validate the end result, etc.
- Arnaldo
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> tools/perf/util/annotate-data.c | 57 +++++++++++++++++++++++++++++++--
> 1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
> index 12d5faff3b7a..4dd0911904f2 100644
> --- a/tools/perf/util/annotate-data.c
> +++ b/tools/perf/util/annotate-data.c
> @@ -28,6 +28,8 @@
> /* register number of the stack pointer */
> #define X86_REG_SP 7
>
> +static void delete_var_types(struct die_var_type *var_types);
> +
> enum type_state_kind {
> TSR_KIND_INVALID = 0,
> TSR_KIND_TYPE,
> @@ -557,8 +559,8 @@ static bool global_var__add(struct data_loc_info *dloc, u64 addr,
> if (gvar == NULL)
> return false;
>
> - gvar->name = strdup(name);
> - if (gvar->name == NULL) {
> + gvar->name = name ? strdup(name) : NULL;
> + if (name && gvar->name == NULL) {
> free(gvar);
> return false;
> }
> @@ -612,6 +614,53 @@ static bool get_global_var_info(struct data_loc_info *dloc, u64 addr,
> return true;
> }
>
> +static void global_var__collect(struct data_loc_info *dloc)
> +{
> + Dwarf *dwarf = dloc->di->dbg;
> + Dwarf_Off off, next_off;
> + Dwarf_Die cu_die, type_die;
> + size_t header_size;
> +
> + /* Iterate all CU and collect global variables that have no location in a register. */
> + off = 0;
> + while (dwarf_nextcu(dwarf, off, &next_off, &header_size,
> + NULL, NULL, NULL) == 0) {
> + struct die_var_type *var_types = NULL;
> + struct die_var_type *pos;
> +
> + if (dwarf_offdie(dwarf, off + header_size, &cu_die) == NULL) {
> + off = next_off;
> + continue;
> + }
> +
> + die_collect_global_vars(&cu_die, &var_types);
> +
> + for (pos = var_types; pos; pos = pos->next) {
> + const char *var_name = NULL;
> + int var_offset = 0;
> +
> + if (pos->reg != -1)
> + continue;
> +
> + if (!dwarf_offdie(dwarf, pos->die_off, &type_die))
> + continue;
> +
> + if (!get_global_var_info(dloc, pos->addr, &var_name,
> + &var_offset))
> + continue;
> +
> + if (var_offset != 0)
> + continue;
> +
> + global_var__add(dloc, pos->addr, var_name, &type_die);
> + }
> +
> + delete_var_types(var_types);
> +
> + off = next_off;
> + }
> +}
> +
> static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc,
> u64 ip, u64 var_addr, int *var_offset,
> Dwarf_Die *type_die)
> @@ -620,8 +669,12 @@ static bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc,
> int offset;
> const char *var_name = NULL;
> struct global_var_entry *gvar;
> + struct dso *dso = map__dso(dloc->ms->map);
> Dwarf_Die var_die;
>
> + if (RB_EMPTY_ROOT(&dso->global_vars))
> + global_var__collect(dloc);
> +
> gvar = global_var__find(dloc, var_addr);
> if (gvar) {
> if (!dwarf_offdie(dloc->di->dbg, gvar->die_offset, type_die))
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> The following instruction pattern is used to access a global variable.
>
> mov $0x231c0, %rax
> movsql %edi, %rcx
> mov -0x7dc94ae0(,%rcx,8), %rcx
> cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
>
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueus' of struct rq). The second instruction seems like a cpu
You mean 'runqueues', i.e. this one:
kernel/sched/core.c
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
?
But that 0xa60 would be in an alignment hole, at least in:
$ pahole --hex rq | egrep 0xa40 -A12
struct mm_struct * prev_mm; /* 0xa40 0x8 */
unsigned int clock_update_flags; /* 0xa48 0x4 */
/* XXX 4 bytes hole, try to pack */
u64 clock; /* 0xa50 0x8 */
/* XXX 40 bytes hole, try to pack */
/* --- cacheline 42 boundary (2688 bytes) --- */
u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
u64 clock_pelt; /* 0xa88 0x8 */
long unsigned int lost_idle_time; /* 0xa90 0x8 */
$ uname -a
Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
$
The paragraph then reads:
----
The first instruction set the address of the per-cpu variable (here, it
is 'runqueues' of type 'struct rq'). The second instruction seems like
a cpu number of the per-cpu base. The third instruction get the base
offset of per-cpu area for that cpu. The last instruction compares the
value of the per-cpu variable at the offset of 0xa60.
----
Ok?
> number of the per-cpu base. The third instruction get the base offset
> of per-cpu area for that cpu. The last instruction compares the value
> of the per-cpu variable at the offset of 0xa60.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
> 1 file changed, 39 insertions(+), 5 deletions(-)
>
> diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
> index f1e52a531563..245e3ef3e2ff 100644
> --- a/tools/perf/util/annotate-data.c
> +++ b/tools/perf/util/annotate-data.c
> @@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
> else if (has_reg_type(state, sreg) &&
> state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
> u64 ip = dloc->ms->sym->start + dl->al.offset;
> + u64 var_addr = src->offset;
> int offset;
>
> + if (src->multi_regs) {
> + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
> +
> + if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> + state->regs[reg2].kind == TSR_KIND_CONST)
> + var_addr += state->regs[reg2].imm_value;
> + }
> +
> /*
> * In kernel, %gs points to a per-cpu region for the
> * current CPU. Access with a constant offset should
> * be treated as a global variable access.
> */
> - if (get_global_var_type(cu_die, dloc, ip, src->offset,
> + if (get_global_var_type(cu_die, dloc, ip, var_addr,
> &offset, &type_die) &&
> die_get_member_type(&type_die, offset, &type_die)) {
> tsr->type = type_die;
> tsr->kind = TSR_KIND_TYPE;
> tsr->ok = true;
>
> - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> - insn_offset, src->offset, sreg, dst->reg1);
> + if (src->multi_regs) {
> + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
> + insn_offset, src->offset, src->reg1,
> + src->reg2, dst->reg1);
> + } else {
> + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> + insn_offset, src->offset, sreg, dst->reg1);
> + }
> pr_debug_type_name(&tsr->type, tsr->kind);
> } else {
> tsr->ok = false;
> @@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
>
> pr_debug_dtp(" percpu var\n");
>
> + if (dloc->op->multi_regs) {
> + int reg2 = dloc->op->reg2;
> +
> + if (dloc->op->reg2 == reg)
> + reg2 = dloc->op->reg1;
> +
> + if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> + state->regs[reg2].kind == TSR_KIND_CONST)
> + var_addr += state->regs[reg2].imm_value;
> + }
> +
> if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
> &var_offset, type_die)) {
> dloc->type_offset = var_offset;
> @@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
> found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
> cu_die, type_die);
> if (found > 0) {
> - pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
> - dloc->op->offset, reg, dloc->type_offset);
> + char buf[64];
> +
> + if (dloc->op->multi_regs)
> + snprintf(buf, sizeof(buf), "reg%d, reg%d",
> + dloc->op->reg1, dloc->op->reg2);
> + else
> + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
> +
> + pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
> + dloc->op->offset, buf, dloc->type_offset);
> pr_debug_type_name(type_die, TSR_KIND_TYPE);
> ret = 0;
> break;
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
On Wed, May 01, 2024 at 11:00:05PM -0700, Namhyung Kim wrote:
> Hello,
>
> This is a mix of bug fixes and optimization in the data type profiling.
>
> Firstly it now caches global variables and looks them up by address later.
> This will be good for performance as well as improves the success rates
> because some variables are defined in a separate file (compile unit) and
> has no info in the call site for some reason.
>
> Also it properly checks instructions that use more than one register for
> a memory access like x86 SIB addressing. And check the type of stack
> variables correctly and discard constant values (without type info).
Applied locally, doing build tests.
- Arnaldo
On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > The following instruction pattern is used to access a global variable.
> >
> > mov $0x231c0, %rax
> > movsql %edi, %rcx
> > mov -0x7dc94ae0(,%rcx,8), %rcx
> > cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
> >
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueus' of struct rq). The second instruction seems like a cpu
>
> You mean 'runqueues', i.e. this one:
>
> kernel/sched/core.c
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>
> ?
Right, sorry for the typo.
>
> But that 0xa60 would be in an alignment hole, at least in:
>
> $ pahole --hex rq | egrep 0xa40 -A12
> struct mm_struct * prev_mm; /* 0xa40 0x8 */
> unsigned int clock_update_flags; /* 0xa48 0x4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> u64 clock; /* 0xa50 0x8 */
>
> /* XXX 40 bytes hole, try to pack */
>
> /* --- cacheline 42 boundary (2688 bytes) --- */
> u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
> u64 clock_pelt; /* 0xa88 0x8 */
> long unsigned int lost_idle_time; /* 0xa90 0x8 */
> $ uname -a
> Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> $
This would be different on kernel version, config and
other changes like backports or local modifications.
On my system, it was cpu_stop_work.arg.
$ pahole --hex rq | grep 0xa40 -C1
/* --- cacheline 41 boundary (2624 bytes) --- */
struct cpu_stop_work active_balance_work; /* 0xa40 0x30 */
int cpu; /* 0xa70 0x4 */
$ pahole --hex cpu_stop_work
struct cpu_stop_work {
struct list_head list; /* 0 0x10 */
cpu_stop_fn_t fn; /* 0x10 0x8 */
long unsigned int caller; /* 0x18 0x8 */
void * arg; /* 0x20 0x8 */
struct cpu_stop_done * done; /* 0x28 0x8 */
/* size: 48, cachelines: 1, members: 5 */
/* last cacheline: 48 bytes */
};
>
> The paragraph then reads:
>
> ----
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueues' of type 'struct rq'). The second instruction seems like
> a cpu number of the per-cpu base. The third instruction get the base
> offset of per-cpu area for that cpu. The last instruction compares the
> value of the per-cpu variable at the offset of 0xa60.
> ----
>
> Ok?
Yep, looks good.
Thanks,
Namhyung
On Thu, May 2, 2024 at 6:50 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
>
> On Wed, May 01, 2024 at 11:00:07PM -0700, Namhyung Kim wrote:
> > Currently it looks up global variables from the current CU using address
> > and name. But it sometimes fails to find a variable as the variable can
> > come from a different CU - but it's still strange it failed to find a
> > declaration for some reason.
> >
> > Anyway, it can collect all global variables from all CU once and then
> > lookup them later on. This slightly improves the success rate of my
> > test data set.
>
> It would be interesting you could provide examples from your test data
> set, i.e. after this patch these extra global variables were considered
> in the test results, with some tool output, etc.
>
> This would help intersested parties to reproduce your results,
> validate the end result, etc.
Hmm.. ok. I'll try to find public data that I can share.
But it's just a perf data file recorded with perf mem.
Thanks,
Namhyung
On Thu, May 2, 2024 at 11:23 AM Namhyung Kim <[email protected]> wrote:
>
> On Thu, May 2, 2024 at 6:50 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
> >
> > On Wed, May 01, 2024 at 11:00:07PM -0700, Namhyung Kim wrote:
> > > Currently it looks up global variables from the current CU using address
> > > and name. But it sometimes fails to find a variable as the variable can
> > > come from a different CU - but it's still strange it failed to find a
> > > declaration for some reason.
> > >
> > > Anyway, it can collect all global variables from all CU once and then
> > > lookup them later on. This slightly improves the success rate of my
> > > test data set.
> >
> > It would be interesting you could provide examples from your test data
> > set, i.e. after this patch these extra global variables were considered
> > in the test results, with some tool output, etc.
Here's the example:
Before
-----------------------------------------------------------
find data type for 0x6014(PC) at entry_SYSCALL_64+0x7
CU for arch/x86/entry/entry_64.S (die:0x85e1d)
no variable found
After
-----------------------------------------------------------
find data type for 0x6014(PC) at entry_SYSCALL_64+0x7
CU for arch/x86/entry/entry_64.S (die:0x85e1d)
found by addr=0x6014 type_offset=0x14
type='struct tss_struct' size=0x5000 (die:0x74dbe1)
It was asm code so it doesn't have the type info in the CU.
With this change it can see the type (from a different CU).
It seems we have a per-cpu variable called cpu_tss_rw at
address 0x6000.
$ nm vmlinux | grep 6000 | grep tss
0000000000006000 D cpu_tss_rw
Thanks,
Namhyung
On Thu, May 02, 2024 at 11:14:50AM -0700, Namhyung Kim wrote:
> On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
> >
> > On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > > The following instruction pattern is used to access a global variable.
> > >
> > > mov $0x231c0, %rax
> > > movsql %edi, %rcx
> > > mov -0x7dc94ae0(,%rcx,8), %rcx
> > > cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
> > >
> > > The first instruction set the address of the per-cpu variable (here, it
> > > is 'runqueus' of struct rq). The second instruction seems like a cpu
> >
> > You mean 'runqueues', i.e. this one:
> >
> > kernel/sched/core.c
> > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> >
> > ?
>
> Right, sorry for the typo.
>
> >
> > But that 0xa60 would be in an alignment hole, at least in:
> >
> > $ pahole --hex rq | egrep 0xa40 -A12
> > struct mm_struct * prev_mm; /* 0xa40 0x8 */
> > unsigned int clock_update_flags; /* 0xa48 0x4 */
> >
> > /* XXX 4 bytes hole, try to pack */
> >
> > u64 clock; /* 0xa50 0x8 */
> >
> > /* XXX 40 bytes hole, try to pack */
> >
> > /* --- cacheline 42 boundary (2688 bytes) --- */
> > u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
> > u64 clock_pelt; /* 0xa88 0x8 */
> > long unsigned int lost_idle_time; /* 0xa90 0x8 */
> > $ uname -a
> > Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> > $
>
> This would be different on kernel version, config and
> other changes like backports or local modifications.
>
> On my system, it was cpu_stop_work.arg.
Sure, so please include the pahole output for the data that lead you to
the conclusions in the explanation for the results obtained, so that we
can have a better mental map of all the pieces and thus get convinced of
the results and have a way to try to reproduce it in our systems.
In the future we will be grateful to this effort when looking back at
these patches :-)
Thanks for all your work in these features!
- Arnaldo
> $ pahole --hex rq | grep 0xa40 -C1
> /* --- cacheline 41 boundary (2624 bytes) --- */
> struct cpu_stop_work active_balance_work; /* 0xa40 0x30 */
> int cpu; /* 0xa70 0x4 */
>
> $ pahole --hex cpu_stop_work
> struct cpu_stop_work {
> struct list_head list; /* 0 0x10 */
> cpu_stop_fn_t fn; /* 0x10 0x8 */
> long unsigned int caller; /* 0x18 0x8 */
> void * arg; /* 0x20 0x8 */
> struct cpu_stop_done * done; /* 0x28 0x8 */
>
> /* size: 48, cachelines: 1, members: 5 */
> /* last cacheline: 48 bytes */
> };
>
>
> >
> > The paragraph then reads:
> >
> > ----
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueues' of type 'struct rq'). The second instruction seems like
> > a cpu number of the per-cpu base. The third instruction get the base
> > offset of per-cpu area for that cpu. The last instruction compares the
> > value of the per-cpu variable at the offset of 0xa60.
> > ----
> >
> > Ok?
>
> Yep, looks good.
>
> Thanks,
> Namhyung