2009-12-30 03:17:22

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 0/3] perf_event: fix getting symbol error if kernel is relocatable

Hi,

If the kernel is relocatable, perf tools can't get symbol
name correctly, See: http://lkml.org/lkml/2009/12/20/218

The purpose of this patchset is to fix this bug, and it base
on my previously patchset: http://lkml.org/lkml/2009/12/29/4
since it used 'inject' event

arch/x86/boot/compressed/head_32.S | 2 ++
arch/x86/boot/compressed/head_64.S | 3 +++
arch/x86/include/asm/bootparam.h | 3 ++-
arch/x86/kernel/asm-offsets_32.c | 1 +
arch/x86/kernel/asm-offsets_64.c | 1 +
arch/x86/kernel/cpu/perf_event.c | 10 ++++++++++
include/linux/perf_event.h | 1 +
kernel/perf_event.c | 23 +++++++++++++++++++++--
tools/perf/builtin-record.c | 3 +++
tools/perf/util/session.c | 6 ++++++
tools/perf/util/symbol.c | 13 +++++++++++++
tools/perf/util/symbol.h | 3 ++-
12 files changed, 65 insertions(+), 4 deletions(-)


2009-12-30 03:18:51

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 1/3] x86: record relocation offset

Record relocation offset, perf tools will use it
to adjust kernel symbol address

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/boot/compressed/head_32.S | 2 ++
arch/x86/boot/compressed/head_64.S | 3 +++
arch/x86/include/asm/bootparam.h | 3 ++-
arch/x86/kernel/asm-offsets_32.c | 1 +
arch/x86/kernel/asm-offsets_64.c | 1 +
arch/x86/kernel/cpu/perf_event.c | 4 ++++
6 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f543b70..dc9748a 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -151,6 +151,8 @@ relocated:
movl %ebp, %ebx
subl $LOAD_PHYSICAL_ADDR, %ebx
jz 2f /* Nothing to be done if loaded at compiled addr. */
+
+ movl %ebx, BP_relocate_offset(%esi)
/*
* Process relocations.
*/
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index faff0dc..8170f32 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -90,6 +90,9 @@ ENTRY(startup_32)
addl %eax, %ebx
notl %eax
andl %eax, %ebx
+ movl %ebx, %eax
+ subl $LOAD_PHYSICAL_ADDR, %eax
+ movl %eax, BP_relocate_offset(%esi)
#else
movl $LOAD_PHYSICAL_ADDR, %ebx
#endif
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 6be33d8..80b8d1f 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -88,7 +88,8 @@ struct boot_params {
__u8 _pad2[4]; /* 0x054 */
__u64 tboot_addr; /* 0x058 */
struct ist_info ist_info; /* 0x060 */
- __u8 _pad3[16]; /* 0x070 */
+ __s32 relocate_offset; /* 0x070 */
+ __u8 _pad3[12]; /* 0x074 */
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
struct sys_desc_table sys_desc_table; /* 0x0a0 */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf64..8028e3b 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -148,4 +148,5 @@ void foo(void)
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_relocate_offset, boot_params, relocate_offset);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeed..fbfa5f3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -127,6 +127,7 @@ int main(void)
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_relocate_offset, boot_params, relocate_offset);

BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c223b7e..11d2a7d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -26,8 +26,10 @@
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
+#include <asm/setup.h>

static u64 perf_event_mask __read_mostly;
+static s32 relocate_offset;

/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 4
@@ -2173,6 +2175,8 @@ void __init init_hw_perf_events(void)

pr_info("Performance Events: ");

+ relocate_offset = boot_params.relocate_offset;
+
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
err = intel_pmu_init();
--
1.6.1.2

2009-12-30 03:19:44

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 2/3] perf_event: support getting relocation offset

Support getting relocation offset by using 'inject' event

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 6 ++++++
include/linux/perf_event.h | 1 +
kernel/perf_event.c | 23 +++++++++++++++++++++--
3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 11d2a7d..fbb39f6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2470,3 +2470,9 @@ void hw_perf_event_setup_online(int cpu)
{
init_debug_store_on_cpu(cpu);
}
+
+int perf_inject_get_relocate_offset(u64 *offset)
+{
+ *offset = relocate_offset;
+ return 0;
+}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6c93f88..2671234 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -428,6 +428,7 @@ enum perf_event_type {

enum perf_inject_event {
PERF_INJECT_HZ = 0x01,
+ PERF_INJECT_RELOCATE_OFFSET = 0x02,
};

enum perf_callchain_context {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9343c6c..9331426 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2012,6 +2012,12 @@ unlock:
return ret;
}

+extern __weak int perf_inject_get_relocate_offset(u64 *offset)
+{
+ *offset = 0;
+ return 0;
+}
+
static int perf_inject_get_hz(u64 *hz)
{
*hz = HZ;
@@ -2050,10 +2056,23 @@ exit:

static int perf_inject_ioctl(struct perf_event *event, unsigned int arg)
{
- if (!arg || arg & ~PERF_INJECT_HZ)
+ int ret = 0;
+
+ if (!arg || arg & ~(PERF_INJECT_HZ | PERF_INJECT_RELOCATE_OFFSET))
return -EINVAL;

- return perf_inject_event(event, PERF_INJECT_HZ, perf_inject_get_hz);
+ if (arg & PERF_INJECT_HZ) {
+ ret = perf_inject_event(event, PERF_INJECT_HZ,
+ perf_inject_get_hz);
+ if (ret)
+ goto exit;
+ }
+
+ if (arg & PERF_INJECT_RELOCATE_OFFSET)
+ ret = perf_inject_event(event, PERF_INJECT_RELOCATE_OFFSET,
+ perf_inject_get_relocate_offset);
+exit:
+ return ret;
}

static int perf_event_set_output(struct perf_event *event, int output_fd);
--
1.6.1.2

2009-12-30 03:20:48

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 3/3] perf tools: adjust symbol address

Using relocation offset adjust symbol address if we get
kernel symbol name form elf file

Signed-off-by: Xiao Guangrong <[email protected]>
---
tools/perf/builtin-record.c | 3 +++
tools/perf/util/session.c | 6 ++++++
tools/perf/util/symbol.c | 13 +++++++++++++
tools/perf/util/symbol.h | 2 ++
4 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d13601d..4d86969 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -436,6 +436,9 @@ static int __cmd_record(int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);

+ /* Always get relocation offset */
+ inject_events |= PERF_INJECT_RELOCATE_OFFSET;
+
if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
perror("failed to create pipes");
exit(-1);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 74f43af..92d4811 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -243,6 +243,12 @@ static int perf_session__process_event(struct perf_session *self,
case PERF_RECORD_UNTHROTTLE:
return ops->unthrottle(event, self);
case PERF_RECORD_INJECT:
+ if (event->inject.inject_event_id ==
+ PERF_INJECT_RELOCATE_OFFSET) {
+ update_relocate_offset((s32)event->inject.value);
+ return 0;
+ }
+
return ops->inject(event, self);
default:
self->unknown_events++;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 79ca6a0..5b58d34 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -19,6 +19,18 @@
#define NT_GNU_BUILD_ID 3
#endif

+static s32 relocate_offset;
+void update_relocate_offset(s32 offset)
+{
+ relocate_offset = offset;
+}
+
+static inline void update_kernel_address(GElf_Sym *sym, bool kernel)
+{
+ if (kernel)
+ sym->st_value += relocate_offset;
+}
+
enum dso_origin {
DSO__ORIG_KERNEL = 0,
DSO__ORIG_JAVA_JIT,
@@ -1012,6 +1024,7 @@ static int dso__load_sym(struct dso *self, struct map *map,
if (demangled != NULL)
elf_name = demangled;
new_symbol:
+ update_kernel_address(&sym, kernel);
f = symbol__new(sym.st_value, sym.st_size, elf_name);
free(demangled);
if (!f)
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index f27e158..129b4ec 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -10,6 +10,8 @@

#define DEBUG_CACHE_DIR ".debug"

+void update_relocate_offset(s32 offset);
+
#ifdef HAVE_CPLUS_DEMANGLE
extern char *cplus_demangle(const char *, int);

--
1.6.1.2

2009-12-30 13:10:40

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address

Em Wed, Dec 30, 2009 at 11:18:55AM +0800, Xiao Guangrong escreveu:
> Using relocation offset adjust symbol address if we get
> kernel symbol name form elf file
>
> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
> index 79ca6a0..5b58d34 100644
> --- a/tools/perf/util/symbol.c
> +++ b/tools/perf/util/symbol.c
> @@ -19,6 +19,18 @@
> #define NT_GNU_BUILD_ID 3
> #endif
>
> +static s32 relocate_offset;
> +void update_relocate_offset(s32 offset)
> +{
> + relocate_offset = offset;
> +}
> +
> +static inline void update_kernel_address(GElf_Sym *sym, bool kernel)
> +{
> + if (kernel)
> + sym->st_value += relocate_offset;
> +}

This should be done on the dso level, we may process multiple kernels,
some with relocation, some without.

Humm, even at the _map_ level, because then we can just use the normal
map_ip mechanism, this time the not using the identity map stuff, i.e.
we have to get an IP from some event, then subtract the relocate_offset
that will be in map->start.

- Arnaldo

2009-12-30 13:15:32

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Em Wed, Dec 30, 2009 at 11:16:57AM +0800, Xiao Guangrong escreveu:
> Record relocation offset, perf tools will use it
> to adjust kernel symbol address
> Signed-off-by: Xiao Guangrong <[email protected]>

> arch/x86/boot/compressed/head_32.S | 2 ++
> arch/x86/boot/compressed/head_64.S | 3 +++
> arch/x86/include/asm/bootparam.h | 3 ++-
> arch/x86/kernel/asm-offsets_32.c | 1 +
> arch/x86/kernel/asm-offsets_64.c | 1 +
> arch/x86/kernel/cpu/perf_event.c | 4 ++++
<SNIP>
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -26,8 +26,10 @@
> #include <asm/nmi.h>
> +#include <asm/setup.h>
>
> static u64 perf_event_mask __read_mostly;
> +static s32 relocate_offset;
>
> @@ -2173,6 +2175,8 @@ void __init init_hw_perf_events(void)
> pr_info("Performance Events: ");
>
> + relocate_offset = boot_params.relocate_offset;
> switch (boot_cpu_data.x86_vendor) {
> case X86_VENDOR_INTEL:

I'm no expert on the intricacies of boot_params, but all the other hunks
seems sensible, but can't we provide a non-perf specific way of getting
the relocate_offset? I guess other tools would also love to have it.

What about systemtap, don't they solve this in some other way? Frank?

- Arnaldo

2009-12-30 19:50:46

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

On 12/30/2009 05:15 AM, Arnaldo Carvalho de Melo wrote:
>
> I'm no expert on the intricacies of boot_params, but all the other hunks
> seems sensible, but can't we provide a non-perf specific way of getting
> the relocate_offset? I guess other tools would also love to have it.
>
> What about systemtap, don't they solve this in some other way? Frank?
>

I at one point proposed that boot_params should be exported in toto via
sysfs. This got rather brutally shut down as "it's just a debugging
feature" and got moved to debugfs (/debug/boot_params/data). However,
the entire boot_params structure is available there.

Regardless of the reporting method, the patch passing this in by
modifying the early assembly code, though, is more than a little
pointless. The kernel already knows where it is loaded -- obviously, by
sheer necessity -- and knows how it was itself configured, and as such
we can do this calculation in C code without modifying boot_params or
the early bootstrap.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2009-12-30 20:39:58

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Em Wed, Dec 30, 2009 at 11:45:30AM -0800, H. Peter Anvin escreveu:
> On 12/30/2009 05:15 AM, Arnaldo Carvalho de Melo wrote:
> > I'm no expert on the intricacies of boot_params, but all the other hunks
> > seems sensible, but can't we provide a non-perf specific way of getting
> > the relocate_offset? I guess other tools would also love to have it.

> > What about systemtap, don't they solve this in some other way? Frank?
>
> I at one point proposed that boot_params should be exported in toto via
> sysfs. This got rather brutally shut down as "it's just a debugging
> feature" and got moved to debugfs (/debug/boot_params/data). However,
> the entire boot_params structure is available there.
>
> Regardless of the reporting method, the patch passing this in by
> modifying the early assembly code, though, is more than a little
> pointless. The kernel already knows where it is loaded -- obviously, by
> sheer necessity -- and knows how it was itself configured, and as such
> we can do this calculation in C code without modifying boot_params or
> the early bootstrap.

Yeah, rereading the start of this discussion it now seems to me that
what is happening is that a valid vmlinux is found, i.e. one with the
same buildid as the buildid found in the perf.data file but then the
kernel, at the time of perf record, was relocated, not matching what is
in the vmlinux file.

So what we need to do is to figure this out at 'perf record' time and
encode this in the header, so that later, at 'perf report' time, we can
use a matching vmlinux and do the relocation (store this relocation
offset in struct map->start for the kernel map) to get the right
results.

Problem is that at 'perf record' time we may not have access to the
vmlinux file, and thus not be able to figure out the relocation applied
in that boot.

Then, at a later time, and possibly on another machine, on another arch,
we try to map back IPs to symbols, the /proc/kallsyms is completely
unrelated and we now have a vmlinux unrelocated...

So we need a way to get the relocation applied at 'perf record' time and
encode it in the perf.data header. Ideas about how to do that?

- Arnaldo

2009-12-30 21:59:11

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Em Wed, Dec 30, 2009 at 06:39:36PM -0200, Arnaldo Carvalho de Melo escreveu:
> Em Wed, Dec 30, 2009 at 11:45:30AM -0800, H. Peter Anvin escreveu:
> > The kernel already knows where it is loaded -- obviously, by sheer
> > necessity -- and knows how it was itself configured, and as such we
> > can do this calculation in C code without modifying boot_params or
> > the early bootstrap.
>
> Problem is that at 'perf record' time we may not have access to the
> vmlinux file, and thus not be able to figure out the relocation applied
> in that boot.
>
> Then, at a later time, and possibly on another machine, on another arch,
> we try to map back IPs to symbols, the /proc/kallsyms is completely
> unrelated and we now have a vmlinux unrelocated...
>
> So we need a way to get the relocation applied at 'perf record' time and
> encode it in the perf.data header. Ideas about how to do that?

Well, I guess we could do the _stext trick again, storing its value,
taken from /proc/kallsyms, into the perf.data header, then figuring out
the relocation by looking at its value in the vmlinux symtab.

There were concerns in the past about relying on _stext, IIRC, James?

- Arnaldo

2009-12-30 22:14:47

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Are we concerned about virtual or physical addresses, here? I'm assuming virtual; in that case do note that we only actually relocate the kernel on 32 bits - on 64 bits the relocation is done at the page table level since we need the high map anyway.

On 32 bits one can compare any one symbol before and after relocation - it obviously doesn't matter which symbol as long as it is the same. The kernel start will be given by _text or startup_32; if that feels too "fuzzy" we could of course add a specific kernel start symbol explicitly for that purpose.

"Arnaldo Carvalho de Melo" <[email protected]> wrote:

>Em Wed, Dec 30, 2009 at 06:39:36PM -0200, Arnaldo Carvalho de Melo escreveu:
>> Em Wed, Dec 30, 2009 at 11:45:30AM -0800, H. Peter Anvin escreveu:
>> > The kernel already knows where it is loaded -- obviously, by sheer
>> > necessity -- and knows how it was itself configured, and as such we
>> > can do this calculation in C code without modifying boot_params or
>> > the early bootstrap.
>>
>> Problem is that at 'perf record' time we may not have access to the
>> vmlinux file, and thus not be able to figure out the relocation applied
>> in that boot.
>>
>> Then, at a later time, and possibly on another machine, on another arch,
>> we try to map back IPs to symbols, the /proc/kallsyms is completely
>> unrelated and we now have a vmlinux unrelocated...
>>
>> So we need a way to get the relocation applied at 'perf record' time and
>> encode it in the perf.data header. Ideas about how to do that?
>
>Well, I guess we could do the _stext trick again, storing its value,
>taken from /proc/kallsyms, into the perf.data header, then figuring out
>the relocation by looking at its value in the vmlinux symtab.
>
>There were concerns in the past about relying on _stext, IIRC, James?
>
>- Arnaldo

--
Sent from my mobile phone. Please excuse any lack of formatting.

2009-12-30 22:23:10

by James Bottomley

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

On Wed, 2009-12-30 at 19:58 -0200, Arnaldo Carvalho de Melo wrote:
> Em Wed, Dec 30, 2009 at 06:39:36PM -0200, Arnaldo Carvalho de Melo escreveu:
> > Em Wed, Dec 30, 2009 at 11:45:30AM -0800, H. Peter Anvin escreveu:
> > > The kernel already knows where it is loaded -- obviously, by sheer
> > > necessity -- and knows how it was itself configured, and as such we
> > > can do this calculation in C code without modifying boot_params or
> > > the early bootstrap.
> >
> > Problem is that at 'perf record' time we may not have access to the
> > vmlinux file, and thus not be able to figure out the relocation applied
> > in that boot.
> >
> > Then, at a later time, and possibly on another machine, on another arch,
> > we try to map back IPs to symbols, the /proc/kallsyms is completely
> > unrelated and we now have a vmlinux unrelocated...
> >
> > So we need a way to get the relocation applied at 'perf record' time and
> > encode it in the perf.data header. Ideas about how to do that?
>
> Well, I guess we could do the _stext trick again, storing its value,
> taken from /proc/kallsyms, into the perf.data header, then figuring out
> the relocation by looking at its value in the vmlinux symtab.

So reading the thread, I think the problem only exists for x86 compiled
as a relocateable kernel.

> There were concerns in the past about relying on _stext, IIRC, James?

Well, the original concerns were that _text relative relocation
resolution only works for the core kernel, not for modules.
Additionally, the kernel is in several sections, most notably init and
runtime ... these may get loaded at different locations so _text
relative symbol resolution won't work in init sections.

Right at the moment, only x86 and ppc do a relocatable kernel, and, as I
understand the process, the whole kernel image gets a relative offset
applied, so all sections get the same offset. Thus, for this case it
looks like computing the offset from any known symbol would work
(including _text).

James



2009-12-30 23:31:58

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Modules are a completely separate thing - they are linked (not even just relocated) at insertion time, so they need to be tracked separately.

The statement that a _text-based relocation is insufficient is false. The entire x86-32 monolithic kernel is relocated as a unit. The x86-64 kernel, too, is relocated as a unit, but using the page tables, which means it always runs at the compile-time-selected virtual address.

-hpa

"James Bottomley" <[email protected]> wrote:

>On Wed, 2009-12-30 at 19:58 -0200, Arnaldo Carvalho de Melo wrote:
>> Em Wed, Dec 30, 2009 at 06:39:36PM -0200, Arnaldo Carvalho de Melo escreveu:
>> > Em Wed, Dec 30, 2009 at 11:45:30AM -0800, H. Peter Anvin escreveu:
>> > > The kernel already knows where it is loaded -- obviously, by sheer
>> > > necessity -- and knows how it was itself configured, and as such we
>> > > can do this calculation in C code without modifying boot_params or
>> > > the early bootstrap.
>> >
>> > Problem is that at 'perf record' time we may not have access to the
>> > vmlinux file, and thus not be able to figure out the relocation applied
>> > in that boot.
>> >
>> > Then, at a later time, and possibly on another machine, on another arch,
>> > we try to map back IPs to symbols, the /proc/kallsyms is completely
>> > unrelated and we now have a vmlinux unrelocated...
>> >
>> > So we need a way to get the relocation applied at 'perf record' time and
>> > encode it in the perf.data header. Ideas about how to do that?
>>
>> Well, I guess we could do the _stext trick again, storing its value,
>> taken from /proc/kallsyms, into the perf.data header, then figuring out
>> the relocation by looking at its value in the vmlinux symtab.
>
>So reading the thread, I think the problem only exists for x86 compiled
>as a relocateable kernel.
>
>> There were concerns in the past about relying on _stext, IIRC, James?
>
>Well, the original concerns were that _text relative relocation
>resolution only works for the core kernel, not for modules.
>Additionally, the kernel is in several sections, most notably init and
>runtime ... these may get loaded at different locations so _text
>relative symbol resolution won't work in init sections.
>
>Right at the moment, only x86 and ppc do a relocatable kernel, and, as I
>understand the process, the whole kernel image gets a relative offset
>applied, so all sections get the same offset. Thus, for this case it
>looks like computing the offset from any known symbol would work
>(including _text).
>
>James
>
>
>
>

--
Sent from my mobile phone. Please excuse any lack of formatting.

2009-12-30 23:41:57

by James Bottomley

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

On Wed, 2009-12-30 at 15:26 -0800, H. Peter Anvin wrote:
> Modules are a completely separate thing - they are linked (not even
> just relocated) at insertion time, so they need to be tracked
> separately.

The reasons I gave was why _text relocation didn't work properly for
systemtap. The first paragraph was just giving a precis of history
explaining to Arnaldo why he remembered there was a problem with _text
based relocations.

> The statement that a _text-based relocation is insufficient is false.
> The entire x86-32 monolithic kernel is relocated as a unit. The
> x86-64 kernel, too, is relocated as a unit, but using the page tables,
> which means it always runs at the compile-time-selected virtual
> address.

Confused now ... you just repeated what I said in the second paragraph,
but made it sound like you are disagreeing?

James

2009-12-30 23:50:01

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

On 12/30/2009 03:41 PM, James Bottomley wrote:
> On Wed, 2009-12-30 at 15:26 -0800, H. Peter Anvin wrote:
>> Modules are a completely separate thing - they are linked (not even
>> just relocated) at insertion time, so they need to be tracked
>> separately.
>
> The reasons I gave was why _text relocation didn't work properly for
> systemtap. The first paragraph was just giving a precis of history
> explaining to Arnaldo why he remembered there was a problem with _text
> based relocations.
>
>> The statement that a _text-based relocation is insufficient is false.
>> The entire x86-32 monolithic kernel is relocated as a unit. The
>> x86-64 kernel, too, is relocated as a unit, but using the page tables,
>> which means it always runs at the compile-time-selected virtual
>> address.
>
> Confused now ... you just repeated what I said in the second paragraph,
> but made it sound like you are disagreeing?
>

We might have a bit of a context mismatch.

The first I saw of this thread was a proposed patch that would give the
relocation offset of the monolithic kernel, both on 32 and 64 bits,
without any explanation of the usage model. As such, from my point of
view this has always been about the monolithic kernel, until your post
mentioned modules (which the proposed patch would have done nothing about.)

The monolithic kernel offset is a single scalar constant; each module,
of course, is completely different.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2009-12-31 00:31:12

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Em Wed, Dec 30, 2009 at 03:46:03PM -0800, H. Peter Anvin escreveu:
> On 12/30/2009 03:41 PM, James Bottomley wrote:
> > On Wed, 2009-12-30 at 15:26 -0800, H. Peter Anvin wrote:
> >> The statement that a _text-based relocation is insufficient is false.
> >> The entire x86-32 monolithic kernel is relocated as a unit. The
> >> x86-64 kernel, too, is relocated as a unit, but using the page tables,
> >> which means it always runs at the compile-time-selected virtual
> >> address.
> >
> > Confused now ... you just repeated what I said in the second paragraph,
> > but made it sound like you are disagreeing?
>
> We might have a bit of a context mismatch.
>
> The first I saw of this thread was a proposed patch that would give the
> relocation offset of the monolithic kernel, both on 32 and 64 bits,
> without any explanation of the usage model. As such, from my point of
> view this has always been about the monolithic kernel, until your post
> mentioned modules (which the proposed patch would have done nothing about.)
>
> The monolithic kernel offset is a single scalar constant; each module,
> of course, is completely different.

Conclusion: at 'perf record' time store the address of a well know
symbol (_text) into the perf.data header. Later, at perf report time, if
using a vmlinux file, calculate the relocation by subtracting the same
well known symbol from the one stored in the header.

So no need for ioctl or boot stuff.

I'll do that tomorrow, if Xiao doesn't beats me to it :-)

- Arnaldo

2009-12-31 00:53:33

by Frank Ch. Eigler

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Hi -

james wrote:

> The reasons I gave was why _text relocation didn't work properly for
> systemtap. The first paragraph was just giving a precis of history
> [...]

The issues you listed (kernel .init sections, modules) have nothing to
do with kernel relocation, and systemtap has apprx. never had problem
calculating relevant addresses in their presence. The systemtap
problem you are probably thinking of relates to the kernel's former
inability to insert kprobes within .init sections.

- FChE

2009-12-31 03:00:40

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Hi Peter,

Thanks for you review and tell us the better way to get relocation offset.

H. Peter Anvin wrote:

>
> The first I saw of this thread was a proposed patch that would give the
> relocation offset of the monolithic kernel, both on 32 and 64 bits,
> without any explanation of the usage model. As such, from my point of
> view this has always been about the monolithic kernel, until your post
> mentioned modules (which the proposed patch would have done nothing about.)
>

We no need care modules symbols since we get module load address from
'/proc/modules', no matter is relocated or not. And perf tools just use
this way. So, it done nothing about it in my patch, maybe i should mention
it in my patch's changlog

Thanks,
Xiao





2009-12-31 03:01:28

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address



Arnaldo Carvalho de Melo wrote:
> Em Wed, Dec 30, 2009 at 11:18:55AM +0800, Xiao Guangrong escreveu:
>> Using relocation offset adjust symbol address if we get
>> kernel symbol name form elf file
>>
>> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
>> index 79ca6a0..5b58d34 100644
>> --- a/tools/perf/util/symbol.c
>> +++ b/tools/perf/util/symbol.c
>> @@ -19,6 +19,18 @@
>> #define NT_GNU_BUILD_ID 3
>> #endif
>>
>> +static s32 relocate_offset;
>> +void update_relocate_offset(s32 offset)
>> +{
>> + relocate_offset = offset;
>> +}
>> +
>> +static inline void update_kernel_address(GElf_Sym *sym, bool kernel)
>> +{
>> + if (kernel)
>> + sym->st_value += relocate_offset;
>> +}
>
> This should be done on the dso level, we may process multiple kernels,
> some with relocation, some without.
>

Sorry, what does the 'multiple kernels' means?
vmlinux* not know it's relocation or not, and we just handle the _specific_
kernel at the time which match the current system.

And i think adjust symbol address on the dso level is not a good idea since
we hardly know it's a module address or kernel address on dso level, i.e.
hardly to know whether the address need adjust.

> Humm, even at the _map_ level, because then we can just use the normal
> map_ip mechanism, this time the not using the identity map stuff, i.e.
> we have to get an IP from some event, then subtract the relocate_offset
> that will be in map->start.
>

The 'inject' event is the first event in perf.data since we inject it at the
first counter file and inject 'relocation offset' before enable event, i.e.
before getting symbol address we have got the 'relocation offset' already.

Thanks,
Xiao

2009-12-31 03:01:55

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset



Arnaldo Carvalho de Melo wrote:

>
> Conclusion: at 'perf record' time store the address of a well know
> symbol (_text) into the perf.data header. Later, at perf report time, if
> using a vmlinux file, calculate the relocation by subtracting the same
> well known symbol from the one stored in the header.
>
> So no need for ioctl or boot stuff.
>

I'm little confused, how to get the load symbol address?
It's not a good way, if you get it from '/proc/kallsyms', we can't assume kernel
has this file.

> I'll do that tomorrow, if Xiao doesn't beats me to it :-)
>

Of course, please do if you have a better way :-)

Thanks,
Xiao

2009-12-31 10:29:55

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address

Em Thu, Dec 31, 2009 at 10:59:35AM +0800, Xiao Guangrong escreveu:
>
>
> Arnaldo Carvalho de Melo wrote:
> > Em Wed, Dec 30, 2009 at 11:18:55AM +0800, Xiao Guangrong escreveu:
> >> Using relocation offset adjust symbol address if we get
> >> kernel symbol name form elf file
> >>
> >> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
> >> index 79ca6a0..5b58d34 100644
> >> --- a/tools/perf/util/symbol.c
> >> +++ b/tools/perf/util/symbol.c
> >> @@ -19,6 +19,18 @@
> >> #define NT_GNU_BUILD_ID 3
> >> #endif
> >>
> >> +static s32 relocate_offset;
> >> +void update_relocate_offset(s32 offset)
> >> +{
> >> + relocate_offset = offset;
> >> +}
> >> +
> >> +static inline void update_kernel_address(GElf_Sym *sym, bool kernel)
> >> +{
> >> + if (kernel)
> >> + sym->st_value += relocate_offset;
> >> +}

> > This should be done on the dso level, we may process multiple kernels,
> > some with relocation, some without.

> Sorry, what does the 'multiple kernels' means?

perf diff? I think you are combining the 'perf record' and 'perf report'
steps as a single action...

> vmlinux* not know it's relocation or not, and we just handle the _specific_
> kernel at the time which match the current system.

'perf record' in one machine + 'perf report' in another should be
possible.

> And i think adjust symbol address on the dso level is not a good idea since
> we hardly know it's a module address or kernel address on dso level, i.e.
> hardly to know whether the address need adjust.

yeah, as I contininued below, its not the right level, for other
reasons.

> > Humm, even at the _map_ level, because then we can just use the normal
> > map_ip mechanism, this time the not using the identity map stuff, i.e.
> > we have to get an IP from some event, then subtract the relocate_offset
> > that will be in map->start.
> >
>
> The 'inject' event is the first event in perf.data since we inject it at the
> first counter file and inject 'relocation offset' before enable event, i.e.
> before getting symbol address we have got the 'relocation offset' already.

There is no need for kernel changes, as the discussion showed.

- Arnaldo

2009-12-31 10:36:30

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset

Em Thu, Dec 31, 2009 at 11:00:00AM +0800, Xiao Guangrong escreveu:
> Arnaldo Carvalho de Melo wrote:

> > Conclusion: at 'perf record' time store the address of a well know
> > symbol (_text) into the perf.data header. Later, at perf report time, if
> > using a vmlinux file, calculate the relocation by subtracting the same
> > well known symbol from the one stored in the header.

> > So no need for ioctl or boot stuff.

> I'm little confused, how to get the load symbol address?
> It's not a good way, if you get it from '/proc/kallsyms', we can't assume kernel
> has this file.

Well, then its just a matter of exposing _text as
/sys/kernel/sections/.text, as we already have for modules:

[acme@ana linux-2.6-tip]$ cat /sys/module/ipv6/sections/.text
0xfa0c2000

Which matches

nf_conntrack_ipv6 17548 2 - Live 0xfa147000
ipv6 239420 32 ip6t_REJECT,nf_conntrack_ipv6, Live 0xfa0c2000
[acme@ana linux-2.6-tip]$

But even as a quick transational assist, we can look at kallsyms at
'perf record' time.

> > I'll do that tomorrow, if Xiao doesn't beats me to it :-)

> Of course, please do if you have a better way :-)

I meant, if you didn't write the patch first, while I was sleeping :-)

I'll work on it today after some coffee and errands.

Best Regards,

- Arnaldo

2009-12-31 10:50:56

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address



Arnaldo Carvalho de Melo wrote:

>> vmlinux* not know it's relocation or not, and we just handle the _specific_
>> kernel at the time which match the current system.
>
> 'perf record' in one machine + 'perf report' in another should be
> possible.
>

Yeah, but current code not support this since it not record kernel build-id in
perf.data while 'perf record' works, maybe you can fix it.

Xiao

2009-12-31 10:52:25

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset



Arnaldo Carvalho de Melo wrote:

>
> Well, then its just a matter of exposing _text as
> /sys/kernel/sections/.text, as we already have for modules:
>
> [acme@ana linux-2.6-tip]$ cat /sys/module/ipv6/sections/.text
> 0xfa0c2000
>
> Which matches
>
> nf_conntrack_ipv6 17548 2 - Live 0xfa147000
> ipv6 239420 32 ip6t_REJECT,nf_conntrack_ipv6, Live 0xfa0c2000
> [acme@ana linux-2.6-tip]$
>
> But even as a quick transational assist, we can look at kallsyms at
> 'perf record' time.
>

Ah, i see, it's really a nice way.

>>> I'll do that tomorrow, if Xiao doesn't beats me to it :-)
>
>> Of course, please do if you have a better way :-)
>
> I meant, if you didn't write the patch first, while I was sleeping :-)
>
> I'll work on it today after some coffee and errands.
>

Please do :-)

Xiao

2009-12-31 11:09:00

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address

Em Thu, Dec 31, 2009 at 06:49:01PM +0800, Xiao Guangrong escreveu:
>
>
> Arnaldo Carvalho de Melo wrote:
>
> >> vmlinux* not know it's relocation or not, and we just handle the _specific_
> >> kernel at the time which match the current system.
> >
> > 'perf record' in one machine + 'perf report' in another should be
> > possible.
> >
>
> Yeah, but current code not support this since it not record kernel build-id in
> perf.data while 'perf record' works, maybe you can fix it.

Huh? Checking...

[root@ana tmp]# perf record -a -f sleep 1s
[ perf record: Woken up 3 times to write data ]
[ perf record: Captured and wrote 0.330 MB perf.data (~14404 samples) ]
[root@ana tmp]# perf buildid-list | grep kallsyms
a3301c8755357466e39d71b46e8f69aecf83c0e8 [kernel.kallsyms]
a3301c8755357466e39d71b46e8f69aecf83c0e8 [kernel.kallsyms]
[root@ana tmp]#

In fact there is a bug, but the bug is either in 'perf buildid-list'
showing it twice or it being recorded twice in 'perf record'.

- Arnaldo

2009-12-31 11:32:01

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 3/3] perf tools: adjust symbol address



Arnaldo Carvalho de Melo wrote:
> Em Thu, Dec 31, 2009 at 06:49:01PM +0800, Xiao Guangrong escreveu:
>>
>> Arnaldo Carvalho de Melo wrote:
>>
>>>> vmlinux* not know it's relocation or not, and we just handle the _specific_
>>>> kernel at the time which match the current system.
>>> 'perf record' in one machine + 'perf report' in another should be
>>> possible.
>>>
>> Yeah, but current code not support this since it not record kernel build-id in
>> perf.data while 'perf record' works, maybe you can fix it.
>
> Huh? Checking...
>
> [root@ana tmp]# perf record -a -f sleep 1s
> [ perf record: Woken up 3 times to write data ]
> [ perf record: Captured and wrote 0.330 MB perf.data (~14404 samples) ]
> [root@ana tmp]# perf buildid-list | grep kallsyms
> a3301c8755357466e39d71b46e8f69aecf83c0e8 [kernel.kallsyms]
> a3301c8755357466e39d71b46e8f69aecf83c0e8 [kernel.kallsyms]
> [root@ana tmp]#
>
> In fact there is a bug, but the bug is either in 'perf buildid-list'
> showing it twice or it being recorded twice in 'perf record'.
>

Ah, it seems some perf tools like 'perf kmem/sched' not use the build-id which it's
recorded in perf.data and it use the current kernel's build-id:

perf kmem/sched -> perf_session__new() -> perf_session__create_kernel_maps() ->
map_groups__create_kernel_maps() -> dsos__create_kernel():

sysfs__read_build_id("/sys/kernel/notes", kernel->build_id,
sizeof(kernel->build_id)

Hope i not misunderstand it. :-)

Xiao

2010-01-01 09:28:00

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86: record relocation offset


* Arnaldo Carvalho de Melo <[email protected]> wrote:

> Em Thu, Dec 31, 2009 at 11:00:00AM +0800, Xiao Guangrong escreveu:
> > Arnaldo Carvalho de Melo wrote:
>
> > > Conclusion: at 'perf record' time store the address of a well know
> > > symbol (_text) into the perf.data header. Later, at perf report time, if
> > > using a vmlinux file, calculate the relocation by subtracting the same
> > > well known symbol from the one stored in the header.
>
> > > So no need for ioctl or boot stuff.
>
> > I'm little confused, how to get the load symbol address?
> > It's not a good way, if you get it from '/proc/kallsyms', we can't assume kernel
> > has this file.
>
> Well, then its just a matter of exposing _text as
> /sys/kernel/sections/.text, as we already have for modules:
>
> [acme@ana linux-2.6-tip]$ cat /sys/module/ipv6/sections/.text
> 0xfa0c2000
>
> Which matches
>
> nf_conntrack_ipv6 17548 2 - Live 0xfa147000
> ipv6 239420 32 ip6t_REJECT,nf_conntrack_ipv6, Live 0xfa0c2000
> [acme@ana linux-2.6-tip]$

Yeah, that's a good idea and pretty complementary to the existing scheme for
modules.

> But even as a quick transational assist, we can look at kallsyms at 'perf
> record' time.

Yes, we should do that as a fallback mechanism.

(Initially this 'fallback' will be the primary method, until the
sections/.text extension gets merged upstream.)

Thanks,

Ingo