2021-12-12 11:34:06

by Yinan Liu

[permalink] [raw]
Subject: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

When the kernel starts, the initialization of ftrace takes
up a portion of the time (approximately 6~8ms) to sort mcount
addresses. We can save this time by moving mcount-sorting to
compile time.

Signed-off-by: Yinan Liu <[email protected]>
Reported-by: kernel test robot <[email protected]>
Reported-by: kernel test robot <[email protected]>
---
kernel/trace/ftrace.c | 11 +++-
scripts/Makefile | 6 +-
scripts/link-vmlinux.sh | 6 +-
scripts/sorttable.c | 2 +
scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++-
5 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index be5f6b32a012..d5751a96213b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6412,8 +6412,15 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;

- sort(start, count, sizeof(*start),
- ftrace_cmp_ips, NULL);
+ /*
+ * Sorting mcount in vmlinux at build time depend on
+ * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in
+ * modules can not be sorted at build time.
+ */
+ if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) {
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, NULL);
+ }

start_pg = ftrace_allocate_pages(count);
if (!start_pg)
diff --git a/scripts/Makefile b/scripts/Makefile
index 9adb6d247818..b082d2f93357 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -17,6 +17,7 @@ hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
hostprogs-always-$(CONFIG_SYSTEM_REVOCATION_LIST) += extract-cert

HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include
+HOSTLDLIBS_sorttable = -lpthread
HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
HOSTCFLAGS_sign-file.o = $(CRYPTO_CFLAGS)
HOSTLDLIBS_sign-file = $(CRYPTO_LIBS)
@@ -29,7 +30,10 @@ ARCH := x86
endif
HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/x86/include
HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED
-HOSTLDLIBS_sorttable = -lpthread
+endif
+
+ifdef CONFIG_DYNAMIC_FTRACE
+HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED
endif

# The following programs are only built on demand
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 5cdd9bc5c385..dd9955f45774 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -400,6 +400,9 @@ if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then
${RESOLVE_BTFIDS} vmlinux
fi

+info SYSMAP System.map
+mksysmap vmlinux System.map
+
if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then
info SORTTAB vmlinux
if ! sorttable vmlinux; then
@@ -408,9 +411,6 @@ if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then
fi
fi

-info SYSMAP System.map
-mksysmap vmlinux System.map
-
# step a (see comment above)
if [ -n "${CONFIG_KALLSYMS}" ]; then
mksysmap ${kallsyms_vmlinux} .tmp_System.map
diff --git a/scripts/sorttable.c b/scripts/sorttable.c
index b7c2ad71f9cf..70bdc787ddfb 100644
--- a/scripts/sorttable.c
+++ b/scripts/sorttable.c
@@ -30,6 +30,8 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <errno.h>
+#include <pthread.h>

#include <tools/be_byteshift.h>
#include <tools/le_byteshift.h>
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index a2baa2fefb13..d1fbee1f81ef 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -19,6 +19,9 @@

#undef extable_ent_size
#undef compare_extable
+#undef get_mcount_loc
+#undef sort_mcount_loc
+#undef elf_mcount_loc
#undef do_sort
#undef Elf_Addr
#undef Elf_Ehdr
@@ -41,6 +44,9 @@
#ifdef SORTTABLE_64
# define extable_ent_size 16
# define compare_extable compare_extable_64
+# define get_mcount_loc get_mcount_loc_64
+# define sort_mcount_loc sort_mcount_loc_64
+# define elf_mcount_loc elf_mcount_loc_64
# define do_sort do_sort_64
# define Elf_Addr Elf64_Addr
# define Elf_Ehdr Elf64_Ehdr
@@ -62,6 +68,9 @@
#else
# define extable_ent_size 8
# define compare_extable compare_extable_32
+# define get_mcount_loc get_mcount_loc_32
+# define sort_mcount_loc sort_mcount_loc_32
+# define elf_mcount_loc elf_mcount_loc_32
# define do_sort do_sort_32
# define Elf_Addr Elf32_Addr
# define Elf_Ehdr Elf32_Ehdr
@@ -84,8 +93,6 @@

#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED)
/* ORC unwinder only support X86_64 */
-#include <errno.h>
-#include <pthread.h>
#include <asm/orc_types.h>

#define ERRSTR_MAXSZ 256
@@ -191,7 +198,64 @@ static int compare_extable(const void *a, const void *b)
return 1;
return 0;
}
+#ifdef MCOUNT_SORT_ENABLED
+struct elf_mcount_loc {
+ Elf_Ehdr *ehdr;
+ Elf_Shdr *init_data_sec;
+ uint_t start_mcount_loc;
+ uint_t stop_mcount_loc;
+};
+
+/* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */
+static void *sort_mcount_loc(void *arg)
+{
+ struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg;
+ uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->sh_addr)
+ + _r(&(emloc->init_data_sec)->sh_offset);
+ uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc;
+ unsigned char *start_loc = (void *)emloc->ehdr + offset;
+
+ qsort(start_loc, count/sizeof(uint_t), sizeof(uint_t), compare_extable);
+ return NULL;
+}
+
+/* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */
+static void get_mcount_loc(uint_t *_start, uint_t *_stop)
+{
+ FILE *file_start, *file_stop;
+ char start_buff[20];
+ char stop_buff[20];
+ int len = 0;
+
+ file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r");
+ if (!file_start) {
+ fprintf(stderr, "get start_mcount_loc error!");
+ return;
+ }
+
+ file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r");
+ if (!file_stop) {
+ fprintf(stderr, "get stop_mcount_loc error!");
+ pclose(file_start);
+ return;
+ }
+
+ while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) {
+ len = strlen(start_buff);
+ start_buff[len - 1] = '\0';
+ }
+ *_start = strtoul(start_buff, NULL, 16);
+
+ while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) {
+ len = strlen(stop_buff);
+ stop_buff[len - 1] = '\0';
+ }
+ *_stop = strtoul(stop_buff, NULL, 16);

+ pclose(file_start);
+ pclose(file_stop);
+}
+#endif
static int do_sort(Elf_Ehdr *ehdr,
char const *const fname,
table_sort_t custom_sort)
@@ -217,6 +281,12 @@ static int do_sort(Elf_Ehdr *ehdr,
int idx;
unsigned int shnum;
unsigned int shstrndx;
+#ifdef MCOUNT_SORT_ENABLED
+ struct elf_mcount_loc mstruct;
+ uint_t _start_mcount_loc = 0;
+ uint_t _stop_mcount_loc = 0;
+ pthread_t mcount_sort_thread;
+#endif
#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED)
unsigned int orc_ip_size = 0;
unsigned int orc_size = 0;
@@ -253,6 +323,17 @@ static int do_sort(Elf_Ehdr *ehdr,
symtab_shndx = (Elf32_Word *)((const char *)ehdr +
_r(&s->sh_offset));

+#ifdef MCOUNT_SORT_ENABLED
+ /* locate the .init.data section in vmlinux */
+ if (!strcmp(secstrings + idx, ".init.data")) {
+ get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc);
+ mstruct.ehdr = ehdr;
+ mstruct.init_data_sec = s;
+ mstruct.start_mcount_loc = _start_mcount_loc;
+ mstruct.stop_mcount_loc = _stop_mcount_loc;
+ }
+#endif
+
#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED)
/* locate the ORC unwind tables */
if (!strcmp(secstrings + idx, ".orc_unwind_ip")) {
@@ -294,6 +375,23 @@ static int do_sort(Elf_Ehdr *ehdr,
goto out;
}
#endif
+
+#ifdef MCOUNT_SORT_ENABLED
+ if (!mstruct.init_data_sec || !_start_mcount_loc || !_stop_mcount_loc) {
+ fprintf(stderr,
+ "incomplete mcount's sort in file: %s\n",
+ fname);
+ goto out;
+ }
+
+ /* create thread to sort mcount_loc concurrently */
+ if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) {
+ fprintf(stderr,
+ "pthread_create mcount_sort_thread failed '%s': %s\n",
+ strerror(errno), fname);
+ goto out;
+ }
+#endif
if (!extab_sec) {
fprintf(stderr, "no __ex_table in file: %s\n", fname);
goto out;
@@ -376,5 +474,23 @@ static int do_sort(Elf_Ehdr *ehdr,
}
}
#endif
+
+#ifdef MCOUNT_SORT_ENABLED
+ if (mcount_sort_thread) {
+ void *retval = NULL;
+ /* wait for mcount sort done */
+ rc = pthread_join(mcount_sort_thread, &retval);
+ if (rc) {
+ fprintf(stderr,
+ "pthread_join failed '%s': %s\n",
+ strerror(errno), fname);
+ } else if (retval) {
+ rc = -1;
+ fprintf(stderr,
+ "failed to sort mcount '%s': %s\n",
+ (char *)retval, fname);
+ }
+ }
+#endif
return rc;
}
--
2.27.0



2022-01-22 00:45:21

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

Hi Yinan,

Yinan Liu <[email protected]> writes:

> When the kernel starts, the initialization of ftrace takes
> up a portion of the time (approximately 6~8ms) to sort mcount
> addresses. We can save this time by moving mcount-sorting to
> compile time.
>
> Signed-off-by: Yinan Liu <[email protected]>
> Reported-by: kernel test robot <[email protected]>
> Reported-by: kernel test robot <[email protected]>
> ---
> kernel/trace/ftrace.c | 11 +++-
> scripts/Makefile | 6 +-
> scripts/link-vmlinux.sh | 6 +-
> scripts/sorttable.c | 2 +
> scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++-
> 5 files changed, 137 insertions(+), 8 deletions(-)

while i like the idea, this unfortunately breaks ftrace on s390. The
reason for that is that the compiler generates relocation entries for
all the addresses in __mcount_loc. During boot, the s390 decompressor
iterates through all the relocations and overwrites the nicely
sorted list between __start_mcount_loc and __stop_mcount_loc with
the unsorted list because the relocations entries are not adjusted.

Of course we could just disable that option, but that would make us
different compared to x86 which i don't like. Adding code to sort the
relocation would of course also fix that, but i don't think it is a good
idea to rely on the order of relocations.

Any thoughts how a fix could look like, and whether that could also be a
problem on other architectures?

Thanks
Sven

2022-01-22 00:47:28

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

On Fri, Jan 21, 2022 at 10:46:36AM +0100, Sven Schnelle wrote:
> Hi Yinan,
>
> Yinan Liu <[email protected]> writes:
>
> > When the kernel starts, the initialization of ftrace takes
> > up a portion of the time (approximately 6~8ms) to sort mcount
> > addresses. We can save this time by moving mcount-sorting to
> > compile time.
> >
> > Signed-off-by: Yinan Liu <[email protected]>
> > Reported-by: kernel test robot <[email protected]>
> > Reported-by: kernel test robot <[email protected]>
> > ---
> > kernel/trace/ftrace.c | 11 +++-
> > scripts/Makefile | 6 +-
> > scripts/link-vmlinux.sh | 6 +-
> > scripts/sorttable.c | 2 +
> > scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++-
> > 5 files changed, 137 insertions(+), 8 deletions(-)
>
> while i like the idea, this unfortunately breaks ftrace on s390. The
> reason for that is that the compiler generates relocation entries for
> all the addresses in __mcount_loc. During boot, the s390 decompressor
> iterates through all the relocations and overwrites the nicely
> sorted list between __start_mcount_loc and __stop_mcount_loc with
> the unsorted list because the relocations entries are not adjusted.
>
> Of course we could just disable that option, but that would make us
> different compared to x86 which i don't like. Adding code to sort the
> relocation would of course also fix that, but i don't think it is a good
> idea to rely on the order of relocations.
>
> Any thoughts how a fix could look like, and whether that could also be a
> problem on other architectures?

Sven, thanks for figuring this out. Can you confirm that reverting
commit 72b3942a173c ("scripts: ftrace - move the sort-processing in
ftrace_init") fixes the problem?

This really should be addressed before rc1 is out, otherwise s390 is
broken if somebody enables ftrace.
Where "broken" translates to random crashes as soon as ftrace is
enabled, which again is nowadays quite common.

2022-01-22 00:49:36

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

Heiko Carstens <[email protected]> writes:

> On Fri, Jan 21, 2022 at 10:46:36AM +0100, Sven Schnelle wrote:
>> Hi Yinan,
>>
>> Yinan Liu <[email protected]> writes:
>>
>> > When the kernel starts, the initialization of ftrace takes
>> > up a portion of the time (approximately 6~8ms) to sort mcount
>> > addresses. We can save this time by moving mcount-sorting to
>> > compile time.
>> >
>> > Signed-off-by: Yinan Liu <[email protected]>
>> > Reported-by: kernel test robot <[email protected]>
>> > Reported-by: kernel test robot <[email protected]>
>> > ---
>> > kernel/trace/ftrace.c | 11 +++-
>> > scripts/Makefile | 6 +-
>> > scripts/link-vmlinux.sh | 6 +-
>> > scripts/sorttable.c | 2 +
>> > scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++-
>> > 5 files changed, 137 insertions(+), 8 deletions(-)
>>
>> while i like the idea, this unfortunately breaks ftrace on s390. The
>> reason for that is that the compiler generates relocation entries for
>> all the addresses in __mcount_loc. During boot, the s390 decompressor
>> iterates through all the relocations and overwrites the nicely
>> sorted list between __start_mcount_loc and __stop_mcount_loc with
>> the unsorted list because the relocations entries are not adjusted.
>>
>> Of course we could just disable that option, but that would make us
>> different compared to x86 which i don't like. Adding code to sort the
>> relocation would of course also fix that, but i don't think it is a good
>> idea to rely on the order of relocations.
>>
>> Any thoughts how a fix could look like, and whether that could also be a
>> problem on other architectures?
>
> Sven, thanks for figuring this out. Can you confirm that reverting
> commit 72b3942a173c ("scripts: ftrace - move the sort-processing in
> ftrace_init") fixes the problem?

Yes, reverting this commit fixes it.

> This really should be addressed before rc1 is out, otherwise s390 is
> broken if somebody enables ftrace.
> Where "broken" translates to random crashes as soon as ftrace is
> enabled, which again is nowadays quite common.

I wasn't able to reproduce these crashes on my systems so far. For the
readers here, we're seeing about 10-15 systems crashing every night,
usually in the 00basic/ ftrace testcases.

In most of the case it looks like register corruption, where some random
register is or'd or parts are overwritten with 0x0004000000000000,
sometimes 0x00f4000000000000. I haven't found yts found a commit that
might cause this.

/Sven

2022-01-22 00:52:40

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

Sven Schnelle <[email protected]> writes:

> Heiko Carstens <[email protected]> writes:
>
>> On Fri, Jan 21, 2022 at 10:46:36AM +0100, Sven Schnelle wrote:
>>> Hi Yinan,
>>>
>>> Yinan Liu <[email protected]> writes:
>>>
>>> > When the kernel starts, the initialization of ftrace takes
>>> > up a portion of the time (approximately 6~8ms) to sort mcount
>>> > addresses. We can save this time by moving mcount-sorting to
>>> > compile time.
>>> >
>>> > Signed-off-by: Yinan Liu <[email protected]>
>>> > Reported-by: kernel test robot <[email protected]>
>>> > Reported-by: kernel test robot <[email protected]>
>>> > ---
>>> > kernel/trace/ftrace.c | 11 +++-
>>> > scripts/Makefile | 6 +-
>>> > scripts/link-vmlinux.sh | 6 +-
>>> > scripts/sorttable.c | 2 +
>>> > scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++-
>>> > 5 files changed, 137 insertions(+), 8 deletions(-)
>>>
>>> while i like the idea, this unfortunately breaks ftrace on s390. The
>>> reason for that is that the compiler generates relocation entries for
>>> all the addresses in __mcount_loc. During boot, the s390 decompressor
>>> iterates through all the relocations and overwrites the nicely
>>> sorted list between __start_mcount_loc and __stop_mcount_loc with
>>> the unsorted list because the relocations entries are not adjusted.
>>>
>>> Of course we could just disable that option, but that would make us
>>> different compared to x86 which i don't like. Adding code to sort the
>>> relocation would of course also fix that, but i don't think it is a good
>>> idea to rely on the order of relocations.
>>>
>>> Any thoughts how a fix could look like, and whether that could also be a
>>> problem on other architectures?
>>
>> Sven, thanks for figuring this out. Can you confirm that reverting
>> commit 72b3942a173c ("scripts: ftrace - move the sort-processing in
>> ftrace_init") fixes the problem?
>
> Yes, reverting this commit fixes it.
>
>> This really should be addressed before rc1 is out, otherwise s390 is
>> broken if somebody enables ftrace.
>> Where "broken" translates to random crashes as soon as ftrace is
>> enabled, which again is nowadays quite common.
>
> I wasn't able to reproduce these crashes on my systems so far. For the
> readers here, we're seeing about 10-15 systems crashing every night,
> usually in the 00basic/ ftrace testcases.
>
> In most of the case it looks like register corruption, where some random
> register is or'd or parts are overwritten with 0x0004000000000000,
> sometimes 0x00f4000000000000. I haven't found yts found a commit that
> might cause this.

Thinking of it, 04 and f4 are exactly the bytes we're patching in our brcl
instructions right at the beginning of the function. So i guess that
because of this bug the ftrace code now writes those bytes to the wrong
location, sometimes hitting the register save area. I haven't verified
that, but i think there's a high likelyhood.

/Sven

2022-01-22 02:10:21

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

On Fri, 21 Jan 2022 11:42:28 +0100
Heiko Carstens <[email protected]> wrote:

> This really should be addressed before rc1 is out, otherwise s390 is
> broken if somebody enables ftrace.
> Where "broken" translates to random crashes as soon as ftrace is
> enabled, which again is nowadays quite common.

Instead of reverting, should we just add this patch?

-- Steve

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f468767bc287..752ed89a293b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -70,6 +70,13 @@ config HAVE_C_RECORDMCOUNT
help
C version of recordmcount available?

+config BUILDTIME_MCOUNT_SORT
+ bool
+ default y
+ depends on BUILDTIME_TABLE_SORT && !S390
+ help
+ Sort the mcount_loc section at build time.
+
config TRACER_MAX_TRACE
bool

@@ -918,7 +925,7 @@ config EVENT_TRACE_TEST_SYSCALLS
config FTRACE_SORT_STARTUP_TEST
bool "Verify compile time sorting of ftrace functions"
depends on DYNAMIC_FTRACE
- depends on BUILDTIME_TABLE_SORT
+ depends on BUILDTIME_MCOUNT_SORT
help
Sorting of the mcount_loc sections that is used to find the
where the ftrace knows where to patch functions for tracing
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 403e485bf091..b01e1fa62193 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6429,10 +6429,10 @@ static int ftrace_process_locs(struct module *mod,

/*
* Sorting mcount in vmlinux at build time depend on
- * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in
+ * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in
* modules can not be sorted at build time.
*/
- if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) {
+ if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) {
sort(start, count, sizeof(*start),
ftrace_cmp_ips, NULL);
} else {

2022-01-23 14:55:16

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v8] scripts: ftrace - move the sort-processing in ftrace_init

On Fri, Jan 21, 2022 at 01:11:48PM -0500, Steven Rostedt wrote:
> On Fri, 21 Jan 2022 11:42:28 +0100
> Heiko Carstens <[email protected]> wrote:
>
> > This really should be addressed before rc1 is out, otherwise s390 is
> > broken if somebody enables ftrace.
> > Where "broken" translates to random crashes as soon as ftrace is
> > enabled, which again is nowadays quite common.
>
> Instead of reverting, should we just add this patch?

That would work as well.

Tested-by: Heiko Carstens <[email protected]>