2021-05-07 00:04:44

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 01/13] objtool: Rewrite hashtable sizing

Currently objtool has 5 hashtables and sizes them 16 or 20 bits
depending on the --vmlinux argument.

However, a single side doesn't really work well for the 5 tables,
which among them, cover 3 different uses. Also, while vmlinux is
larger, there is still a very wide difference between a defconfig and
allyesconfig build, which again isn't optimally covered by a single
size.

Another aspect is the cost of elf_hash_init(), which for large tables
dominates the runtime for small input files. It turns out that all it
does it assign NULL, something that is required when using malloc().
However, when we allocate memory using mmap(), we're guaranteed to get
zero filled pages.

Therefore, rewrite the whole thing to:

1) use more dynamic sized tables, depending on the input file,
2) avoid the need for elf_hash_init() entirely by using mmap().

This speeds up a regular kernel build (100s to 98s for
x86_64-defconfig), and potentially dramatically speeds up vmlinux
processing.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
tools/objtool/elf.c | 95 +++++++++++++++++++++++-------------
tools/objtool/include/objtool/elf.h | 17 ++++--
2 files changed, 73 insertions(+), 39 deletions(-)

--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -9,6 +9,7 @@

#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -27,21 +28,27 @@ static inline u32 str_hash(const char *s
return jhash(str, strlen(str), 0);
}

-static inline int elf_hash_bits(void)
-{
- return vmlinux ? ELF_HASH_BITS : 16;
-}
+#define __elf_table(name) (elf->name##_hash)
+#define __elf_bits(name) (elf->name##_bits)

-#define elf_hash_add(hashtable, node, key) \
- hlist_add_head(node, &hashtable[hash_min(key, elf_hash_bits())])
+#define elf_hash_add(name, node, key) \
+ hlist_add_head(node, &__elf_table(name)[hash_min(key, __elf_bits(name))])

-static void elf_hash_init(struct hlist_head *table)
-{
- __hash_init(table, 1U << elf_hash_bits());
-}
+#define elf_hash_for_each_possible(name, obj, member, key) \
+ hlist_for_each_entry(obj, &__elf_table(name)[hash_min(key, __elf_bits(name))], member)

-#define elf_hash_for_each_possible(name, obj, member, key) \
- hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member)
+#define elf_alloc_hash(name, size) \
+({ \
+ __elf_bits(name) = max(10, ilog2(size)); \
+ __elf_table(name) = mmap(NULL, sizeof(struct hlist_head) << __elf_bits(name), \
+ PROT_READ|PROT_WRITE, \
+ MAP_PRIVATE|MAP_ANON, -1, 0); \
+ if (__elf_table(name) == (void *)-1L) { \
+ WARN("mmap fail " #name); \
+ __elf_table(name) = NULL; \
+ } \
+ __elf_table(name); \
+})

static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b)
{
@@ -80,9 +87,10 @@ struct section *find_section_by_name(con
{
struct section *sec;

- elf_hash_for_each_possible(elf->section_name_hash, sec, name_hash, str_hash(name))
+ elf_hash_for_each_possible(section_name, sec, name_hash, str_hash(name)) {
if (!strcmp(sec->name, name))
return sec;
+ }

return NULL;
}
@@ -92,9 +100,10 @@ static struct section *find_section_by_i
{
struct section *sec;

- elf_hash_for_each_possible(elf->section_hash, sec, hash, idx)
+ elf_hash_for_each_possible(section, sec, hash, idx) {
if (sec->idx == idx)
return sec;
+ }

return NULL;
}
@@ -103,9 +112,10 @@ static struct symbol *find_symbol_by_ind
{
struct symbol *sym;

- elf_hash_for_each_possible(elf->symbol_hash, sym, hash, idx)
+ elf_hash_for_each_possible(symbol, sym, hash, idx) {
if (sym->idx == idx)
return sym;
+ }

return NULL;
}
@@ -170,9 +180,10 @@ struct symbol *find_symbol_by_name(const
{
struct symbol *sym;

- elf_hash_for_each_possible(elf->symbol_name_hash, sym, name_hash, str_hash(name))
+ elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) {
if (!strcmp(sym->name, name))
return sym;
+ }

return NULL;
}
@@ -189,8 +200,8 @@ struct reloc *find_reloc_by_dest_range(c
sec = sec->reloc;

for_offset_range(o, offset, offset + len) {
- elf_hash_for_each_possible(elf->reloc_hash, reloc, hash,
- sec_offset_hash(sec, o)) {
+ elf_hash_for_each_possible(reloc, reloc, hash,
+ sec_offset_hash(sec, o)) {
if (reloc->sec != sec)
continue;

@@ -228,6 +239,10 @@ static int read_sections(struct elf *elf
return -1;
}

+ if (!elf_alloc_hash(section, sections_nr) ||
+ !elf_alloc_hash(section_name, sections_nr))
+ return -1;
+
for (i = 0; i < sections_nr; i++) {
sec = malloc(sizeof(*sec));
if (!sec) {
@@ -274,12 +289,14 @@ static int read_sections(struct elf *elf
sec->len = sec->sh.sh_size;

list_add_tail(&sec->list, &elf->sections);
- elf_hash_add(elf->section_hash, &sec->hash, sec->idx);
- elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name));
+ elf_hash_add(section, &sec->hash, sec->idx);
+ elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
}

- if (stats)
+ if (stats) {
printf("nr_sections: %lu\n", (unsigned long)sections_nr);
+ printf("section_bits: %d\n", elf->section_bits);
+ }

/* sanity check, one more call to elf_nextscn() should return NULL */
if (elf_nextscn(elf->elf, s)) {
@@ -308,8 +325,8 @@ static void elf_add_symbol(struct elf *e
else
entry = &sym->sec->symbol_list;
list_add(&sym->list, entry);
- elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx);
- elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name));
+ elf_hash_add(symbol, &sym->hash, sym->idx);
+ elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name));

/*
* Don't store empty STT_NOTYPE symbols in the rbtree. They
@@ -343,6 +360,10 @@ static int read_symbols(struct elf *elf)

symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;

+ if (!elf_alloc_hash(symbol, symbols_nr) ||
+ !elf_alloc_hash(symbol_name, symbols_nr))
+ return -1;
+
for (i = 0; i < symbols_nr; i++) {
sym = malloc(sizeof(*sym));
if (!sym) {
@@ -389,8 +410,10 @@ static int read_symbols(struct elf *elf)
elf_add_symbol(elf, sym);
}

- if (stats)
+ if (stats) {
printf("nr_symbols: %lu\n", (unsigned long)symbols_nr);
+ printf("symbol_bits: %d\n", elf->symbol_bits);
+ }

/* Create parent/child links for any cold subfunctions */
list_for_each_entry(sec, &elf->sections, list) {
@@ -479,7 +502,7 @@ int elf_add_reloc(struct elf *elf, struc
reloc->addend = addend;

list_add_tail(&reloc->list, &sec->reloc->reloc_list);
- elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
+ elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc));

sec->reloc->changed = true;

@@ -556,6 +579,15 @@ static int read_relocs(struct elf *elf)
unsigned int symndx;
unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;

+ sec = find_section_by_name(elf, ".text");
+ if (!sec) {
+ WARN("no .text");
+ return -1;
+ }
+
+ if (!elf_alloc_hash(reloc, sec->len / 16))
+ return -1;
+
list_for_each_entry(sec, &elf->sections, list) {
if ((sec->sh.sh_type != SHT_RELA) &&
(sec->sh.sh_type != SHT_REL))
@@ -600,7 +632,7 @@ static int read_relocs(struct elf *elf)
}

list_add_tail(&reloc->list, &sec->reloc_list);
- elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
+ elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc));

nr_reloc++;
}
@@ -611,6 +643,7 @@ static int read_relocs(struct elf *elf)
if (stats) {
printf("max_reloc: %lu\n", max_reloc);
printf("tot_reloc: %lu\n", tot_reloc);
+ printf("reloc_bits: %d\n", elf->reloc_bits);
}

return 0;
@@ -632,12 +665,6 @@ struct elf *elf_open_read(const char *na

INIT_LIST_HEAD(&elf->sections);

- elf_hash_init(elf->symbol_hash);
- elf_hash_init(elf->symbol_name_hash);
- elf_hash_init(elf->section_hash);
- elf_hash_init(elf->section_name_hash);
- elf_hash_init(elf->reloc_hash);
-
elf->fd = open(name, flags);
if (elf->fd == -1) {
fprintf(stderr, "objtool: Can't open '%s': %s\n",
@@ -850,8 +877,8 @@ struct section *elf_create_section(struc
return NULL;

list_add_tail(&sec->list, &elf->sections);
- elf_hash_add(elf->section_hash, &sec->hash, sec->idx);
- elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name));
+ elf_hash_add(section, &sec->hash, sec->idx);
+ elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));

elf->changed = true;

--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -84,11 +84,18 @@ struct elf {
bool changed;
char *name;
struct list_head sections;
- DECLARE_HASHTABLE(symbol_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS);
+
+ int symbol_bits;
+ int symbol_name_bits;
+ int section_bits;
+ int section_name_bits;
+ int reloc_bits;
+
+ struct hlist_head *symbol_hash;
+ struct hlist_head *symbol_name_hash;
+ struct hlist_head *section_hash;
+ struct hlist_head *section_name_hash;
+ struct hlist_head *reloc_hash;
};

#define OFFSET_STRIDE_BITS 4



2021-05-12 10:43:46

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, May 06, 2021 at 09:33:53PM +0200, Peter Zijlstra wrote:
> @@ -343,6 +360,10 @@ static int read_symbols(struct elf *elf)
>
> symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;
>
> + if (!elf_alloc_hash(symbol, symbols_nr) ||
> + !elf_alloc_hash(symbol_name, symbols_nr))
> + return -1;
> +
> for (i = 0; i < symbols_nr; i++) {
> sym = malloc(sizeof(*sym));
> if (!sym) {

Ingo ran into the empty file without .symtab case with as-2.36.1, which
then means we don't even allocate the symbol hashes which then explodes
later.

The below seems to fix things.

---
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 6942357cd4a2..60bef847ee85 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -340,25 +340,19 @@ static int read_symbols(struct elf *elf)
{
struct section *symtab, *symtab_shndx, *sec;
struct symbol *sym, *pfunc;
- int symbols_nr, i;
+ int i, symbols_nr = 0;
char *coldstr;
Elf_Data *shndx_data = NULL;
Elf32_Word shndx;

symtab = find_section_by_name(elf, ".symtab");
- if (!symtab) {
- /*
- * A missing symbol table is actually possible if it's an empty
- * .o file. This can happen for thunk_64.o.
- */
- return 0;
- }
-
- symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
- if (symtab_shndx)
- shndx_data = symtab_shndx->data;
+ if (symtab) {
+ symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
+ if (symtab_shndx)
+ shndx_data = symtab_shndx->data;

- symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;
+ symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;
+ }

if (!elf_alloc_hash(symbol, symbols_nr) ||
!elf_alloc_hash(symbol_name, symbols_nr))

Subject: [tip: objtool/core] objtool: Rewrite hashtable sizing

The following commit has been merged into the objtool/core branch of tip:

Commit-ID: 25cf0d8aa2a3440ed32bf1f8df1310d6baf3f1e8
Gitweb: https://git.kernel.org/tip/25cf0d8aa2a3440ed32bf1f8df1310d6baf3f1e8
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 06 May 2021 21:33:53 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 12 May 2021 14:54:50 +02:00

objtool: Rewrite hashtable sizing

Currently objtool has 5 hashtables and sizes them 16 or 20 bits
depending on the --vmlinux argument.

However, a single side doesn't really work well for the 5 tables,
which among them, cover 3 different uses. Also, while vmlinux is
larger, there is still a very wide difference between a defconfig and
allyesconfig build, which again isn't optimally covered by a single
size.

Another aspect is the cost of elf_hash_init(), which for large tables
dominates the runtime for small input files. It turns out that all it
does it assign NULL, something that is required when using malloc().
However, when we allocate memory using mmap(), we're guaranteed to get
zero filled pages.

Therefore, rewrite the whole thing to:

1) use more dynamic sized tables, depending on the input file,
2) avoid the need for elf_hash_init() entirely by using mmap().

This speeds up a regular kernel build (100s to 98s for
x86_64-defconfig), and potentially dramatically speeds up vmlinux
processing.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
tools/objtool/elf.c | 113 ++++++++++++++++-----------
tools/objtool/include/objtool/elf.h | 17 ++--
2 files changed, 83 insertions(+), 47 deletions(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index d08f5f3..a8a0ee2 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -9,6 +9,7 @@

#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -27,21 +28,27 @@ static inline u32 str_hash(const char *str)
return jhash(str, strlen(str), 0);
}

-static inline int elf_hash_bits(void)
-{
- return vmlinux ? ELF_HASH_BITS : 16;
-}
+#define __elf_table(name) (elf->name##_hash)
+#define __elf_bits(name) (elf->name##_bits)

-#define elf_hash_add(hashtable, node, key) \
- hlist_add_head(node, &hashtable[hash_min(key, elf_hash_bits())])
+#define elf_hash_add(name, node, key) \
+ hlist_add_head(node, &__elf_table(name)[hash_min(key, __elf_bits(name))])

-static void elf_hash_init(struct hlist_head *table)
-{
- __hash_init(table, 1U << elf_hash_bits());
-}
+#define elf_hash_for_each_possible(name, obj, member, key) \
+ hlist_for_each_entry(obj, &__elf_table(name)[hash_min(key, __elf_bits(name))], member)

-#define elf_hash_for_each_possible(name, obj, member, key) \
- hlist_for_each_entry(obj, &name[hash_min(key, elf_hash_bits())], member)
+#define elf_alloc_hash(name, size) \
+({ \
+ __elf_bits(name) = max(10, ilog2(size)); \
+ __elf_table(name) = mmap(NULL, sizeof(struct hlist_head) << __elf_bits(name), \
+ PROT_READ|PROT_WRITE, \
+ MAP_PRIVATE|MAP_ANON, -1, 0); \
+ if (__elf_table(name) == (void *)-1L) { \
+ WARN("mmap fail " #name); \
+ __elf_table(name) = NULL; \
+ } \
+ __elf_table(name); \
+})

static bool symbol_to_offset(struct rb_node *a, const struct rb_node *b)
{
@@ -80,9 +87,10 @@ struct section *find_section_by_name(const struct elf *elf, const char *name)
{
struct section *sec;

- elf_hash_for_each_possible(elf->section_name_hash, sec, name_hash, str_hash(name))
+ elf_hash_for_each_possible(section_name, sec, name_hash, str_hash(name)) {
if (!strcmp(sec->name, name))
return sec;
+ }

return NULL;
}
@@ -92,9 +100,10 @@ static struct section *find_section_by_index(struct elf *elf,
{
struct section *sec;

- elf_hash_for_each_possible(elf->section_hash, sec, hash, idx)
+ elf_hash_for_each_possible(section, sec, hash, idx) {
if (sec->idx == idx)
return sec;
+ }

return NULL;
}
@@ -103,9 +112,10 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx)
{
struct symbol *sym;

- elf_hash_for_each_possible(elf->symbol_hash, sym, hash, idx)
+ elf_hash_for_each_possible(symbol, sym, hash, idx) {
if (sym->idx == idx)
return sym;
+ }

return NULL;
}
@@ -170,9 +180,10 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name)
{
struct symbol *sym;

- elf_hash_for_each_possible(elf->symbol_name_hash, sym, name_hash, str_hash(name))
+ elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) {
if (!strcmp(sym->name, name))
return sym;
+ }

return NULL;
}
@@ -189,8 +200,8 @@ struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *se
sec = sec->reloc;

for_offset_range(o, offset, offset + len) {
- elf_hash_for_each_possible(elf->reloc_hash, reloc, hash,
- sec_offset_hash(sec, o)) {
+ elf_hash_for_each_possible(reloc, reloc, hash,
+ sec_offset_hash(sec, o)) {
if (reloc->sec != sec)
continue;

@@ -228,6 +239,10 @@ static int read_sections(struct elf *elf)
return -1;
}

+ if (!elf_alloc_hash(section, sections_nr) ||
+ !elf_alloc_hash(section_name, sections_nr))
+ return -1;
+
for (i = 0; i < sections_nr; i++) {
sec = malloc(sizeof(*sec));
if (!sec) {
@@ -274,12 +289,14 @@ static int read_sections(struct elf *elf)
sec->len = sec->sh.sh_size;

list_add_tail(&sec->list, &elf->sections);
- elf_hash_add(elf->section_hash, &sec->hash, sec->idx);
- elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name));
+ elf_hash_add(section, &sec->hash, sec->idx);
+ elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
}

- if (stats)
+ if (stats) {
printf("nr_sections: %lu\n", (unsigned long)sections_nr);
+ printf("section_bits: %d\n", elf->section_bits);
+ }

/* sanity check, one more call to elf_nextscn() should return NULL */
if (elf_nextscn(elf->elf, s)) {
@@ -308,8 +325,8 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym)
else
entry = &sym->sec->symbol_list;
list_add(&sym->list, entry);
- elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx);
- elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name));
+ elf_hash_add(symbol, &sym->hash, sym->idx);
+ elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name));

/*
* Don't store empty STT_NOTYPE symbols in the rbtree. They
@@ -329,19 +346,25 @@ static int read_symbols(struct elf *elf)
Elf32_Word shndx;

symtab = find_section_by_name(elf, ".symtab");
- if (!symtab) {
+ if (symtab) {
+ symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
+ if (symtab_shndx)
+ shndx_data = symtab_shndx->data;
+
+ symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;
+ } else {
/*
* A missing symbol table is actually possible if it's an empty
- * .o file. This can happen for thunk_64.o.
+ * .o file. This can happen for thunk_64.o. Make sure to at
+ * least allocate the symbol hash tables so we can do symbol
+ * lookups without crashing.
*/
- return 0;
+ symbols_nr = 0;
}

- symtab_shndx = find_section_by_name(elf, ".symtab_shndx");
- if (symtab_shndx)
- shndx_data = symtab_shndx->data;
-
- symbols_nr = symtab->sh.sh_size / symtab->sh.sh_entsize;
+ if (!elf_alloc_hash(symbol, symbols_nr) ||
+ !elf_alloc_hash(symbol_name, symbols_nr))
+ return -1;

for (i = 0; i < symbols_nr; i++) {
sym = malloc(sizeof(*sym));
@@ -389,8 +412,10 @@ static int read_symbols(struct elf *elf)
elf_add_symbol(elf, sym);
}

- if (stats)
+ if (stats) {
printf("nr_symbols: %lu\n", (unsigned long)symbols_nr);
+ printf("symbol_bits: %d\n", elf->symbol_bits);
+ }

/* Create parent/child links for any cold subfunctions */
list_for_each_entry(sec, &elf->sections, list) {
@@ -479,7 +504,7 @@ int elf_add_reloc(struct elf *elf, struct section *sec, unsigned long offset,
reloc->addend = addend;

list_add_tail(&reloc->list, &sec->reloc->reloc_list);
- elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
+ elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc));

sec->reloc->changed = true;

@@ -556,6 +581,15 @@ static int read_relocs(struct elf *elf)
unsigned int symndx;
unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;

+ sec = find_section_by_name(elf, ".text");
+ if (!sec) {
+ WARN("no .text");
+ return -1;
+ }
+
+ if (!elf_alloc_hash(reloc, sec->len / 16))
+ return -1;
+
list_for_each_entry(sec, &elf->sections, list) {
if ((sec->sh.sh_type != SHT_RELA) &&
(sec->sh.sh_type != SHT_REL))
@@ -600,7 +634,7 @@ static int read_relocs(struct elf *elf)
}

list_add_tail(&reloc->list, &sec->reloc_list);
- elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
+ elf_hash_add(reloc, &reloc->hash, reloc_hash(reloc));

nr_reloc++;
}
@@ -611,6 +645,7 @@ static int read_relocs(struct elf *elf)
if (stats) {
printf("max_reloc: %lu\n", max_reloc);
printf("tot_reloc: %lu\n", tot_reloc);
+ printf("reloc_bits: %d\n", elf->reloc_bits);
}

return 0;
@@ -632,12 +667,6 @@ struct elf *elf_open_read(const char *name, int flags)

INIT_LIST_HEAD(&elf->sections);

- elf_hash_init(elf->symbol_hash);
- elf_hash_init(elf->symbol_name_hash);
- elf_hash_init(elf->section_hash);
- elf_hash_init(elf->section_name_hash);
- elf_hash_init(elf->reloc_hash);
-
elf->fd = open(name, flags);
if (elf->fd == -1) {
fprintf(stderr, "objtool: Can't open '%s': %s\n",
@@ -850,8 +879,8 @@ struct section *elf_create_section(struct elf *elf, const char *name,
return NULL;

list_add_tail(&sec->list, &elf->sections);
- elf_hash_add(elf->section_hash, &sec->hash, sec->idx);
- elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name));
+ elf_hash_add(section, &sec->hash, sec->idx);
+ elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));

elf->changed = true;

diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 45e5ede..9008275 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -84,11 +84,18 @@ struct elf {
bool changed;
char *name;
struct list_head sections;
- DECLARE_HASHTABLE(symbol_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS);
+
+ int symbol_bits;
+ int symbol_name_bits;
+ int section_bits;
+ int section_name_bits;
+ int reloc_bits;
+
+ struct hlist_head *symbol_hash;
+ struct hlist_head *symbol_name_hash;
+ struct hlist_head *section_hash;
+ struct hlist_head *section_name_hash;
+ struct hlist_head *reloc_hash;
};

#define OFFSET_STRIDE_BITS 4

2021-06-10 18:17:02

by Nathan Chancellor

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

Hi Peter,

On Thu, May 06, 2021 at 09:33:53PM +0200, Peter Zijlstra wrote:
> Currently objtool has 5 hashtables and sizes them 16 or 20 bits
> depending on the --vmlinux argument.
>
> However, a single side doesn't really work well for the 5 tables,
> which among them, cover 3 different uses. Also, while vmlinux is
> larger, there is still a very wide difference between a defconfig and
> allyesconfig build, which again isn't optimally covered by a single
> size.
>
> Another aspect is the cost of elf_hash_init(), which for large tables
> dominates the runtime for small input files. It turns out that all it
> does it assign NULL, something that is required when using malloc().
> However, when we allocate memory using mmap(), we're guaranteed to get
> zero filled pages.
>
> Therefore, rewrite the whole thing to:
>
> 1) use more dynamic sized tables, depending on the input file,
> 2) avoid the need for elf_hash_init() entirely by using mmap().
>
> This speeds up a regular kernel build (100s to 98s for
> x86_64-defconfig), and potentially dramatically speeds up vmlinux
> processing.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

This patch as commit 25cf0d8aa2a3 ("objtool: Rewrite hashtable sizing")
in -tip causes a massive compile time regression with allmodconfig +
ThinLTO.

At v5.13-rc1, the performance penalty is only about 23%, as measured with
hyperfine for two runs [1]:

Benchmark #1: allmodconfig
Time (mean ± σ): 625.173 s ± 2.198 s [User: 35120.895 s, System: 2176.868 s]
Range (min … max): 623.619 s … 626.727 s 2 runs

Benchmark #2: allmodconfig with ThinLTO
Time (mean ± σ): 771.034 s ± 0.369 s [User: 39706.084 s, System: 2326.166 s]
Range (min … max): 770.773 s … 771.295 s 2 runs

Summary
'allmodconfig' ran
1.23 ± 0.00 times faster than 'allmodconfig with ThinLTO'

However, at 25cf0d8aa2a3, it is almost 150% on a 64-core server.

Benchmark #1: allmodconfig
Time (mean ± σ): 624.759 s ± 2.153 s [User: 35114.379 s, System: 2145.456 s]
Range (min … max): 623.237 s … 626.281 s 2 runs

Benchmark #2: allmodconfig with ThinLTO
Time (mean ± σ): 1555.377 s ± 12.806 s [User: 40558.463 s, System: 2310.139 s]
Range (min … max): 1546.321 s … 1564.432 s 2 runs

Summary
'allmodconfig' ran
2.49 ± 0.02 times faster than 'allmodconfig with ThinLTO'

Adding Sami because I am not sure why this patch would have much of an impact
in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
question.

If I can provide any further information or help debug, please let me know.

If you are interested in reproducing this locally, you will need a
fairly recent LLVM stack (I used the stable release/12.x branch) and to
cherry-pick commit 976aac5f8829 ("kcsan: Fix debugfs initcall return
type") to fix an unrelated build failure. My script [2] can build a
self-contained toolchain fairly quickly if you cannot get one from your
package manager. A command like below will speed up the build a bit:

$ ./build-llvm.py \
--branch "release/12.x" \
--build-stage1-only \
--install-stage1-only \
--projects "clang;lld" \
--targets X86

After adding the "install/bin" directory to PATH:

$ echo "CONFIG_GCOV_KERNEL=n
CONFIG_KASAN=n
CONFIG_LTO_CLANG_THIN=y" >allmod.config

$ make -skj"$(nproc)" LLVM=1 LLVM_IAS=1 allmodconfig all

[1]: https://github.com/sharkdp/hyperfine
[2]: https://github.com/ClangBuiltLinux/tc-build

Cheers,
Nathan

2021-06-10 18:45:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, Jun 10, 2021 at 11:14:51AM -0700, Nathan Chancellor wrote:

> This patch as commit 25cf0d8aa2a3 ("objtool: Rewrite hashtable sizing")
> in -tip causes a massive compile time regression with allmodconfig +
> ThinLTO.

Moo... the allyesconfig builds I used it on were much faster, but that
was on regular GCC vmlinux.o after linking.

> Adding Sami because I am not sure why this patch would have much of an impact
> in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
> question.
>
> If I can provide any further information or help debug, please let me know.
>
> If you are interested in reproducing this locally, you will need a
> fairly recent LLVM stack (I used the stable release/12.x branch) and to
> cherry-pick commit 976aac5f8829 ("kcsan: Fix debugfs initcall return
> type") to fix an unrelated build failure. My script [2] can build a
> self-contained toolchain fairly quickly if you cannot get one from your
> package manager. A command like below will speed up the build a bit:

Would something like llvm-13 from Debian be good enough?

$ clang-13 --version
Debian clang version 13.0.0-++20210418105309+a0898f0cecc7-1~exp1
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin

2021-06-10 18:54:22

by Sami Tolvanen

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, Jun 10, 2021 at 11:14 AM Nathan Chancellor <[email protected]> wrote:
> Adding Sami because I am not sure why this patch would have much of an impact
> in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
> question.

It's because LLVM enables -ffunction-sections with LTO, so using .text
section size to estimate the reloc hash table size isn't going to be
accurate, as confirmed by objtool output with --stats:

OBJTOOL vmlinux.o
nr_sections: 141481
section_bits: 17
nr_symbols: 215262
symbol_bits: 17
max_reloc: 24850
tot_reloc: 590890
reloc_bits: 10

Sami

2021-06-10 18:58:42

by Nathan Chancellor

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On 6/10/2021 11:43 AM, Peter Zijlstra wrote:
> On Thu, Jun 10, 2021 at 11:14:51AM -0700, Nathan Chancellor wrote:
>
>> This patch as commit 25cf0d8aa2a3 ("objtool: Rewrite hashtable sizing")
>> in -tip causes a massive compile time regression with allmodconfig +
>> ThinLTO.
>
> Moo... the allyesconfig builds I used it on were much faster, but that
> was on regular GCC vmlinux.o after linking.
>
>> Adding Sami because I am not sure why this patch would have much of an impact
>> in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
>> question.
>>
>> If I can provide any further information or help debug, please let me know.
>>
>> If you are interested in reproducing this locally, you will need a
>> fairly recent LLVM stack (I used the stable release/12.x branch) and to
>> cherry-pick commit 976aac5f8829 ("kcsan: Fix debugfs initcall return
>> type") to fix an unrelated build failure. My script [2] can build a
>> self-contained toolchain fairly quickly if you cannot get one from your
>> package manager. A command like below will speed up the build a bit:
>
> Would something like llvm-13 from Debian be good enough?
>
> $ clang-13 --version
> Debian clang version 13.0.0-++20210418105309+a0898f0cecc7-1~exp1
> Target: x86_64-pc-linux-gnu
> Thread model: posix
> InstalledDir: /usr/bin
>

Yes, that would work. That is what we use in our CI.

Looks like Sami gave a reply that explains it.

Cheers,
Nathan

2021-06-10 19:36:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, Jun 10, 2021 at 11:50:36AM -0700, Sami Tolvanen wrote:
> On Thu, Jun 10, 2021 at 11:14 AM Nathan Chancellor <[email protected]> wrote:
> > Adding Sami because I am not sure why this patch would have much of an impact
> > in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
> > question.
>
> It's because LLVM enables -ffunction-sections with LTO, so using .text
> section size to estimate the reloc hash table size isn't going to be
> accurate, as confirmed by objtool output with --stats:
>
> OBJTOOL vmlinux.o
> nr_sections: 141481
> section_bits: 17
> nr_symbols: 215262
> symbol_bits: 17
> max_reloc: 24850
> tot_reloc: 590890
> reloc_bits: 10

Bah. Would something like the *completely* untested below help with that?

---
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 25f6d293bc86..8676c7598728 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -288,6 +288,9 @@ static int read_sections(struct elf *elf)
}
sec->len = sec->sh.sh_size;

+ if (sec->sh.sh_flags & SHF_EXECINSTR)
+ elf->text_size += sec->len;
+
list_add_tail(&sec->list, &elf->sections);
elf_hash_add(section, &sec->hash, sec->idx);
elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
@@ -581,13 +584,7 @@ static int read_relocs(struct elf *elf)
unsigned int symndx;
unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;

- sec = find_section_by_name(elf, ".text");
- if (!sec) {
- WARN("no .text");
- return -1;
- }
-
- if (!elf_alloc_hash(reloc, sec->len / 16))
+ if (!elf_alloc_hash(reloc, elf->text_size / 16))
return -1;

list_for_each_entry(sec, &elf->sections, list) {
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 90082751f851..e34395047530 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -83,6 +83,7 @@ struct elf {
int fd;
bool changed;
char *name;
+ unsigned int text_size;
struct list_head sections;

int symbol_bits;

2021-06-10 19:46:26

by Sami Tolvanen

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, Jun 10, 2021 at 12:33 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 10, 2021 at 11:50:36AM -0700, Sami Tolvanen wrote:
> > On Thu, Jun 10, 2021 at 11:14 AM Nathan Chancellor <[email protected]> wrote:
> > > Adding Sami because I am not sure why this patch would have much of an impact
> > > in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
> > > question.
> >
> > It's because LLVM enables -ffunction-sections with LTO, so using .text
> > section size to estimate the reloc hash table size isn't going to be
> > accurate, as confirmed by objtool output with --stats:
> >
> > OBJTOOL vmlinux.o
> > nr_sections: 141481
> > section_bits: 17
> > nr_symbols: 215262
> > symbol_bits: 17
> > max_reloc: 24850
> > tot_reloc: 590890
> > reloc_bits: 10
>
> Bah. Would something like the *completely* untested below help with that?

Yes, that seems to work:

tot_reloc: 590890
reloc_bits: 19

Nathan, can you confirm if this fixes the regression for you?

Sami

2021-06-10 21:02:21

by Nathan Chancellor

[permalink] [raw]
Subject: Re: [PATCH 01/13] objtool: Rewrite hashtable sizing

On Thu, Jun 10, 2021 at 09:33:44PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 10, 2021 at 11:50:36AM -0700, Sami Tolvanen wrote:
> > On Thu, Jun 10, 2021 at 11:14 AM Nathan Chancellor <[email protected]> wrote:
> > > Adding Sami because I am not sure why this patch would have much of an impact
> > > in relation to LTO. https://git.kernel.org/tip/25cf0d8aa2a3 is the patch in
> > > question.
> >
> > It's because LLVM enables -ffunction-sections with LTO, so using .text
> > section size to estimate the reloc hash table size isn't going to be
> > accurate, as confirmed by objtool output with --stats:
> >
> > OBJTOOL vmlinux.o
> > nr_sections: 141481
> > section_bits: 17
> > nr_symbols: 215262
> > symbol_bits: 17
> > max_reloc: 24850
> > tot_reloc: 590890
> > reloc_bits: 10
>
> Bah. Would something like the *completely* untested below help with that?

LGTM, thanks for the quick fix!

Benchmark #1: allmodconfig
Time (mean ± σ): 624.555 s ± 2.089 s [User: 35109.967 s, System: 2146.215 s]
Range (min … max): 623.078 s … 626.032 s 2 runs

Benchmark #2: allmodconfig with ThinLTO
Time (mean ± σ): 769.959 s ± 1.819 s [User: 39692.409 s, System: 2308.010 s]
Range (min … max): 768.673 s … 771.245 s 2 runs

Summary
'allmodconfig' ran
1.23 ± 0.01 times faster than 'allmodconfig with ThinLTO'

Tested-by: Nathan Chancellor <[email protected]>

> ---
> diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
> index 25f6d293bc86..8676c7598728 100644
> --- a/tools/objtool/elf.c
> +++ b/tools/objtool/elf.c
> @@ -288,6 +288,9 @@ static int read_sections(struct elf *elf)
> }
> sec->len = sec->sh.sh_size;
>
> + if (sec->sh.sh_flags & SHF_EXECINSTR)
> + elf->text_size += sec->len;
> +
> list_add_tail(&sec->list, &elf->sections);
> elf_hash_add(section, &sec->hash, sec->idx);
> elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
> @@ -581,13 +584,7 @@ static int read_relocs(struct elf *elf)
> unsigned int symndx;
> unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;
>
> - sec = find_section_by_name(elf, ".text");
> - if (!sec) {
> - WARN("no .text");
> - return -1;
> - }
> -
> - if (!elf_alloc_hash(reloc, sec->len / 16))
> + if (!elf_alloc_hash(reloc, elf->text_size / 16))
> return -1;
>
> list_for_each_entry(sec, &elf->sections, list) {
> diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
> index 90082751f851..e34395047530 100644
> --- a/tools/objtool/include/objtool/elf.h
> +++ b/tools/objtool/include/objtool/elf.h
> @@ -83,6 +83,7 @@ struct elf {
> int fd;
> bool changed;
> char *name;
> + unsigned int text_size;
> struct list_head sections;
>
> int symbol_bits;

Subject: [tip: objtool/core] objtool: Improve reloc hash size guestimate

The following commit has been merged into the objtool/core branch of tip:

Commit-ID: d33b9035e14a35f6f2a5f067f0b156a93581811d
Gitweb: https://git.kernel.org/tip/d33b9035e14a35f6f2a5f067f0b156a93581811d
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 11 Jun 2021 08:33:36 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 14 Jun 2021 14:05:36 +02:00

objtool: Improve reloc hash size guestimate

Nathan reported that LLVM ThinLTO builds have a performance regression
with commit 25cf0d8aa2a3 ("objtool: Rewrite hashtable sizing"). Sami
was quick to note that this is due to their use of -ffunction-sections.

As a result the .text section is small and basing the number of relocs
off of that no longer works. Instead have read_sections() compute the
sum of all SHF_EXECINSTR sections and use that.

Fixes: 25cf0d8aa2a3 ("objtool: Rewrite hashtable sizing")
Reported-by: Nathan Chancellor <[email protected]>
Debugged-by: Sami Tolvanen <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Nathan Chancellor <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
tools/objtool/elf.c | 11 ++++-------
tools/objtool/include/objtool/elf.h | 1 +
2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index a8a0ee2..2371ccc 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -288,6 +288,9 @@ static int read_sections(struct elf *elf)
}
sec->len = sec->sh.sh_size;

+ if (sec->sh.sh_flags & SHF_EXECINSTR)
+ elf->text_size += sec->len;
+
list_add_tail(&sec->list, &elf->sections);
elf_hash_add(section, &sec->hash, sec->idx);
elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name));
@@ -581,13 +584,7 @@ static int read_relocs(struct elf *elf)
unsigned int symndx;
unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;

- sec = find_section_by_name(elf, ".text");
- if (!sec) {
- WARN("no .text");
- return -1;
- }
-
- if (!elf_alloc_hash(reloc, sec->len / 16))
+ if (!elf_alloc_hash(reloc, elf->text_size / 16))
return -1;

list_for_each_entry(sec, &elf->sections, list) {
diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h
index 9008275..e343950 100644
--- a/tools/objtool/include/objtool/elf.h
+++ b/tools/objtool/include/objtool/elf.h
@@ -83,6 +83,7 @@ struct elf {
int fd;
bool changed;
char *name;
+ unsigned int text_size;
struct list_head sections;

int symbol_bits;