2023-09-09 21:25:49

by pengdonglin

Subject: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

Currently, we are only using the linear search method to find the type id
by the name, which has a time complexity of O(n). This change involves
sorting the names of btf types in ascending order and using binary search,
which has a time complexity of O(log(n)). This idea was inspired by the
following patch:

60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").

At present, this improvement is only for searching in vmlinux's and
module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.

Another change is the search direction: we now search the BTF itself first and
then its base, and the type id of the first matching btf_type is returned.

Here is a timing result for finding the type ids of all 67,819 kernel
functions in vmlinux's BTF by their names:

Before: 17000 ms
After: 10 ms

The average lookup performance has improved by about 1700x in the above scenario.

However, this change consumes more memory; for example, the entries for 67,819
kernel functions take about 530KB of memory.

Signed-off-by: Donglin Peng <[email protected]>
---
Changes in RFC v2:
- Fix the build issue reported by kernel test robot <[email protected]>
---
include/linux/btf.h | 1 +
kernel/bpf/btf.c | 300 ++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 291 insertions(+), 10 deletions(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index cac9f304e27a..6260a0668773 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -201,6 +201,7 @@ bool btf_is_kernel(const struct btf *btf);
bool btf_is_module(const struct btf *btf);
struct module *btf_try_get_module(const struct btf *btf);
u32 btf_nr_types(const struct btf *btf);
+u32 btf_type_cnt(const struct btf *btf);
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 817204d53372..51aa9f27853b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -240,6 +240,26 @@ struct btf_id_dtor_kfunc_tab {
struct btf_id_dtor_kfunc dtors[];
};

+enum {
+ BTF_ID_NAME_FUNC, /* function */
+ BTF_ID_NAME_STRUCT, /* struct */
+ BTF_ID_NAME_MAX
+};
+
+struct btf_id_name {
+ int id;
+ u32 name_off;
+};
+
+struct btf_id_name_map {
+ struct btf_id_name *id_name;
+ u32 count;
+};
+
+struct btf_id_name_maps {
+ struct btf_id_name_map map[BTF_ID_NAME_MAX];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -257,6 +277,7 @@ struct btf {
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
struct btf_struct_metas *struct_meta_tab;
+ struct btf_id_name_maps *id_name_maps;

/* split BTF support */
struct btf *base_btf;
@@ -532,22 +553,142 @@ u32 btf_nr_types(const struct btf *btf)
return total;
}

+u32 btf_type_cnt(const struct btf *btf)
+{
+ return btf->start_id + btf->nr_types;
+}
+
+static inline u8 btf_id_name_idx_to_kind(int index)
+{
+ u8 kind;
+
+ switch (index) {
+ case BTF_ID_NAME_FUNC:
+ kind = BTF_KIND_FUNC;
+ break;
+ case BTF_ID_NAME_STRUCT:
+ kind = BTF_KIND_STRUCT;
+ break;
+ default:
+ kind = BTF_KIND_UNKN;
+ break;
+ }
+
+ return kind;
+}
+
+static inline int btf_id_name_kind_to_idx(u8 kind)
+{
+ int index;
+
+ switch (kind) {
+ case BTF_KIND_FUNC:
+ index = BTF_ID_NAME_FUNC;
+ break;
+ case BTF_KIND_STRUCT:
+ index = BTF_ID_NAME_STRUCT;
+ break;
+ default:
+ index = -1;
+ break;
+ }
+
+ return index;
+}
+
+static s32 btf_find_by_name_bsearch(struct btf_id_name *id_name,
+ u32 size, const char *name,
+ struct btf_id_name **start,
+ struct btf_id_name **end,
+ const struct btf *btf)
+{
+ int ret;
+ int low, mid, high;
+ const char *name_buf;
+
+ low = 0;
+ high = size - 1;
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ name_buf = btf_name_by_offset(btf, id_name[mid].name_off);
+ ret = strcmp(name, name_buf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
+ }
+
+ if (low > high)
+ return -ESRCH;
+
+ if (start) {
+ low = mid;
+ while (low) {
+ name_buf = btf_name_by_offset(btf, id_name[low-1].name_off);
+ if (strcmp(name, name_buf))
+ break;
+ low--;
+ }
+ *start = &id_name[low];
+ }
+
+ if (end) {
+ high = mid;
+ while (high < size - 1) {
+ name_buf = btf_name_by_offset(btf, id_name[high+1].name_off);
+ if (strcmp(name, name_buf))
+ break;
+ high++;
+ }
+ *end = &id_name[high];
+ }
+
+ return id_name[mid].id;
+}
+
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
+ const struct btf_id_name_maps *maps;
+ const struct btf_id_name_map *map;
+ struct btf_id_name *start;
const struct btf_type *t;
const char *tname;
- u32 i, total;
+ int index = btf_id_name_kind_to_idx(kind);
+ s32 id, total;

- total = btf_nr_types(btf);
- for (i = 1; i < total; i++) {
- t = btf_type_by_id(btf, i);
- if (BTF_INFO_KIND(t->info) != kind)
- continue;
+ do {
+ maps = btf->id_name_maps;
+ if (index >= 0 && maps && maps->map[index].id_name) {
+ /* binary search */
+ map = &maps->map[index];
+ id = btf_find_by_name_bsearch(map->id_name,
+ map->count, name, &start, NULL, btf);
+ if (id > 0) {
+ /*
+ * Return the first one that
+ * matched
+ */
+ return start->id;
+ }
+ } else {
+ /* linear search */
+ total = btf_type_cnt(btf);
+ for (id = btf->start_id; id < total; id++) {
+ t = btf_type_by_id(btf, id);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, name))
+ return id;
+ }
+ }

- tname = btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, name))
- return i;
- }
+ btf = btf->base_btf;
+ } while (btf);

return -ENOENT;
}
@@ -1639,6 +1780,32 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}

+static void btf_destroy_id_name(struct btf *btf, int index)
+{
+ struct btf_id_name_maps *maps = btf->id_name_maps;
+ struct btf_id_name_map *map = &maps->map[index];
+
+ if (map->id_name) {
+ kvfree(map->id_name);
+ map->id_name = NULL;
+ map->count = 0;
+ }
+}
+
+static void btf_destroy_id_name_map(struct btf *btf)
+{
+ int i;
+
+ if (!btf->id_name_maps)
+ return;
+
+ for (i = 0; i < BTF_ID_NAME_MAX; i++)
+ btf_destroy_id_name(btf, i);
+
+ kfree(btf->id_name_maps);
+ btf->id_name_maps = NULL;
+}
+
static void btf_free_kfunc_set_tab(struct btf *btf)
{
struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
@@ -1689,6 +1856,7 @@ static void btf_free_struct_meta_tab(struct btf *btf)

static void btf_free(struct btf *btf)
{
+ btf_destroy_id_name_map(btf);
btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
@@ -5713,6 +5881,107 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
return kctx_type_id;
}

+static int btf_compare_id_name(const void *a, const void *b, const void *priv)
+{
+ const struct btf_id_name *ia = (const struct btf_id_name *)a;
+ const struct btf_id_name *ib = (const struct btf_id_name *)b;
+ const struct btf *btf = priv;
+ int ret;
+
+ /*
+ * Sort names in ascending order, if the name is same, sort ids in
+ * ascending order.
+ */
+ ret = strcmp(btf_name_by_offset(btf, ia->name_off),
+ btf_name_by_offset(btf, ib->name_off));
+ if (!ret)
+ ret = ia->id - ib->id;
+
+ return ret;
+}
+
+static int btf_create_id_name(struct btf *btf, int index)
+{
+ struct btf_id_name_maps *maps = btf->id_name_maps;
+ struct btf_id_name_map *map = &maps->map[index];
+ const struct btf_type *t;
+ struct btf_id_name *id_name;
+ const char *name;
+ int i, j = 0;
+ u32 total, count = 0;
+ u8 kind;
+
+ kind = btf_id_name_idx_to_kind(index);
+ if (kind == BTF_KIND_UNKN)
+ return -EINVAL;
+
+ if (map->id_name || map->count != 0)
+ return -EINVAL;
+
+ total = btf_type_cnt(btf);
+ for (i = btf->start_id; i < total; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+ name = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(name))
+ continue;
+ count++;
+ }
+
+ if (count == 0)
+ return 0;
+
+ id_name = kvcalloc(count, sizeof(struct btf_id_name),
+ GFP_KERNEL);
+ if (!id_name)
+ return -ENOMEM;
+
+ for (i = btf->start_id; i < total; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+ name = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(name))
+ continue;
+
+ id_name[j].id = i;
+ id_name[j].name_off = t->name_off;
+ j++;
+ }
+
+ sort_r(id_name, count, sizeof(id_name[0]), btf_compare_id_name,
+ NULL, btf);
+
+ map->id_name = id_name;
+ map->count = count;
+
+ return 0;
+}
+
+static int btf_create_id_name_map(struct btf *btf)
+{
+ int err, i;
+ struct btf_id_name_maps *maps;
+
+ if (btf->id_name_maps)
+ return -EBUSY;
+
+ maps = kzalloc(sizeof(struct btf_id_name_maps), GFP_KERNEL);
+ if (!maps)
+ return -ENOMEM;
+
+ btf->id_name_maps = maps;
+
+ for (i = 0; i < BTF_ID_NAME_MAX; i++) {
+ err = btf_create_id_name(btf, i);
+ if (err < 0)
+ break;
+ }
+
+ return err;
+}
+
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)

@@ -5760,6 +6029,10 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;

+ err = btf_create_id_name_map(btf);
+ if (err)
+ goto errout;
+
/* btf_parse_vmlinux() runs under bpf_verifier_lock */
bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);

@@ -5777,6 +6050,7 @@ struct btf *btf_parse_vmlinux(void)
errout:
btf_verifier_env_free(env);
if (btf) {
+ btf_destroy_id_name_map(btf);
kvfree(btf->types);
kfree(btf);
}
@@ -5844,13 +6118,19 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;

+ err = btf_create_id_name_map(btf);
+ if (err)
+ goto errout;
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
+
return btf;

errout:
btf_verifier_env_free(env);
if (btf) {
+ btf_destroy_id_name_map(btf);
kvfree(btf->data);
kvfree(btf->types);
kfree(btf);
--
2.25.1


2023-09-12 05:02:08

by pengdonglin

Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/9 20:29, Masami Hiramatsu (Google) wrote:
> On Sat, 9 Sep 2023 02:16:46 -0700
> Donglin Peng <[email protected]> wrote:
>
>> Currently, we are only using the linear search method to find the type id
>> by the name, which has a time complexity of O(n). This change involves
>> sorting the names of btf types in ascending order and using binary search,
>> which has a time complexity of O(log(n)). This idea was inspired by the
>> following patch:
>>
>> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>>
>> At present, this improvement is only for searching in vmlinux's and
>> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>>
>> Another change is the search direction, where we search the BTF first and
>> then its base, the type id of the first matched btf_type will be returned.
>>
>> Here is a time-consuming result that finding all the type ids of 67,819 kernel
>> functions in vmlinux's BTF by their names:
>>
>> Before: 17000 ms
>> After: 10 ms
>
> Nice work!

Thank you

>
>>
>> The average lookup performance has improved about 1700x at the above scenario.
>>
>> However, this change will consume more memory, for example, 67,819 kernel
>> functions will allocate about 530KB memory.
>
> I'm not so familiar with how the BTF is generated, what about making this
> list offline? Since BTF is static data, it is better to make the map when

The BTF is generated by pahole during the building of the kernel or modules.
Pahole is maintained in the project https://github.com/acmel/dwarves.
The log
printed by scripts/link-vmlinux.sh when generating BTF for vmlinux is as
follows:

LLVM_OBJCOPY=objcopy pahole -J --skip_encoding_btf_vars --btf_gen_floats
.tmp_vmlinux.btf

If the list were made offline, pahole would need to be modified or a new tool
would need to be written and maintained in the kernel tree. Therefore, it may
be simpler to build the list at runtime.

> it is built. And I also would like to suggest to make a new map to make
> another new map which maps the BTF ID and the address of the function, so
> that we can do binary search the BTF object from the function address.
> (The latter map should be built when CONFIG_BTF_ADDR_MAP=y)

It has been observed that two or more functions may have the same address
but different IDs. For example:

ffffffff81218370 t __do_sys_getuid16
ffffffff81218370 T __ia32_sys_getuid16
ffffffff81218370 T __x64_sys_getuid16

{
"id": 27911,
"kind": "FUNC",
"name": "__do_sys_getuid16",
"type_id": 4455,
"linkage": "static"
},{
"id": 20303,
"kind": "FUNC",
"name": "__ia32_sys_getuid16",
"type_id": 4455,
"linkage": "static"
},{
"id": 20304,
"kind": "FUNC",
"name": "__x64_sys_getuid16",
"type_id": 4455,
"linkage": "static"
},

It may be an issue which id to return. However, if only the FUNC_PROTO is
of concern, any one of them can be returned.

It may not be necessary to create a new list for function addresses because
the id_name map can be reused for this purpose. Here is an idea:

1. Use the function address to get the name by calling the function
sprint_symbol_no_offset.

2. Then call the function btf_find_by_name_kind to get the BTF ID.

Both sprint_symbol_no_offset and btf_find_by_name_kind use binary search.
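
As a rough sketch (not part of the patch; the helper name is made up for
illustration), the two-step lookup could look like this, assuming vmlinux
BTF has already been parsed successfully:

#include <linux/btf.h>
#include <linux/kallsyms.h>

static s32 btf_id_by_func_addr(const struct btf *btf, unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];

	/* step 1: address -> symbol name, binary search inside kallsyms */
	sprint_symbol_no_offset(sym, addr);

	/* step 2: name -> BTF id, binary search in the sorted id_name map */
	return btf_find_by_name_kind(btf, sym, BTF_KIND_FUNC);
}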

>
> Thank you,
>
>>
>> Signed-off-by: Donglin Peng <[email protected]>
>> ---
>> Changes in RFC v2:
>> - Fix the build issue reported by kernel test robot <[email protected]>
>> ---
>> include/linux/btf.h | 1 +
>> kernel/bpf/btf.c | 300 ++++++++++++++++++++++++++++++++++++++++++--
>> 2 files changed, 291 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/btf.h b/include/linux/btf.h
>> index cac9f304e27a..6260a0668773 100644
>> --- a/include/linux/btf.h
>> +++ b/include/linux/btf.h
>> @@ -201,6 +201,7 @@ bool btf_is_kernel(const struct btf *btf);
>> bool btf_is_module(const struct btf *btf);
>> struct module *btf_try_get_module(const struct btf *btf);
>> u32 btf_nr_types(const struct btf *btf);
>> +u32 btf_type_cnt(const struct btf *btf);
>> bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
>> const struct btf_member *m,
>> u32 expected_offset, u32 expected_size);
>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>> index 817204d53372..51aa9f27853b 100644
>> --- a/kernel/bpf/btf.c
>> +++ b/kernel/bpf/btf.c
>> @@ -240,6 +240,26 @@ struct btf_id_dtor_kfunc_tab {
>> struct btf_id_dtor_kfunc dtors[];
>> };
>>
>> +enum {
>> + BTF_ID_NAME_FUNC, /* function */
>> + BTF_ID_NAME_STRUCT, /* struct */
>> + BTF_ID_NAME_MAX
>> +};
>> +
>> +struct btf_id_name {
>> + int id;
>> + u32 name_off;
>> +};
>> +
>> +struct btf_id_name_map {
>> + struct btf_id_name *id_name;
>> + u32 count;
>> +};
>> +
>> +struct btf_id_name_maps {
>> + struct btf_id_name_map map[BTF_ID_NAME_MAX];
>> +};
>> +
>> struct btf {
>> void *data;
>> struct btf_type **types;
>> @@ -257,6 +277,7 @@ struct btf {
>> struct btf_kfunc_set_tab *kfunc_set_tab;
>> struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
>> struct btf_struct_metas *struct_meta_tab;
>> + struct btf_id_name_maps *id_name_maps;
>>
>> /* split BTF support */
>> struct btf *base_btf;
>> @@ -532,22 +553,142 @@ u32 btf_nr_types(const struct btf *btf)
>> return total;
>> }
>>
>> +u32 btf_type_cnt(const struct btf *btf)
>> +{
>> + return btf->start_id + btf->nr_types;
>> +}
>> +
>> +static inline u8 btf_id_name_idx_to_kind(int index)
>> +{
>> + u8 kind;
>> +
>> + switch (index) {
>> + case BTF_ID_NAME_FUNC:
>> + kind = BTF_KIND_FUNC;
>> + break;
>> + case BTF_ID_NAME_STRUCT:
>> + kind = BTF_KIND_STRUCT;
>> + break;
>> + default:
>> + kind = BTF_KIND_UNKN;
>> + break;
>> + }
>> +
>> + return kind;
>> +}
>> +
>> +static inline int btf_id_name_kind_to_idx(u8 kind)
>> +{
>> + int index;
>> +
>> + switch (kind) {
>> + case BTF_KIND_FUNC:
>> + index = BTF_ID_NAME_FUNC;
>> + break;
>> + case BTF_KIND_STRUCT:
>> + index = BTF_ID_NAME_STRUCT;
>> + break;
>> + default:
>> + index = -1;
>> + break;
>> + }
>> +
>> + return index;
>> +}
>> +
>> +static s32 btf_find_by_name_bsearch(struct btf_id_name *id_name,
>> + u32 size, const char *name,
>> + struct btf_id_name **start,
>> + struct btf_id_name **end,
>> + const struct btf *btf)
>> +{
>> + int ret;
>> + int low, mid, high;
>> + const char *name_buf;
>> +
>> + low = 0;
>> + high = size - 1;
>> +
>> + while (low <= high) {
>> + mid = low + (high - low) / 2;
>> + name_buf = btf_name_by_offset(btf, id_name[mid].name_off);
>> + ret = strcmp(name, name_buf);
>> + if (ret > 0)
>> + low = mid + 1;
>> + else if (ret < 0)
>> + high = mid - 1;
>> + else
>> + break;
>> + }
>> +
>> + if (low > high)
>> + return -ESRCH;
>> +
>> + if (start) {
>> + low = mid;
>> + while (low) {
>> + name_buf = btf_name_by_offset(btf, id_name[low-1].name_off);
>> + if (strcmp(name, name_buf))
>> + break;
>> + low--;
>> + }
>> + *start = &id_name[low];
>> + }
>> +
>> + if (end) {
>> + high = mid;
>> + while (high < size - 1) {
>> + name_buf = btf_name_by_offset(btf, id_name[high+1].name_off);
>> + if (strcmp(name, name_buf))
>> + break;
>> + high++;
>> + }
>> + *end = &id_name[high];
>> + }
>> +
>> + return id_name[mid].id;
>> +}
>> +
>> s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
>> {
>> + const struct btf_id_name_maps *maps;
>> + const struct btf_id_name_map *map;
>> + struct btf_id_name *start;
>> const struct btf_type *t;
>> const char *tname;
>> - u32 i, total;
>> + int index = btf_id_name_kind_to_idx(kind);
>> + s32 id, total;
>>
>> - total = btf_nr_types(btf);
>> - for (i = 1; i < total; i++) {
>> - t = btf_type_by_id(btf, i);
>> - if (BTF_INFO_KIND(t->info) != kind)
>> - continue;
>> + do {
>> + maps = btf->id_name_maps;
>> + if (index >= 0 && maps && maps->map[index].id_name) {
>> + /* binary search */
>> + map = &maps->map[index];
>> + id = btf_find_by_name_bsearch(map->id_name,
>> + map->count, name, &start, NULL, btf);
>> + if (id > 0) {
>> + /*
>> + * Return the first one that
>> + * matched
>> + */
>> + return start->id;
>> + }
>> + } else {
>> + /* linear search */
>> + total = btf_type_cnt(btf);
>> + for (id = btf->start_id; id < total; id++) {
>> + t = btf_type_by_id(btf, id);
>> + if (BTF_INFO_KIND(t->info) != kind)
>> + continue;
>> +
>> + tname = btf_name_by_offset(btf, t->name_off);
>> + if (!strcmp(tname, name))
>> + return id;
>> + }
>> + }
>>
>> - tname = btf_name_by_offset(btf, t->name_off);
>> - if (!strcmp(tname, name))
>> - return i;
>> - }
>> + btf = btf->base_btf;
>> + } while (btf);
>>
>> return -ENOENT;
>> }
>> @@ -1639,6 +1780,32 @@ static void btf_free_id(struct btf *btf)
>> spin_unlock_irqrestore(&btf_idr_lock, flags);
>> }
>>
>> +static void btf_destroy_id_name(struct btf *btf, int index)
>> +{
>> + struct btf_id_name_maps *maps = btf->id_name_maps;
>> + struct btf_id_name_map *map = &maps->map[index];
>> +
>> + if (map->id_name) {
>> + kvfree(map->id_name);
>> + map->id_name = NULL;
>> + map->count = 0;
>> + }
>> +}
>> +
>> +static void btf_destroy_id_name_map(struct btf *btf)
>> +{
>> + int i;
>> +
>> + if (!btf->id_name_maps)
>> + return;
>> +
>> + for (i = 0; i < BTF_ID_NAME_MAX; i++)
>> + btf_destroy_id_name(btf, i);
>> +
>> + kfree(btf->id_name_maps);
>> + btf->id_name_maps = NULL;
>> +}
>> +
>> static void btf_free_kfunc_set_tab(struct btf *btf)
>> {
>> struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
>> @@ -1689,6 +1856,7 @@ static void btf_free_struct_meta_tab(struct btf *btf)
>>
>> static void btf_free(struct btf *btf)
>> {
>> + btf_destroy_id_name_map(btf);
>> btf_free_struct_meta_tab(btf);
>> btf_free_dtor_kfunc_tab(btf);
>> btf_free_kfunc_set_tab(btf);
>> @@ -5713,6 +5881,107 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
>> return kctx_type_id;
>> }
>>
>> +static int btf_compare_id_name(const void *a, const void *b, const void *priv)
>> +{
>> + const struct btf_id_name *ia = (const struct btf_id_name *)a;
>> + const struct btf_id_name *ib = (const struct btf_id_name *)b;
>> + const struct btf *btf = priv;
>> + int ret;
>> +
>> + /*
>> + * Sort names in ascending order, if the name is same, sort ids in
>> + * ascending order.
>> + */
>> + ret = strcmp(btf_name_by_offset(btf, ia->name_off),
>> + btf_name_by_offset(btf, ib->name_off));
>> + if (!ret)
>> + ret = ia->id - ib->id;
>> +
>> + return ret;
>> +}
>> +
>> +static int btf_create_id_name(struct btf *btf, int index)
>> +{
>> + struct btf_id_name_maps *maps = btf->id_name_maps;
>> + struct btf_id_name_map *map = &maps->map[index];
>> + const struct btf_type *t;
>> + struct btf_id_name *id_name;
>> + const char *name;
>> + int i, j = 0;
>> + u32 total, count = 0;
>> + u8 kind;
>> +
>> + kind = btf_id_name_idx_to_kind(index);
>> + if (kind == BTF_KIND_UNKN)
>> + return -EINVAL;
>> +
>> + if (map->id_name || map->count != 0)
>> + return -EINVAL;
>> +
>> + total = btf_type_cnt(btf);
>> + for (i = btf->start_id; i < total; i++) {
>> + t = btf_type_by_id(btf, i);
>> + if (BTF_INFO_KIND(t->info) != kind)
>> + continue;
>> + name = btf_name_by_offset(btf, t->name_off);
>> + if (str_is_empty(name))
>> + continue;
>> + count++;
>> + }
>> +
>> + if (count == 0)
>> + return 0;
>> +
>> + id_name = kvcalloc(count, sizeof(struct btf_id_name),
>> + GFP_KERNEL);
>> + if (!id_name)
>> + return -ENOMEM;
>> +
>> + for (i = btf->start_id; i < total; i++) {
>> + t = btf_type_by_id(btf, i);
>> + if (BTF_INFO_KIND(t->info) != kind)
>> + continue;
>> + name = btf_name_by_offset(btf, t->name_off);
>> + if (str_is_empty(name))
>> + continue;
>> +
>> + id_name[j].id = i;
>> + id_name[j].name_off = t->name_off;
>> + j++;
>> + }
>> +
>> + sort_r(id_name, count, sizeof(id_name[0]), btf_compare_id_name,
>> + NULL, btf);
>> +
>> + map->id_name = id_name;
>> + map->count = count;
>> +
>> + return 0;
>> +}
>> +
>> +static int btf_create_id_name_map(struct btf *btf)
>> +{
>> + int err, i;
>> + struct btf_id_name_maps *maps;
>> +
>> + if (btf->id_name_maps)
>> + return -EBUSY;
>> +
>> + maps = kzalloc(sizeof(struct btf_id_name_maps), GFP_KERNEL);
>> + if (!maps)
>> + return -ENOMEM;
>> +
>> + btf->id_name_maps = maps;
>> +
>> + for (i = 0; i < BTF_ID_NAME_MAX; i++) {
>> + err = btf_create_id_name(btf, i);
>> + if (err < 0)
>> + break;
>> + }
>> +
>> + return err;
>> +}
>> +
>> BTF_ID_LIST(bpf_ctx_convert_btf_id)
>> BTF_ID(struct, bpf_ctx_convert)
>>
>> @@ -5760,6 +6029,10 @@ struct btf *btf_parse_vmlinux(void)
>> if (err)
>> goto errout;
>>
>> + err = btf_create_id_name_map(btf);
>> + if (err)
>> + goto errout;
>> +
>> /* btf_parse_vmlinux() runs under bpf_verifier_lock */
>> bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
>>
>> @@ -5777,6 +6050,7 @@ struct btf *btf_parse_vmlinux(void)
>> errout:
>> btf_verifier_env_free(env);
>> if (btf) {
>> + btf_destroy_id_name_map(btf);
>> kvfree(btf->types);
>> kfree(btf);
>> }
>> @@ -5844,13 +6118,19 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
>> if (err)
>> goto errout;
>>
>> + err = btf_create_id_name_map(btf);
>> + if (err)
>> + goto errout;
>> +
>> btf_verifier_env_free(env);
>> refcount_set(&btf->refcnt, 1);
>> +
>> return btf;
>>
>> errout:
>> btf_verifier_env_free(env);
>> if (btf) {
>> + btf_destroy_id_name_map(btf);
>> kvfree(btf->data);
>> kvfree(btf->types);
>> kfree(btf);
>> --
>> 2.25.1
>>
>
>

2023-09-12 15:05:17

by Eduard Zingerman

Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> Currently, we are only using the linear search method to find the type id
> by the name, which has a time complexity of O(n). This change involves
> sorting the names of btf types in ascending order and using binary search,
> which has a time complexity of O(log(n)). This idea was inspired by the
> following patch:
>
> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>
> At present, this improvement is only for searching in vmlinux's and
> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>
> Another change is the search direction, where we search the BTF first and
> then its base, the type id of the first matched btf_type will be returned.
>
> Here is a time-consuming result that finding all the type ids of 67,819 kernel
> functions in vmlinux's BTF by their names:
>
> Before: 17000 ms
> After: 10 ms
>
> The average lookup performance has improved about 1700x at the above scenario.
>
> However, this change will consume more memory, for example, 67,819 kernel
> functions will allocate about 530KB memory.

Hi Donglin,

I think this is a good improvement. However, I wonder, why did you
choose to have a separate name map for each BTF kind?

I did some analysis for my local testing kernel config and got such numbers:
- total number of BTF objects: 97350
- number of FUNC and STRUCT objects: 51597
- number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
(these are all kinds for which lookup by name might make sense)
- number of named objects: 54246
- number of name collisions:
  - unique names: 53985 counts
  - 2 objects with the same name: 129 counts
  - 3 objects with the same name: 3 counts

So, it appears that having a single map for all named objects makes
sense and would also simplify the implementation, what do you think?

Thanks,
Eduard

>
> Signed-off-by: Donglin Peng <[email protected]>
> ---
> Changes in RFC v2:
> - Fix the build issue reported by kernel test robot <[email protected]>
> ---
> include/linux/btf.h | 1 +
> kernel/bpf/btf.c | 300 ++++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 291 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index cac9f304e27a..6260a0668773 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -201,6 +201,7 @@ bool btf_is_kernel(const struct btf *btf);
> bool btf_is_module(const struct btf *btf);
> struct module *btf_try_get_module(const struct btf *btf);
> u32 btf_nr_types(const struct btf *btf);
> +u32 btf_type_cnt(const struct btf *btf);
> bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
> const struct btf_member *m,
> u32 expected_offset, u32 expected_size);
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 817204d53372..51aa9f27853b 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -240,6 +240,26 @@ struct btf_id_dtor_kfunc_tab {
> struct btf_id_dtor_kfunc dtors[];
> };
>
> +enum {
> + BTF_ID_NAME_FUNC, /* function */
> + BTF_ID_NAME_STRUCT, /* struct */
> + BTF_ID_NAME_MAX
> +};
> +
> +struct btf_id_name {
> + int id;
> + u32 name_off;
> +};
> +
> +struct btf_id_name_map {
> + struct btf_id_name *id_name;
> + u32 count;
> +};
> +
> +struct btf_id_name_maps {
> + struct btf_id_name_map map[BTF_ID_NAME_MAX];
> +};
> +
> struct btf {
> void *data;
> struct btf_type **types;
> @@ -257,6 +277,7 @@ struct btf {
> struct btf_kfunc_set_tab *kfunc_set_tab;
> struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
> struct btf_struct_metas *struct_meta_tab;
> + struct btf_id_name_maps *id_name_maps;
>
> /* split BTF support */
> struct btf *base_btf;
> @@ -532,22 +553,142 @@ u32 btf_nr_types(const struct btf *btf)
> return total;
> }
>
> +u32 btf_type_cnt(const struct btf *btf)
> +{
> + return btf->start_id + btf->nr_types;
> +}
> +
> +static inline u8 btf_id_name_idx_to_kind(int index)
> +{
> + u8 kind;
> +
> + switch (index) {
> + case BTF_ID_NAME_FUNC:
> + kind = BTF_KIND_FUNC;
> + break;
> + case BTF_ID_NAME_STRUCT:
> + kind = BTF_KIND_STRUCT;
> + break;
> + default:
> + kind = BTF_KIND_UNKN;
> + break;
> + }
> +
> + return kind;
> +}
> +
> +static inline int btf_id_name_kind_to_idx(u8 kind)
> +{
> + int index;
> +
> + switch (kind) {
> + case BTF_KIND_FUNC:
> + index = BTF_ID_NAME_FUNC;
> + break;
> + case BTF_KIND_STRUCT:
> + index = BTF_ID_NAME_STRUCT;
> + break;
> + default:
> + index = -1;
> + break;
> + }
> +
> + return index;
> +}
> +
> +static s32 btf_find_by_name_bsearch(struct btf_id_name *id_name,
> + u32 size, const char *name,
> + struct btf_id_name **start,
> + struct btf_id_name **end,
> + const struct btf *btf)
> +{
> + int ret;
> + int low, mid, high;
> + const char *name_buf;
> +
> + low = 0;
> + high = size - 1;
> +
> + while (low <= high) {
> + mid = low + (high - low) / 2;
> + name_buf = btf_name_by_offset(btf, id_name[mid].name_off);
> + ret = strcmp(name, name_buf);
> + if (ret > 0)
> + low = mid + 1;
> + else if (ret < 0)
> + high = mid - 1;
> + else
> + break;
> + }
> +
> + if (low > high)
> + return -ESRCH;
> +
> + if (start) {
> + low = mid;
> + while (low) {
> + name_buf = btf_name_by_offset(btf, id_name[low-1].name_off);
> + if (strcmp(name, name_buf))
> + break;
> + low--;
> + }
> + *start = &id_name[low];
> + }
> +
> + if (end) {
> + high = mid;
> + while (high < size - 1) {
> + name_buf = btf_name_by_offset(btf, id_name[high+1].name_off);
> + if (strcmp(name, name_buf))
> + break;
> + high++;
> + }
> + *end = &id_name[high];
> + }
> +
> + return id_name[mid].id;
> +}
> +
> s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
> {
> + const struct btf_id_name_maps *maps;
> + const struct btf_id_name_map *map;
> + struct btf_id_name *start;
> const struct btf_type *t;
> const char *tname;
> - u32 i, total;
> + int index = btf_id_name_kind_to_idx(kind);
> + s32 id, total;
>
> - total = btf_nr_types(btf);
> - for (i = 1; i < total; i++) {
> - t = btf_type_by_id(btf, i);
> - if (BTF_INFO_KIND(t->info) != kind)
> - continue;
> + do {
> + maps = btf->id_name_maps;
> + if (index >= 0 && maps && maps->map[index].id_name) {
> + /* binary search */
> + map = &maps->map[index];
> + id = btf_find_by_name_bsearch(map->id_name,
> + map->count, name, &start, NULL, btf);
> + if (id > 0) {
> + /*
> + * Return the first one that
> + * matched
> + */
> + return start->id;
> + }
> + } else {
> + /* linear search */
> + total = btf_type_cnt(btf);
> + for (id = btf->start_id; id < total; id++) {
> + t = btf_type_by_id(btf, id);
> + if (BTF_INFO_KIND(t->info) != kind)
> + continue;
> +
> + tname = btf_name_by_offset(btf, t->name_off);
> + if (!strcmp(tname, name))
> + return id;
> + }
> + }
>
> - tname = btf_name_by_offset(btf, t->name_off);
> - if (!strcmp(tname, name))
> - return i;
> - }
> + btf = btf->base_btf;
> + } while (btf);
>
> return -ENOENT;
> }
> @@ -1639,6 +1780,32 @@ static void btf_free_id(struct btf *btf)
> spin_unlock_irqrestore(&btf_idr_lock, flags);
> }
>
> +static void btf_destroy_id_name(struct btf *btf, int index)
> +{
> + struct btf_id_name_maps *maps = btf->id_name_maps;
> + struct btf_id_name_map *map = &maps->map[index];
> +
> + if (map->id_name) {
> + kvfree(map->id_name);
> + map->id_name = NULL;
> + map->count = 0;
> + }
> +}
> +
> +static void btf_destroy_id_name_map(struct btf *btf)
> +{
> + int i;
> +
> + if (!btf->id_name_maps)
> + return;
> +
> + for (i = 0; i < BTF_ID_NAME_MAX; i++)
> + btf_destroy_id_name(btf, i);
> +
> + kfree(btf->id_name_maps);
> + btf->id_name_maps = NULL;
> +}
> +
> static void btf_free_kfunc_set_tab(struct btf *btf)
> {
> struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
> @@ -1689,6 +1856,7 @@ static void btf_free_struct_meta_tab(struct btf *btf)
>
> static void btf_free(struct btf *btf)
> {
> + btf_destroy_id_name_map(btf);
> btf_free_struct_meta_tab(btf);
> btf_free_dtor_kfunc_tab(btf);
> btf_free_kfunc_set_tab(btf);
> @@ -5713,6 +5881,107 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
> return kctx_type_id;
> }
>
> +static int btf_compare_id_name(const void *a, const void *b, const void *priv)
> +{
> + const struct btf_id_name *ia = (const struct btf_id_name *)a;
> + const struct btf_id_name *ib = (const struct btf_id_name *)b;
> + const struct btf *btf = priv;
> + int ret;
> +
> + /*
> + * Sort names in ascending order, if the name is same, sort ids in
> + * ascending order.
> + */
> + ret = strcmp(btf_name_by_offset(btf, ia->name_off),
> + btf_name_by_offset(btf, ib->name_off));
> + if (!ret)
> + ret = ia->id - ib->id;
> +
> + return ret;
> +}
> +
> +static int btf_create_id_name(struct btf *btf, int index)
> +{
> + struct btf_id_name_maps *maps = btf->id_name_maps;
> + struct btf_id_name_map *map = &maps->map[index];
> + const struct btf_type *t;
> + struct btf_id_name *id_name;
> + const char *name;
> + int i, j = 0;
> + u32 total, count = 0;
> + u8 kind;
> +
> + kind = btf_id_name_idx_to_kind(index);
> + if (kind == BTF_KIND_UNKN)
> + return -EINVAL;
> +
> + if (map->id_name || map->count != 0)
> + return -EINVAL;
> +
> + total = btf_type_cnt(btf);
> + for (i = btf->start_id; i < total; i++) {
> + t = btf_type_by_id(btf, i);
> + if (BTF_INFO_KIND(t->info) != kind)
> + continue;
> + name = btf_name_by_offset(btf, t->name_off);
> + if (str_is_empty(name))
> + continue;
> + count++;
> + }
> +
> + if (count == 0)
> + return 0;
> +
> + id_name = kvcalloc(count, sizeof(struct btf_id_name),
> + GFP_KERNEL);
> + if (!id_name)
> + return -ENOMEM;
> +
> + for (i = btf->start_id; i < total; i++) {
> + t = btf_type_by_id(btf, i);
> + if (BTF_INFO_KIND(t->info) != kind)
> + continue;
> + name = btf_name_by_offset(btf, t->name_off);
> + if (str_is_empty(name))
> + continue;
> +
> + id_name[j].id = i;
> + id_name[j].name_off = t->name_off;
> + j++;
> + }
> +
> + sort_r(id_name, count, sizeof(id_name[0]), btf_compare_id_name,
> + NULL, btf);
> +
> + map->id_name = id_name;
> + map->count = count;
> +
> + return 0;
> +}
> +
> +static int btf_create_id_name_map(struct btf *btf)
> +{
> + int err, i;
> + struct btf_id_name_maps *maps;
> +
> + if (btf->id_name_maps)
> + return -EBUSY;
> +
> + maps = kzalloc(sizeof(struct btf_id_name_maps), GFP_KERNEL);
> + if (!maps)
> + return -ENOMEM;
> +
> + btf->id_name_maps = maps;
> +
> + for (i = 0; i < BTF_ID_NAME_MAX; i++) {
> + err = btf_create_id_name(btf, i);
> + if (err < 0)
> + break;
> + }
> +
> + return err;
> +}
> +
> BTF_ID_LIST(bpf_ctx_convert_btf_id)
> BTF_ID(struct, bpf_ctx_convert)
>
> @@ -5760,6 +6029,10 @@ struct btf *btf_parse_vmlinux(void)
> if (err)
> goto errout;
>
> + err = btf_create_id_name_map(btf);
> + if (err)
> + goto errout;
> +
> /* btf_parse_vmlinux() runs under bpf_verifier_lock */
> bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
>
> @@ -5777,6 +6050,7 @@ struct btf *btf_parse_vmlinux(void)
> errout:
> btf_verifier_env_free(env);
> if (btf) {
> + btf_destroy_id_name_map(btf);
> kvfree(btf->types);
> kfree(btf);
> }
> @@ -5844,13 +6118,19 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
> if (err)
> goto errout;
>
> + err = btf_create_id_name_map(btf);
> + if (err)
> + goto errout;
> +
> btf_verifier_env_free(env);
> refcount_set(&btf->refcnt, 1);
> +
> return btf;
>
> errout:
> btf_verifier_env_free(env);
> if (btf) {
> + btf_destroy_id_name_map(btf);
> kvfree(btf->data);
> kvfree(btf->types);
> kfree(btf);

2023-09-12 19:25:30

by Eduard Zingerman

Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
> >
> > On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> > > On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> > > > Currently, we are only using the linear search method to find the type id
> > > > by the name, which has a time complexity of O(n). This change involves
> > > > sorting the names of btf types in ascending order and using binary search,
> > > > which has a time complexity of O(log(n)). This idea was inspired by the
> > > > following patch:
> > > >
> > > > 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
> > > >
> > > > At present, this improvement is only for searching in vmlinux's and
> > > > module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
> > > >
> > > > Another change is the search direction, where we search the BTF first and
> > > > then its base, the type id of the first matched btf_type will be returned.
> > > >
> > > > Here is a time-consuming result that finding all the type ids of 67,819 kernel
> > > > functions in vmlinux's BTF by their names:
> > > >
> > > > Before: 17000 ms
> > > > After: 10 ms
> > > >
> > > > The average lookup performance has improved about 1700x at the above scenario.
> > > >
> > > > However, this change will consume more memory, for example, 67,819 kernel
> > > > functions will allocate about 530KB memory.
> > >
> > > Hi Donglin,
> > >
> > > I think this is a good improvement. However, I wonder, why did you
> > > choose to have a separate name map for each BTF kind?
> > >
> > > I did some analysis for my local testing kernel config and got such numbers:
> > > - total number of BTF objects: 97350
> > > - number of FUNC and STRUCT objects: 51597
> > > - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
> > > (these are all kinds for which lookup by name might make sense)
> > > - number of named objects: 54246
> > > - number of name collisions:
> > > - unique names: 53985 counts
> > > - 2 objects with the same name: 129 counts
> > > - 3 objects with the same name: 3 counts
> > >
> > > So, it appears that having a single map for all named objects makes
> > > sense and would also simplify the implementation, what do you think?
> >
> > Some more numbers for my config:
> > - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> > - 43575 funcs, log2 43575 = 15.4
> > Thus, having separate map for types vs functions might save ~1.7
> > search iterations. Is this a significant slowdown in practice?
>
> What do you propose to do in case of duplicates ?
> func and struct can have the same name, but they will have two different
> btf_ids. How do we store them ?
> Also we might add global vars to BTF. Such request came up several times.
> So we need to make sure our search approach scales to
> func, struct, vars. I don't recall whether we search any other kinds.
> Separate arrays for different kinds seems ok.
> It's a bit of code complexity, but it's not an increase in memory.

Binary search gives, say, lowest index of a thing with name A, then
increment index while name remains A looking for correct kind.
Given the name conflicts info from above, 99% of times there would be
no need to iterate and in very few cases there would be a couple of iterations.
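
For illustration only, a minimal sketch of that lookup (reusing the
btf_id_name array and the helpers from the patch, but with all named kinds
kept in a single sorted array; the function name is hypothetical):

static s32 btf_find_by_name_kind_single(const struct btf *btf,
					const struct btf_id_name *map,
					u32 count, const char *name,
					u8 kind)
{
	const struct btf_type *t;
	int lo = 0, hi = (int)count - 1, mid, i;

	/* lower bound: first index whose name is >= the target name */
	while (lo <= hi) {
		mid = lo + (hi - lo) / 2;
		if (strcmp(btf_name_by_offset(btf, map[mid].name_off), name) < 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	/* walk duplicates (rarely more than a couple) to match the kind */
	for (i = lo; i < (int)count &&
	     !strcmp(btf_name_by_offset(btf, map[i].name_off), name); i++) {
		t = btf_type_by_id(btf, map[i].id);
		if (BTF_INFO_KIND(t->info) == kind)
			return map[i].id;
	}

	return -ENOENT;
}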

Same logic would be necessary with the current approach if different BTF
kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
cohorts are mainly a way to split the tree for faster lookups, but
maybe that is not the main intent.

> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
> extra memory. That's quite a bit. Anything we can do to compress it?

That's an interesting question, from the top of my head:
pre-sort in pahole (re-assign IDs so that increasing ID also would
mean "increasing" name), shouldn't be that difficult.

> Folks requested vmlinux BTF to be a module, so it's loaded on demand.
> BTF memory consumption is a concern to many.
> I think before we add these per-kind search arrays we better make
> BTF optional as a module.

2023-09-12 19:57:22

by pengdonglin

Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/12 10:06, pengdonglin wrote:
> On 2023/9/9 20:29, Masami Hiramatsu (Google) wrote:
>> On Sat,  9 Sep 2023 02:16:46 -0700
>> Donglin Peng <[email protected]> wrote:
>>
>>> Currently, we are only using the linear search method to find the
>>> type id
>>> by the name, which has a time complexity of O(n). This change involves
>>> sorting the names of btf types in ascending order and using binary
>>> search,
>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>> following patch:
>>>
>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>> kallsyms_lookup_name()").
>>>
>>> At present, this improvement is only for searching in vmlinux's and
>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>> BTF_KIND_STRUCT.
>>>
>>> Another change is the search direction, where we search the BTF first
>>> and
>>> then its base, the type id of the first matched btf_type will be
>>> returned.
>>>
>>> Here is a time-consuming result that finding all the type ids of
>>> 67,819 kernel
>>> functions in vmlinux's BTF by their names:
>>>
>>> Before: 17000 ms
>>> After:     10 ms
>>
>> Nice work!
>
> Thank you
>
>>
>>>
>>> The average lookup performance has improved about 1700x at the above
>>> scenario.
>>>
>>> However, this change will consume more memory, for example, 67,819
>>> kernel
>>> functions will allocate about 530KB memory.
>>
>> I'm not so familiar with how the BTF is generated, what about making this
>> list offline? Since BTF is static data, it is better to make the map when
>
> The BTF is generated by pahole during the building of the kernel or
> modules.
> Pahole is maintained in the project https://github.com/acmel/dwarves.
> The log
> printed by scripts/link-vmlinux.sh when generating BTF for vmlinux is as
> follows:
>
> LLVM_OBJCOPY=objcopy pahole -J --skip_encoding_btf_vars --btf_gen_floats
> .tmp_vmlinux.btf
>
> If making the list offline, the pahole needs to be modified or a new tool
> needs to be written and maintained in the kernel tree. Therefore, it may
> be simpler to make the list at runtime.
>
>> it is built. And I also would like to suggest to make a new map to make
>> another new map which maps the BTF ID and the address of the function, so
>> that we can do binary search the BTF object from the function address.
>> (The latter map should be built when CONFIG_BTF_ADDR_MAP=y)
>
> It has been observed that two or more functions may have the same address
> but different IDs. For example:
>
>         ffffffff81218370 t __do_sys_getuid16
>         ffffffff81218370 T __ia32_sys_getuid16
>         ffffffff81218370 T __x64_sys_getuid16
>
>         {
>             "id": 27911,
>             "kind": "FUNC",
>             "name": "__do_sys_getuid16",
>             "type_id": 4455,
>             "linkage": "static"
>         },{
>             "id": 20303,
>             "kind": "FUNC",
>             "name": "__ia32_sys_getuid16",
>             "type_id": 4455,
>             "linkage": "static"
>         },{
>             "id": 20304,
>             "kind": "FUNC",
>             "name": "__x64_sys_getuid16",
>             "type_id": 4455,
>             "linkage": "static"
>         },
>
> It may be an issue which id to return. However, if only the FUNC_PROTO is
> of concern, any one of them can be returned.
>
> It may not be necessary to create a new list for function addresses because
>  the id_name map can be reused for this purpose. Here is an idea:
>
> 1. Use the function address to get the name by calling the function
>  sprint_symbol_no_offset.
>
> 2. Then call the function btf_find_by_name_kind to get the BTF ID.
>
> Both sprint_symbol_no_offset and btf_find_by_name_kind use binary search.
>

Here is a timing test comparing two methods for finding the IDs of all
67,823 kernel functions by their addresses:

1. Using a new map for function addresses took 4 ms.

2. Using sprint_symbol_no_offset + btf_find_by_name_kind took 38 ms.

However, the former method requires more memory. For example, if we use
the following structure to hold the function address and the BTF ID:

struct btf_id_func {
	void *addr;
	int id;
};

We would need to allocate 1059K (67823 * 16) of memory.
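
For reference, a minimal sketch of method 1 (not part of the patch; the
names are illustrative), assuming the btf_id_func array is kept sorted by
address and searched with the kernel's bsearch() helper:

#include <linux/bsearch.h>

static int btf_id_func_cmp(const void *key, const void *elt)
{
	unsigned long addr = *(const unsigned long *)key;
	const struct btf_id_func *f = elt;

	if (addr < (unsigned long)f->addr)
		return -1;
	if (addr > (unsigned long)f->addr)
		return 1;

	return 0;
}

static int btf_id_by_addr(const struct btf_id_func *map, size_t count,
			  unsigned long addr)
{
	const struct btf_id_func *found;

	found = bsearch(&addr, map, count, sizeof(*map), btf_id_func_cmp);

	return found ? found->id : -ENOENT;
}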

>>
>> Thank you,
>>
>>>
>>> Signed-off-by: Donglin Peng <[email protected]>
>>> ---
>>> Changes in RFC v2:
>>>   - Fix the build issue reported by kernel test robot <[email protected]>
>>> ---
>>>   include/linux/btf.h |   1 +
>>>   kernel/bpf/btf.c    | 300 ++++++++++++++++++++++++++++++++++++++++++--
>>>   2 files changed, 291 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/include/linux/btf.h b/include/linux/btf.h
>>> index cac9f304e27a..6260a0668773 100644
>>> --- a/include/linux/btf.h
>>> +++ b/include/linux/btf.h
>>> @@ -201,6 +201,7 @@ bool btf_is_kernel(const struct btf *btf);
>>>   bool btf_is_module(const struct btf *btf);
>>>   struct module *btf_try_get_module(const struct btf *btf);
>>>   u32 btf_nr_types(const struct btf *btf);
>>> +u32 btf_type_cnt(const struct btf *btf);
>>>   bool btf_member_is_reg_int(const struct btf *btf, const struct
>>> btf_type *s,
>>>                  const struct btf_member *m,
>>>                  u32 expected_offset, u32 expected_size);
>>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>>> index 817204d53372..51aa9f27853b 100644
>>> --- a/kernel/bpf/btf.c
>>> +++ b/kernel/bpf/btf.c
>>> @@ -240,6 +240,26 @@ struct btf_id_dtor_kfunc_tab {
>>>       struct btf_id_dtor_kfunc dtors[];
>>>   };
>>> +enum {
>>> +    BTF_ID_NAME_FUNC,    /* function */
>>> +    BTF_ID_NAME_STRUCT,    /* struct */
>>> +    BTF_ID_NAME_MAX
>>> +};
>>> +
>>> +struct btf_id_name {
>>> +    int id;
>>> +    u32 name_off;
>>> +};
>>> +
>>> +struct btf_id_name_map {
>>> +    struct btf_id_name *id_name;
>>> +    u32 count;
>>> +};
>>> +
>>> +struct btf_id_name_maps {
>>> +    struct btf_id_name_map map[BTF_ID_NAME_MAX];
>>> +};
>>> +
>>>   struct btf {
>>>       void *data;
>>>       struct btf_type **types;
>>> @@ -257,6 +277,7 @@ struct btf {
>>>       struct btf_kfunc_set_tab *kfunc_set_tab;
>>>       struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
>>>       struct btf_struct_metas *struct_meta_tab;
>>> +    struct btf_id_name_maps *id_name_maps;
>>>       /* split BTF support */
>>>       struct btf *base_btf;
>>> @@ -532,22 +553,142 @@ u32 btf_nr_types(const struct btf *btf)
>>>       return total;
>>>   }
>>> +u32 btf_type_cnt(const struct btf *btf)
>>> +{
>>> +    return btf->start_id + btf->nr_types;
>>> +}
>>> +
>>> +static inline u8 btf_id_name_idx_to_kind(int index)
>>> +{
>>> +    u8 kind;
>>> +
>>> +    switch (index) {
>>> +    case BTF_ID_NAME_FUNC:
>>> +        kind = BTF_KIND_FUNC;
>>> +        break;
>>> +    case BTF_ID_NAME_STRUCT:
>>> +        kind = BTF_KIND_STRUCT;
>>> +        break;
>>> +    default:
>>> +        kind = BTF_KIND_UNKN;
>>> +        break;
>>> +    }
>>> +
>>> +    return kind;
>>> +}
>>> +
>>> +static inline int btf_id_name_kind_to_idx(u8 kind)
>>> +{
>>> +    int index;
>>> +
>>> +    switch (kind) {
>>> +    case BTF_KIND_FUNC:
>>> +        index = BTF_ID_NAME_FUNC;
>>> +        break;
>>> +    case BTF_KIND_STRUCT:
>>> +        index = BTF_ID_NAME_STRUCT;
>>> +        break;
>>> +    default:
>>> +        index = -1;
>>> +        break;
>>> +    }
>>> +
>>> +    return index;
>>> +}
>>> +
>>> +static s32 btf_find_by_name_bsearch(struct btf_id_name *id_name,
>>> +                    u32 size, const char *name,
>>> +                    struct btf_id_name **start,
>>> +                    struct btf_id_name **end,
>>> +                    const struct btf *btf)
>>> +{
>>> +    int ret;
>>> +    int low, mid, high;
>>> +    const char *name_buf;
>>> +
>>> +    low = 0;
>>> +    high = size - 1;
>>> +
>>> +    while (low <= high) {
>>> +        mid = low + (high - low) / 2;
>>> +        name_buf = btf_name_by_offset(btf, id_name[mid].name_off);
>>> +        ret = strcmp(name, name_buf);
>>> +        if (ret > 0)
>>> +            low = mid + 1;
>>> +        else if (ret < 0)
>>> +            high = mid - 1;
>>> +        else
>>> +            break;
>>> +    }
>>> +
>>> +    if (low > high)
>>> +        return -ESRCH;
>>> +
>>> +    if (start) {
>>> +        low = mid;
>>> +        while (low) {
>>> +            name_buf = btf_name_by_offset(btf,
>>> id_name[low-1].name_off);
>>> +            if (strcmp(name, name_buf))
>>> +                break;
>>> +            low--;
>>> +        }
>>> +        *start = &id_name[low];
>>> +    }
>>> +
>>> +    if (end) {
>>> +        high = mid;
>>> +        while (high < size - 1) {
>>> +            name_buf = btf_name_by_offset(btf,
>>> id_name[high+1].name_off);
>>> +            if (strcmp(name, name_buf))
>>> +                break;
>>> +            high++;
>>> +        }
>>> +        *end = &id_name[high];
>>> +    }
>>> +
>>> +    return id_name[mid].id;
>>> +}
>>> +
>>>   s32 btf_find_by_name_kind(const struct btf *btf, const char *name,
>>> u8 kind)
>>>   {
>>> +    const struct btf_id_name_maps *maps;
>>> +    const struct btf_id_name_map *map;
>>> +    struct btf_id_name *start;
>>>       const struct btf_type *t;
>>>       const char *tname;
>>> -    u32 i, total;
>>> +    int index = btf_id_name_kind_to_idx(kind);
>>> +    s32 id, total;
>>> -    total = btf_nr_types(btf);
>>> -    for (i = 1; i < total; i++) {
>>> -        t = btf_type_by_id(btf, i);
>>> -        if (BTF_INFO_KIND(t->info) != kind)
>>> -            continue;
>>> +    do {
>>> +        maps = btf->id_name_maps;
>>> +        if (index >= 0 && maps && maps->map[index].id_name) {
>>> +            /* binary search */
>>> +            map = &maps->map[index];
>>> +            id = btf_find_by_name_bsearch(map->id_name,
>>> +                map->count, name, &start, NULL, btf);
>>> +            if (id > 0) {
>>> +                /*
>>> +                 * Return the first one that
>>> +                 * matched
>>> +                 */
>>> +                return start->id;
>>> +            }
>>> +        } else {
>>> +            /* linear search */
>>> +            total = btf_type_cnt(btf);
>>> +            for (id = btf->start_id; id < total; id++) {
>>> +                t = btf_type_by_id(btf, id);
>>> +                if (BTF_INFO_KIND(t->info) != kind)
>>> +                    continue;
>>> +
>>> +                tname = btf_name_by_offset(btf, t->name_off);
>>> +                if (!strcmp(tname, name))
>>> +                    return id;
>>> +            }
>>> +        }
>>> -        tname = btf_name_by_offset(btf, t->name_off);
>>> -        if (!strcmp(tname, name))
>>> -            return i;
>>> -    }
>>> +        btf = btf->base_btf;
>>> +    } while (btf);
>>>       return -ENOENT;
>>>   }
>>> @@ -1639,6 +1780,32 @@ static void btf_free_id(struct btf *btf)
>>>       spin_unlock_irqrestore(&btf_idr_lock, flags);
>>>   }
>>> +static void btf_destroy_id_name(struct btf *btf, int index)
>>> +{
>>> +    struct btf_id_name_maps *maps = btf->id_name_maps;
>>> +    struct btf_id_name_map *map = &maps->map[index];
>>> +
>>> +    if (map->id_name) {
>>> +        kvfree(map->id_name);
>>> +        map->id_name = NULL;
>>> +        map->count = 0;
>>> +    }
>>> +}
>>> +
>>> +static void btf_destroy_id_name_map(struct btf *btf)
>>> +{
>>> +    int i;
>>> +
>>> +    if (!btf->id_name_maps)
>>> +        return;
>>> +
>>> +    for (i = 0; i < BTF_ID_NAME_MAX; i++)
>>> +        btf_destroy_id_name(btf, i);
>>> +
>>> +    kfree(btf->id_name_maps);
>>> +    btf->id_name_maps = NULL;
>>> +}
>>> +
>>>   static void btf_free_kfunc_set_tab(struct btf *btf)
>>>   {
>>>       struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
>>> @@ -1689,6 +1856,7 @@ static void btf_free_struct_meta_tab(struct btf
>>> *btf)
>>>   static void btf_free(struct btf *btf)
>>>   {
>>> +    btf_destroy_id_name_map(btf);
>>>       btf_free_struct_meta_tab(btf);
>>>       btf_free_dtor_kfunc_tab(btf);
>>>       btf_free_kfunc_set_tab(btf);
>>> @@ -5713,6 +5881,107 @@ int get_kern_ctx_btf_id(struct
>>> bpf_verifier_log *log, enum bpf_prog_type prog_ty
>>>       return kctx_type_id;
>>>   }
>>> +static int btf_compare_id_name(const void *a, const void *b, const
>>> void *priv)
>>> +{
>>> +    const struct btf_id_name *ia = (const struct btf_id_name *)a;
>>> +    const struct btf_id_name *ib = (const struct btf_id_name *)b;
>>> +    const struct btf *btf = priv;
>>> +    int ret;
>>> +
>>> +    /*
>>> +     * Sort names in ascending order, if the name is same, sort ids in
>>> +     * ascending order.
>>> +     */
>>> +    ret = strcmp(btf_name_by_offset(btf, ia->name_off),
>>> +             btf_name_by_offset(btf, ib->name_off));
>>> +    if (!ret)
>>> +        ret = ia->id - ib->id;
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static int btf_create_id_name(struct btf *btf, int index)
>>> +{
>>> +    struct btf_id_name_maps *maps = btf->id_name_maps;
>>> +    struct btf_id_name_map *map = &maps->map[index];
>>> +    const struct btf_type *t;
>>> +    struct btf_id_name *id_name;
>>> +    const char *name;
>>> +    int i, j = 0;
>>> +    u32 total, count = 0;
>>> +    u8 kind;
>>> +
>>> +    kind = btf_id_name_idx_to_kind(index);
>>> +    if (kind == BTF_KIND_UNKN)
>>> +        return -EINVAL;
>>> +
>>> +    if (map->id_name || map->count != 0)
>>> +        return -EINVAL;
>>> +
>>> +    total = btf_type_cnt(btf);
>>> +    for (i = btf->start_id; i < total; i++) {
>>> +        t = btf_type_by_id(btf, i);
>>> +        if (BTF_INFO_KIND(t->info) != kind)
>>> +            continue;
>>> +        name = btf_name_by_offset(btf, t->name_off);
>>> +        if (str_is_empty(name))
>>> +            continue;
>>> +        count++;
>>> +    }
>>> +
>>> +    if (count == 0)
>>> +        return 0;
>>> +
>>> +    id_name = kvcalloc(count, sizeof(struct btf_id_name),
>>> +               GFP_KERNEL);
>>> +    if (!id_name)
>>> +        return -ENOMEM;
>>> +
>>> +    for (i = btf->start_id; i < total; i++) {
>>> +        t = btf_type_by_id(btf, i);
>>> +        if (BTF_INFO_KIND(t->info) != kind)
>>> +            continue;
>>> +        name = btf_name_by_offset(btf, t->name_off);
>>> +        if (str_is_empty(name))
>>> +            continue;
>>> +
>>> +        id_name[j].id = i;
>>> +        id_name[j].name_off = t->name_off;
>>> +        j++;
>>> +    }
>>> +
>>> +    sort_r(id_name, count, sizeof(id_name[0]), btf_compare_id_name,
>>> +           NULL, btf);
>>> +
>>> +    map->id_name = id_name;
>>> +    map->count = count;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int btf_create_id_name_map(struct btf *btf)
>>> +{
>>> +    int err, i;
>>> +    struct btf_id_name_maps *maps;
>>> +
>>> +    if (btf->id_name_maps)
>>> +        return -EBUSY;
>>> +
>>> +    maps = kzalloc(sizeof(struct btf_id_name_maps), GFP_KERNEL);
>>> +    if (!maps)
>>> +        return -ENOMEM;
>>> +
>>> +    btf->id_name_maps = maps;
>>> +
>>> +    for (i = 0; i < BTF_ID_NAME_MAX; i++) {
>>> +        err = btf_create_id_name(btf, i);
>>> +        if (err < 0)
>>> +            break;
>>> +    }
>>> +
>>> +    return err;
>>> +}
>>> +
>>>   BTF_ID_LIST(bpf_ctx_convert_btf_id)
>>>   BTF_ID(struct, bpf_ctx_convert)
>>> @@ -5760,6 +6029,10 @@ struct btf *btf_parse_vmlinux(void)
>>>       if (err)
>>>           goto errout;
>>> +    err = btf_create_id_name_map(btf);
>>> +    if (err)
>>> +        goto errout;
>>> +
>>>       /* btf_parse_vmlinux() runs under bpf_verifier_lock */
>>>       bpf_ctx_convert.t = btf_type_by_id(btf,
>>> bpf_ctx_convert_btf_id[0]);
>>> @@ -5777,6 +6050,7 @@ struct btf *btf_parse_vmlinux(void)
>>>   errout:
>>>       btf_verifier_env_free(env);
>>>       if (btf) {
>>> +        btf_destroy_id_name_map(btf);
>>>           kvfree(btf->types);
>>>           kfree(btf);
>>>       }
>>> @@ -5844,13 +6118,19 @@ static struct btf *btf_parse_module(const
>>> char *module_name, const void *data, u
>>>       if (err)
>>>           goto errout;
>>> +    err = btf_create_id_name_map(btf);
>>> +    if (err)
>>> +        goto errout;
>>> +
>>>       btf_verifier_env_free(env);
>>>       refcount_set(&btf->refcnt, 1);
>>> +
>>>       return btf;
>>>   errout:
>>>       btf_verifier_env_free(env);
>>>       if (btf) {
>>> +        btf_destroy_id_name_map(btf);
>>>           kvfree(btf->data);
>>>           kvfree(btf->types);
>>>           kfree(btf);
>>> --
>>> 2.25.1
>>>
>>
>>
>

2023-09-12 22:21:12

by Alexei Starovoitov

Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]> wrote:
>
> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
> > On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
> > >
> > > On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> > > > On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> > > > > Currently, we are only using the linear search method to find the type id
> > > > > by the name, which has a time complexity of O(n). This change involves
> > > > > sorting the names of btf types in ascending order and using binary search,
> > > > > which has a time complexity of O(log(n)). This idea was inspired by the
> > > > > following patch:
> > > > >
> > > > > 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
> > > > >
> > > > > At present, this improvement is only for searching in vmlinux's and
> > > > > module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
> > > > >
> > > > > Another change is the search direction, where we search the BTF first and
> > > > > then its base, the type id of the first matched btf_type will be returned.
> > > > >
> > > > > Here is a time-consuming result that finding all the type ids of 67,819 kernel
> > > > > functions in vmlinux's BTF by their names:
> > > > >
> > > > > Before: 17000 ms
> > > > > After: 10 ms
> > > > >
> > > > > The average lookup performance has improved about 1700x at the above scenario.
> > > > >
> > > > > However, this change will consume more memory, for example, 67,819 kernel
> > > > > functions will allocate about 530KB memory.
> > > >
> > > > Hi Donglin,
> > > >
> > > > I think this is a good improvement. However, I wonder, why did you
> > > > choose to have a separate name map for each BTF kind?
> > > >
> > > > I did some analysis for my local testing kernel config and got such numbers:
> > > > - total number of BTF objects: 97350
> > > > - number of FUNC and STRUCT objects: 51597
> > > > - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
> > > > (these are all kinds for which lookup by name might make sense)
> > > > - number of named objects: 54246
> > > > - number of name collisions:
> > > > - unique names: 53985 counts
> > > > - 2 objects with the same name: 129 counts
> > > > - 3 objects with the same name: 3 counts
> > > >
> > > > So, it appears that having a single map for all named objects makes
> > > > sense and would also simplify the implementation, what do you think?
> > >
> > > Some more numbers for my config:
> > > - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> > > - 43575 funcs, log2 43575 = 15.4
> > > Thus, having separate map for types vs functions might save ~1.7
> > > search iterations. Is this a significant slowdown in practice?
> >
> > What do you propose to do in case of duplicates ?
> > func and struct can have the same name, but they will have two different
> > btf_ids. How do we store them ?
> > Also we might add global vars to BTF. Such request came up several times.
> > So we need to make sure our search approach scales to
> > func, struct, vars. I don't recall whether we search any other kinds.
> > Separate arrays for different kinds seems ok.
> > It's a bit of code complexity, but it's not an increase in memory.
>
> Binary search gives, say, the lowest index of a thing with name A, then
> increment the index while the name remains A, looking for the correct kind.
> Given the name conflicts info from above, 99% of the time there would be
> no need to iterate, and in very few cases there would be a couple of iterations.
>
> Same logic would be necessary with current approach if different BTF
> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
> cohorts are mainly a way to split the tree for faster lookups, but
> maybe that is not the main intent.
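
A minimal sketch of the single-map lookup described in the quoted text above,
assuming one array sorted by name; the btf_name_idx entry type and the
btf->name_idx / btf->name_idx_cnt fields are hypothetical, not part of the
posted patch:

struct btf_name_idx {
	u32 name_off;
	u32 id;
};

/*
 * Sketch only: lower-bound binary search on the name, then walk forward
 * over duplicate names (rarely more than a couple of entries) until the
 * requested kind is found.  Returns the type id or -ENOENT.
 */
static s32 btf_find_by_name_kind_sorted(const struct btf *btf,
					const char *name, u8 kind)
{
	const struct btf_name_idx *idx = btf->name_idx;
	u32 lo = 0, hi = btf->name_idx_cnt, i;
	const struct btf_type *t;

	while (lo < hi) {
		u32 mid = lo + (hi - lo) / 2;

		if (strcmp(btf_name_by_offset(btf, idx[mid].name_off), name) < 0)
			lo = mid + 1;
		else
			hi = mid;
	}

	for (i = lo; i < btf->name_idx_cnt; i++) {
		if (strcmp(btf_name_by_offset(btf, idx[i].name_off), name))
			break;
		t = btf_type_by_id(btf, idx[i].id);
		if (BTF_INFO_KIND(t->info) == kind)
			return idx[i].id;
	}

	return -ENOENT;
}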
>
> > With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
> > extra memory. That's quite a bit. Anything we can do to compress it?
>
> That's an interesting question, from the top of my head:
> pre-sort in pahole (re-assign IDs so that increasing ID also would
> mean "increasing" name), shouldn't be that difficult.

That sounds great. kallsyms are pre-sorted at build time.
We should do the same with BTF.
I think GCC can emit BTF directly now and LLVM emits it for bpf progs too,
but since vmlinux and kernel module BTFs will keep being processed
through pahole, we don't have to make gcc/llvm sort things right away;
pahole will be enough. The kernel might do an 'is it sorted' check
during BTF validation and then use binary search, falling back to a linear
scan when the BTF is not sorted (i.e. produced by an old pahole).
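
A rough sketch of such a check, reusing helpers from the posted patch; the
function name is made up, and whether pahole would sort per kind or globally
is left open here, so this only illustrates the per-kind variant:

/*
 * Sketch only: detect at BTF parse/validation time whether the named
 * types of a given kind already appear in ascending name order.  If they
 * do, lookups can binary search; otherwise fall back to the existing
 * linear scan (i.e. BTF produced by an older pahole).
 */
static bool btf_kind_is_name_sorted(const struct btf *btf, u8 kind)
{
	const struct btf_type *t;
	const char *prev = NULL, *name;
	u32 i, total = btf_type_cnt(btf);

	for (i = btf->start_id; i < total; i++) {
		t = btf_type_by_id(btf, i);
		if (BTF_INFO_KIND(t->info) != kind)
			continue;
		name = btf_name_by_offset(btf, t->name_off);
		if (str_is_empty(name))
			continue;
		if (prev && strcmp(prev, name) > 0)
			return false;
		prev = name;
	}

	return true;
}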

2023-09-13 00:02:55

by Eduard Zingerman

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> > Currently, we are only using the linear search method to find the type id
> > by the name, which has a time complexity of O(n). This change involves
> > sorting the names of btf types in ascending order and using binary search,
> > which has a time complexity of O(log(n)). This idea was inspired by the
> > following patch:
> >
> > 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
> >
> > At present, this improvement is only for searching in vmlinux's and
> > module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
> >
> > Another change is the search direction, where we search the BTF first and
> > then its base, the type id of the first matched btf_type will be returned.
> >
> > Here is a time-consuming result that finding all the type ids of 67,819 kernel
> > functions in vmlinux's BTF by their names:
> >
> > Before: 17000 ms
> > After: 10 ms
> >
> > The average lookup performance has improved about 1700x at the above scenario.
> >
> > However, this change will consume more memory, for example, 67,819 kernel
> > functions will allocate about 530KB memory.
>
> Hi Donglin,
>
> I think this is a good improvement. However, I wonder, why did you
> choose to have a separate name map for each BTF kind?
>
> I did some analysis for my local testing kernel config and got such numbers:
> - total number of BTF objects: 97350
> - number of FUNC and STRUCT objects: 51597
> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
> (these are all kinds for which lookup by name might make sense)
> - number of named objects: 54246
> - number of name collisions:
> - unique names: 53985 counts
> - 2 objects with the same name: 129 counts
> - 3 objects with the same name: 3 counts
>
> So, it appears that having a single map for all named objects makes
> sense and would also simplify the implementation, what do you think?

Some more numbers for my config:
- 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
- 43575 funcs, log2 43575 = 15.4
Thus, having separate map for types vs functions might save ~1.7
search iterations. Is this a significant slowdown in practice?

>
> Thanks,
> Eduard
>

2023-09-13 07:00:27

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
>
> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> > On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> > > Currently, we are only using the linear search method to find the type id
> > > by the name, which has a time complexity of O(n). This change involves
> > > sorting the names of btf types in ascending order and using binary search,
> > > which has a time complexity of O(log(n)). This idea was inspired by the
> > > following patch:
> > >
> > > 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
> > >
> > > At present, this improvement is only for searching in vmlinux's and
> > > module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
> > >
> > > Another change is the search direction, where we search the BTF first and
> > > then its base, the type id of the first matched btf_type will be returned.
> > >
> > > Here is a time-consuming result that finding all the type ids of 67,819 kernel
> > > functions in vmlinux's BTF by their names:
> > >
> > > Before: 17000 ms
> > > After: 10 ms
> > >
> > > The average lookup performance has improved about 1700x at the above scenario.
> > >
> > > However, this change will consume more memory, for example, 67,819 kernel
> > > functions will allocate about 530KB memory.
> >
> > Hi Donglin,
> >
> > I think this is a good improvement. However, I wonder, why did you
> > choose to have a separate name map for each BTF kind?
> >
> > I did some analysis for my local testing kernel config and got such numbers:
> > - total number of BTF objects: 97350
> > - number of FUNC and STRUCT objects: 51597
> > - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
> > (these are all kinds for which lookup by name might make sense)
> > - number of named objects: 54246
> > - number of name collisions:
> > - unique names: 53985 counts
> > - 2 objects with the same name: 129 counts
> > - 3 objects with the same name: 3 counts
> >
> > So, it appears that having a single map for all named objects makes
> > sense and would also simplify the implementation, what do you think?
>
> Some more numbers for my config:
> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> - 43575 funcs, log2 43575 = 15.4
> Thus, having separate map for types vs functions might save ~1.7
> search iterations. Is this a significant slowdown in practice?

What do you propose to do in case of duplicates ?
func and struct can have the same name, but they will have two different
btf_ids. How do we store them ?
Also we might add global vars to BTF. Such request came up several times.
So we need to make sure our search approach scales to
func, struct, vars. I don't recall whether we search any other kinds.
Separate arrays for different kinds seems ok.
It's a bit of code complexity, but it's not an increase in memory.
With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
extra memory. That's quite a bit. Anything we can do to compress it?
Folks requested vmlinux BTF to be a module, so it's loaded on demand.
BTF memory consumption is a concern to many.
I think before we add these per-kind search arrays we better make
BTF optional as a module.

2023-09-13 08:23:42

by Masami Hiramatsu

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Tue, 12 Sep 2023 11:35:16 +0800
pengdonglin <[email protected]> wrote:

> On 2023/9/12 10:06, pengdonglin wrote:
> > 在 2023/9/9 20:29, Masami Hiramatsu (Google) 写道:
> >> On Sat,  9 Sep 2023 02:16:46 -0700
> >> Donglin Peng <[email protected]> wrote:
> >>
> >>> Currently, we are only using the linear search method to find the
> >>> type id
> >>> by the name, which has a time complexity of O(n). This change involves
> >>> sorting the names of btf types in ascending order and using binary
> >>> search,
> >>> which has a time complexity of O(log(n)). This idea was inspired by the
> >>> following patch:
> >>>
> >>> 60443c88f3a8 ("kallsyms: Improve the performance of
> >>> kallsyms_lookup_name()").
> >>>
> >>> At present, this improvement is only for searching in vmlinux's and
> >>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
> >>> BTF_KIND_STRUCT.
> >>>
> >>> Another change is the search direction, where we search the BTF first
> >>> and
> >>> then its base, the type id of the first matched btf_type will be
> >>> returned.
> >>>
> >>> Here is a time-consuming result that finding all the type ids of
> >>> 67,819 kernel
> >>> functions in vmlinux's BTF by their names:
> >>>
> >>> Before: 17000 ms
> >>> After:     10 ms
> >>
> >> Nice work!
> >
> > Thank you
> >
> >>
> >>>
> >>> The average lookup performance has improved about 1700x at the above
> >>> scenario.
> >>>
> >>> However, this change will consume more memory, for example, 67,819
> >>> kernel
> >>> functions will allocate about 530KB memory.
> >>
> >> I'm not so familier with how the BTF is generated, what about making this
> >> list offline? Since BTF is static data, it is better to make the map when
> >
> > The BTF is generated by pahole during the building of the kernel or
> > modules.
> > Pahole is maintained in the project https://github.com/acmel/dwarves.
> > The log
> > printed by scripts/link-vmlinux.sh when generating BTF for vmlinux is as
> > follows:
> >
> > LLVM_OBJCOPY=objcopy pahole -J --skip_encoding_btf_vars --btf_gen_floats
> > .tmp_vmlinux.btf
> >
> > If making the list offline, the pahole needs to be modified or a new tool
> > needs to be written and maintained in the kernel tree. Therefore, it may
> > be simpler to make the list at runtime.

Hmm, I'm not convinced, since this pushes the sorting cost into the kernel
boot process. To reduce kernel boot time we would need to drop this improvement.
But if pahole just sorts it offline, we can use this in any case.

> >
> >> it is built. And I also would like to suggest to make a new map to make
> >> another new map which maps the BTF ID and the address of the function, so
> >> that we can do binary search the BTF object from the function address.
> >> (The latter map should be built when CONFIG_BTF_ADDR_MAP=y)
> >
> > It has been observed that two or more functions may have the same address
> > but different IDs. For example:
> >
> >         ffffffff81218370 t __do_sys_getuid16
> >         ffffffff81218370 T __ia32_sys_getuid16
> >         ffffffff81218370 T __x64_sys_getuid16
> >
> >         {
> >             "id": 27911,
> >             "kind": "FUNC",
> >             "name": "__do_sys_getuid16",
> >             "type_id": 4455,
> >             "linkage": "static"
> >         },{
> >             "id": 20303,
> >             "kind": "FUNC",
> >             "name": "__ia32_sys_getuid16",
> >             "type_id": 4455,
> >             "linkage": "static"
> >         },{
> >             "id": 20304,
> >             "kind": "FUNC",
> >             "name": "__x64_sys_getuid16",
> >             "type_id": 4455,
> >             "linkage": "static"
> >         },
> >
> > It may be an issue to decide which id to return. However, if only the
> > FUNC_PROTO is of concern, any one of them can be returned.

In this case, since those have the same type_id, returning either one is
good enough. Since they have the same address, we cannot tell them apart even
if we use kallsyms ;)

> >
> > It may not be necessary to create a new list for function addresses because
> >  the id_name map can be reused for this purpose. Here is an idea:
> >
> > 1. Use the function address to get the name by calling the function
> >  sprint_symbol_no_offset.
> >
> > 2. Then call the function btf_find_by_name_kind to get the BTF ID.
> >
> > Both sprint_symbol_no_offset and btf_find_by_name_kind use binary search.
> >
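
For illustration, a minimal sketch of that two-step lookup; the function name
btf_func_id_by_addr is made up, module symbols are ignored, and error handling
is kept minimal:

/*
 * Sketch only: resolve a kernel function address to its BTF id by going
 * through kallsyms first (step 1) and then through btf_find_by_name_kind()
 * (step 2).
 */
static s32 btf_func_id_by_addr(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];
	const struct btf *btf = bpf_get_btf_vmlinux();

	if (IS_ERR_OR_NULL(btf))
		return -ENOENT;

	/* step 1: address -> name, kallsyms uses binary search internally */
	sprint_symbol_no_offset(sym, addr);

	/* step 2: name -> BTF id, binary search once this patch is applied */
	return btf_find_by_name_kind(btf, sym, BTF_KIND_FUNC);
}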
>
> Here is a timing test comparing two methods for finding all the
> IDs of 67,823 kernel functions by their addresses:
>
> 1. Using a new map for function addresses took 4 ms.
>
> 2. Using sprint_symbol_no_offset + btf_find_by_name_kind took 38 ms.
>
> However, the former method requires more memory. For example, if we use
> the following structure to hold the function address and the BTF ID:
>
> struct btf_id_func {
> 	void *addr;
> 	int id;
> };
>
> We would need to allocate 1059K (67823 * 16) of memory.

OK, then it can be optional. Anyway, currently only the kprobe event will
need it, and only in a cold path.

Thank you,



--
Masami Hiramatsu (Google) <[email protected]>

2023-09-13 09:48:36

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 16:07, Masami Hiramatsu (Google) wrote:
> On Tue, 12 Sep 2023 11:35:16 +0800
> pengdonglin <[email protected]> wrote:
>
>> On 2023/9/12 10:06, pengdonglin wrote:
>>> 在 2023/9/9 20:29, Masami Hiramatsu (Google) 写道:
>>>> On Sat,  9 Sep 2023 02:16:46 -0700
>>>> Donglin Peng <[email protected]> wrote:
>>>>
>>>>> Currently, we are only using the linear search method to find the
>>>>> type id
>>>>> by the name, which has a time complexity of O(n). This change involves
>>>>> sorting the names of btf types in ascending order and using binary
>>>>> search,
>>>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>>>> following patch:
>>>>>
>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>> kallsyms_lookup_name()").
>>>>>
>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>> BTF_KIND_STRUCT.
>>>>>
>>>>> Another change is the search direction, where we search the BTF first
>>>>> and
>>>>> then its base, the type id of the first matched btf_type will be
>>>>> returned.
>>>>>
>>>>> Here is a time-consuming result that finding all the type ids of
>>>>> 67,819 kernel
>>>>> functions in vmlinux's BTF by their names:
>>>>>
>>>>> Before: 17000 ms
>>>>> After:     10 ms
>>>>
>>>> Nice work!
>>>
>>> Thank you
>>>
>>>>
>>>>>
>>>>> The average lookup performance has improved about 1700x at the above
>>>>> scenario.
>>>>>
>>>>> However, this change will consume more memory, for example, 67,819
>>>>> kernel
>>>>> functions will allocate about 530KB memory.
>>>>
>>>> I'm not so familier with how the BTF is generated, what about making this
>>>> list offline? Since BTF is static data, it is better to make the map when
>>>
>>> The BTF is generated by pahole during the building of the kernel or
>>> modules.
>>> Pahole is maintained in the project https://github.com/acmel/dwarves.
>>> The log
>>> printed by scripts/link-vmlinux.sh when generating BTF for vmlinux is as
>>> follows:
>>>
>>> LLVM_OBJCOPY=objcopy pahole -J --skip_encoding_btf_vars --btf_gen_floats
>>> .tmp_vmlinux.btf
>>>
>>> If making the list offline, the pahole needs to be modified or a new tool
>>> needs to be written and maintained in the kernel tree. Therefore, it may
>>> be simpler to make the list at runtime.
>
> Hmm, I'm not convinced, since this pushes the sorting cost into the kernel
> boot process. To reduce kernel boot time we would need to drop this improvement.
> But if pahole just sorts it offline, we can use this in any case.

Thank you, I agree. In a separate thread, Eduard and Alexei also proposed
modifying pahole. I will analyze pahole and run a test.

>
>>>
>>>> it is built. And I also would like to suggest to make a new map to make
>>>> another new map which maps the BTF ID and the address of the function, so
>>>> that we can do binary search the BTF object from the function address.
>>>> (The latter map should be built when CONFIG_BTF_ADDR_MAP=y)
>>>
>>> It has been observed that two or more functions may have the same address
>>> but different IDs. For example:
>>>
>>>         ffffffff81218370 t __do_sys_getuid16
>>>         ffffffff81218370 T __ia32_sys_getuid16
>>>         ffffffff81218370 T __x64_sys_getuid16
>>>
>>>         {
>>>             "id": 27911,
>>>             "kind": "FUNC",
>>>             "name": "__do_sys_getuid16",
>>>             "type_id": 4455,
>>>             "linkage": "static"
>>>         },{
>>>             "id": 20303,
>>>             "kind": "FUNC",
>>>             "name": "__ia32_sys_getuid16",
>>>             "type_id": 4455,
>>>             "linkage": "static"
>>>         },{
>>>             "id": 20304,
>>>             "kind": "FUNC",
>>>             "name": "__x64_sys_getuid16",
>>>             "type_id": 4455,
>>>             "linkage": "static"
>>>         },
>>>
>>> It may be an issue to decide which id to return. However, if only the
>>> FUNC_PROTO is of concern, any one of them can be returned.
>
> In this case, since those have the same type_id, returning either one is
> good enough. Since they have the same address, we cannot tell them apart even
> if we use kallsyms ;)

Yeah.

>
>>>
>>> It may not be necessary to create a new list for function addresses because
>>>  the id_name map can be reused for this purpose. Here is an idea:
>>>
>>> 1. Use the function address to get the name by calling the function
>>>  sprint_symbol_no_offset.
>>>
>>> 2. Then call the function btf_find_by_name_kind to get the BTF ID.
>>>
>>> Both sprint_symbol_no_offset and btf_find_by_name_kind use binary search.
>>>
>>
>> Here is a timing test comparing two methods for finding all the
>> IDs of 67,823 kernel functions by their addresses:
>>
>> 1. Using a new map for function addresses took 4 ms.
>>
>> 2. Using sprint_symbol_no_offset + btf_find_by_name_kind took 38 ms.
>>
>> However, the former method requires more memory. For example, if we use
>> the following structure to hold the function address and the BTF ID:
>>
>> struct btf_id_func {
>> 	void *addr;
>> 	int id;
>> };
>>
>> We would need to allocate 1059K (67823 * 16) of memory.
>
> OK, then it can be optional. Anyway, currently only the kprobe event will
> need it, and only in a cold path.

Yeah, it appears that we need to use method 2 or explore alternative
approaches.

>
> Thank you,
>

2023-09-13 10:04:58

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 1:03, Eduard Zingerman wrote:
> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
>>>
>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>> Currently, we are only using the linear search method to find the type id
>>>>> by the name, which has a time complexity of O(n). This change involves
>>>>> sorting the names of btf types in ascending order and using binary search,
>>>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>>>> following patch:
>>>>>
>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>>>>>
>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>>>>>
>>>>> Another change is the search direction, where we search the BTF first and
>>>>> then its base, the type id of the first matched btf_type will be returned.
>>>>>
>>>>> Here is a time-consuming result that finding all the type ids of 67,819 kernel
>>>>> functions in vmlinux's BTF by their names:
>>>>>
>>>>> Before: 17000 ms
>>>>> After: 10 ms
>>>>>
>>>>> The average lookup performance has improved about 1700x at the above scenario.
>>>>>
>>>>> However, this change will consume more memory, for example, 67,819 kernel
>>>>> functions will allocate about 530KB memory.
>>>>
>>>> Hi Donglin,
>>>>
>>>> I think this is a good improvement. However, I wonder, why did you
>>>> choose to have a separate name map for each BTF kind?
>>>>
>>>> I did some analysis for my local testing kernel config and got such numbers:
>>>> - total number of BTF objects: 97350
>>>> - number of FUNC and STRUCT objects: 51597
>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
>>>> (these are all kinds for which lookup by name might make sense)
>>>> - number of named objects: 54246
>>>> - number of name collisions:
>>>> - unique names: 53985 counts
>>>> - 2 objects with the same name: 129 counts
>>>> - 3 objects with the same name: 3 counts
>>>>
>>>> So, it appears that having a single map for all named objects makes
>>>> sense and would also simplify the implementation, what do you think?
>>>
>>> Some more numbers for my config:
>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>> - 43575 funcs, log2 43575 = 15.4
>>> Thus, having separate map for types vs functions might save ~1.7
>>> search iterations. Is this a significant slowdown in practice?
>>
>> What do you propose to do in case of duplicates ?
>> func and struct can have the same name, but they will have two different
>> btf_ids. How do we store them ?
>> Also we might add global vars to BTF. Such request came up several times.
>> So we need to make sure our search approach scales to
>> func, struct, vars. I don't recall whether we search any other kinds.
>> Separate arrays for different kinds seems ok.
>> It's a bit of code complexity, but it's not an increase in memory.
>
> Binary search gives, say, lowest index of a thing with name A, then
> increment index while name remains A looking for correct kind.
> Given the name conflicts info from above, 99% of times there would be
> no need to iterate and in very few cases there would a couple of iterations.

Yeah, I agree.
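
For what it's worth, here is roughly how I picture the single-map lookup (only
a sketch; it reuses struct btf_id_name from this patch and assumes one array,
sorted by name, holding all named types):

static s32 btf_find_by_name_kind_sorted(const struct btf *btf,
					const struct btf_id_name *id_name,
					u32 count, const char *name, u8 kind)
{
	u32 lo = 0, hi = count;

	/* lowest index whose name is >= the wanted name */
	while (lo < hi) {
		u32 mid = lo + (hi - lo) / 2;

		if (strcmp(btf_name_by_offset(btf, id_name[mid].name_off), name) < 0)
			lo = mid + 1;
		else
			hi = mid;
	}

	/* the run of equal names is almost always of length 1 */
	for (; lo < count; lo++) {
		const struct btf_type *t = btf_type_by_id(btf, id_name[lo].id);

		if (strcmp(btf_name_by_offset(btf, id_name[lo].name_off), name))
			break;
		if (BTF_INFO_KIND(t->info) == kind)
			return id_name[lo].id;
	}

	return -ENOENT;
}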

>
> Same logic would be necessary with current approach if different BTF
> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
> cohorts are mainly a way to split the tree for faster lookups, but
> maybe that is not the main intent.
Yeah, I thought it might be faster for some kinds, but I didn't perform a
comparative test.

>
>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>> extra memory. That's quite a bit. Anything we can do to compress it?
>
> That's an interesting question, from the top of my head:
> pre-sort in pahole (re-assign IDs so that increasing ID also would
> mean "increasing" name), shouldn't be that difficult.

Thank you, I agree.

>
>> Folks requested vmlinux BTF to be a module, so it's loaded on demand.
>> BTF memory consumption is a concern to many.
>> I think before we add these per-kind search arrays we better make
>> BTF optional as a module.
>
>

2023-09-13 10:58:56

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 0:40, Alexei Starovoitov wrote:
> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
>>
>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>> Currently, we are only using the linear search method to find the type id
>>>> by the name, which has a time complexity of O(n). This change involves
>>>> sorting the names of btf types in ascending order and using binary search,
>>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>>> following patch:
>>>>
>>>> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>>>>
>>>> At present, this improvement is only for searching in vmlinux's and
>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>>>>
>>>> Another change is the search direction, where we search the BTF first and
>>>> then its base, the type id of the first matched btf_type will be returned.
>>>>
>>>> Here is a time-consuming result that finding all the type ids of 67,819 kernel
>>>> functions in vmlinux's BTF by their names:
>>>>
>>>> Before: 17000 ms
>>>> After: 10 ms
>>>>
>>>> The average lookup performance has improved about 1700x at the above scenario.
>>>>
>>>> However, this change will consume more memory, for example, 67,819 kernel
>>>> functions will allocate about 530KB memory.
>>>
>>> Hi Donglin,
>>>
>>> I think this is a good improvement. However, I wonder, why did you
>>> choose to have a separate name map for each BTF kind?
>>>
>>> I did some analysis for my local testing kernel config and got such numbers:
>>> - total number of BTF objects: 97350
>>> - number of FUNC and STRUCT objects: 51597
>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
>>> (these are all kinds for which lookup by name might make sense)
>>> - number of named objects: 54246
>>> - number of name collisions:
>>> - unique names: 53985 counts
>>> - 2 objects with the same name: 129 counts
>>> - 3 objects with the same name: 3 counts
>>>
>>> So, it appears that having a single map for all named objects makes
>>> sense and would also simplify the implementation, what do you think?
>>
>> Some more numbers for my config:
>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>> - 43575 funcs, log2 43575 = 15.4
>> Thus, having separate map for types vs functions might save ~1.7
>> search iterations. Is this a significant slowdown in practice?
>
> What do you propose to do in case of duplicates ?
> func and struct can have the same name, but they will have two different
> btf_ids. How do we store them ?
> Also we might add global vars to BTF. Such request came up several times.
> So we need to make sure our search approach scales to
> func, struct, vars. I don't recall whether we search any other kinds.
> Separate arrays for different kinds seems ok.
> It's a bit of code complexity, but it's not an increase in memory.
> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
> extra memory. That's quite a bit. Anything we can do to compress it?
> Folks requested vmlinux BTF to be a module, so it's loaded on demand.
> BTF memory consumption is a concern to many.
> I think before we add these per-kind search arrays we better make
> BTF optional as a module.

I think that making BTF a module may not help much, since the function
bpf_get_btf_vmlinux is invoked in many places, for example while loading a
kernel module.

>

2023-09-13 13:30:11

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/12 22:19, Eduard Zingerman wrote:
> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>> Currently, we are only using the linear search method to find the type id
>>> by the name, which has a time complexity of O(n). This change involves
>>> sorting the names of btf types in ascending order and using binary search,
>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>> following patch:
>>>
>>> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>>>
>>> At present, this improvement is only for searching in vmlinux's and
>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>>>
>>> Another change is the search direction, where we search the BTF first and
>>> then its base, the type id of the first matched btf_type will be returned.
>>>
>>> Here is a time-consuming result that finding all the type ids of 67,819 kernel
>>> functions in vmlinux's BTF by their names:
>>>
>>> Before: 17000 ms
>>> After: 10 ms
>>>
>>> The average lookup performance has improved about 1700x at the above scenario.
>>>
>>> However, this change will consume more memory, for example, 67,819 kernel
>>> functions will allocate about 530KB memory.
>>
>> Hi Donglin,
>>
>> I think this is a good improvement. However, I wonder, why did you
>> choose to have a separate name map for each BTF kind?
>>
>> I did some analysis for my local testing kernel config and got such numbers:
>> - total number of BTF objects: 97350
>> - number of FUNC and STRUCT objects: 51597
>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
>> (these are all kinds for which lookup by name might make sense)
>> - number of named objects: 54246
>> - number of name collisions:
>> - unique names: 53985 counts
>> - 2 objects with the same name: 129 counts
>> - 3 objects with the same name: 3 counts
>>
>> So, it appears that having a single map for all named objects makes
>> sense and would also simplify the implementation, what do you think?
>
> Some more numbers for my config:
> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> - 43575 funcs, log2 43575 = 15.4
> Thus, having separate map for types vs functions might save ~1.7
> search iterations. Is this a significant slowdown in practice?
>

Thank you, I haven't conducted such a comparative test yet, but I agree
with your point of view.

>>
>> Thanks,
>> Eduard
>>
>>>
>>> Signed-off-by: Donglin Peng <[email protected]>
>>> ---
>>> Changes in RFC v2:
>>> - Fix the build issue reported by kernel test robot <[email protected]>
>>> ---
>>> include/linux/btf.h | 1 +
>>> kernel/bpf/btf.c | 300 ++++++++++++++++++++++++++++++++++++++++++--
>>> 2 files changed, 291 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/include/linux/btf.h b/include/linux/btf.h
>>> index cac9f304e27a..6260a0668773 100644
>>> --- a/include/linux/btf.h
>>> +++ b/include/linux/btf.h
>>> @@ -201,6 +201,7 @@ bool btf_is_kernel(const struct btf *btf);
>>> bool btf_is_module(const struct btf *btf);
>>> struct module *btf_try_get_module(const struct btf *btf);
>>> u32 btf_nr_types(const struct btf *btf);
>>> +u32 btf_type_cnt(const struct btf *btf);
>>> bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
>>> const struct btf_member *m,
>>> u32 expected_offset, u32 expected_size);
>>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>>> index 817204d53372..51aa9f27853b 100644
>>> --- a/kernel/bpf/btf.c
>>> +++ b/kernel/bpf/btf.c
>>> @@ -240,6 +240,26 @@ struct btf_id_dtor_kfunc_tab {
>>> struct btf_id_dtor_kfunc dtors[];
>>> };
>>>
>>> +enum {
>>> + BTF_ID_NAME_FUNC, /* function */
>>> + BTF_ID_NAME_STRUCT, /* struct */
>>> + BTF_ID_NAME_MAX
>>> +};
>>> +
>>> +struct btf_id_name {
>>> + int id;
>>> + u32 name_off;
>>> +};
>>> +
>>> +struct btf_id_name_map {
>>> + struct btf_id_name *id_name;
>>> + u32 count;
>>> +};
>>> +
>>> +struct btf_id_name_maps {
>>> + struct btf_id_name_map map[BTF_ID_NAME_MAX];
>>> +};
>>> +
>>> struct btf {
>>> void *data;
>>> struct btf_type **types;
>>> @@ -257,6 +277,7 @@ struct btf {
>>> struct btf_kfunc_set_tab *kfunc_set_tab;
>>> struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
>>> struct btf_struct_metas *struct_meta_tab;
>>> + struct btf_id_name_maps *id_name_maps;
>>>
>>> /* split BTF support */
>>> struct btf *base_btf;
>>> @@ -532,22 +553,142 @@ u32 btf_nr_types(const struct btf *btf)
>>> return total;
>>> }
>>>
>>> +u32 btf_type_cnt(const struct btf *btf)
>>> +{
>>> + return btf->start_id + btf->nr_types;
>>> +}
>>> +
>>> +static inline u8 btf_id_name_idx_to_kind(int index)
>>> +{
>>> + u8 kind;
>>> +
>>> + switch (index) {
>>> + case BTF_ID_NAME_FUNC:
>>> + kind = BTF_KIND_FUNC;
>>> + break;
>>> + case BTF_ID_NAME_STRUCT:
>>> + kind = BTF_KIND_STRUCT;
>>> + break;
>>> + default:
>>> + kind = BTF_KIND_UNKN;
>>> + break;
>>> + }
>>> +
>>> + return kind;
>>> +}
>>> +
>>> +static inline int btf_id_name_kind_to_idx(u8 kind)
>>> +{
>>> + int index;
>>> +
>>> + switch (kind) {
>>> + case BTF_KIND_FUNC:
>>> + index = BTF_ID_NAME_FUNC;
>>> + break;
>>> + case BTF_KIND_STRUCT:
>>> + index = BTF_ID_NAME_STRUCT;
>>> + break;
>>> + default:
>>> + index = -1;
>>> + break;
>>> + }
>>> +
>>> + return index;
>>> +}
>>> +
>>> +static s32 btf_find_by_name_bsearch(struct btf_id_name *id_name,
>>> + u32 size, const char *name,
>>> + struct btf_id_name **start,
>>> + struct btf_id_name **end,
>>> + const struct btf *btf)
>>> +{
>>> + int ret;
>>> + int low, mid, high;
>>> + const char *name_buf;
>>> +
>>> + low = 0;
>>> + high = size - 1;
>>> +
>>> + while (low <= high) {
>>> + mid = low + (high - low) / 2;
>>> + name_buf = btf_name_by_offset(btf, id_name[mid].name_off);
>>> + ret = strcmp(name, name_buf);
>>> + if (ret > 0)
>>> + low = mid + 1;
>>> + else if (ret < 0)
>>> + high = mid - 1;
>>> + else
>>> + break;
>>> + }
>>> +
>>> + if (low > high)
>>> + return -ESRCH;
>>> +
>>> + if (start) {
>>> + low = mid;
>>> + while (low) {
>>> + name_buf = btf_name_by_offset(btf, id_name[low-1].name_off);
>>> + if (strcmp(name, name_buf))
>>> + break;
>>> + low--;
>>> + }
>>> + *start = &id_name[low];
>>> + }
>>> +
>>> + if (end) {
>>> + high = mid;
>>> + while (high < size - 1) {
>>> + name_buf = btf_name_by_offset(btf, id_name[high+1].name_off);
>>> + if (strcmp(name, name_buf))
>>> + break;
>>> + high++;
>>> + }
>>> + *end = &id_name[high];
>>> + }
>>> +
>>> + return id_name[mid].id;
>>> +}
>>> +
>>> s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
>>> {
>>> + const struct btf_id_name_maps *maps;
>>> + const struct btf_id_name_map *map;
>>> + struct btf_id_name *start;
>>> const struct btf_type *t;
>>> const char *tname;
>>> - u32 i, total;
>>> + int index = btf_id_name_kind_to_idx(kind);
>>> + s32 id, total;
>>>
>>> - total = btf_nr_types(btf);
>>> - for (i = 1; i < total; i++) {
>>> - t = btf_type_by_id(btf, i);
>>> - if (BTF_INFO_KIND(t->info) != kind)
>>> - continue;
>>> + do {
>>> + maps = btf->id_name_maps;
>>> + if (index >= 0 && maps && maps->map[index].id_name) {
>>> + /* binary search */
>>> + map = &maps->map[index];
>>> + id = btf_find_by_name_bsearch(map->id_name,
>>> + map->count, name, &start, NULL, btf);
>>> + if (id > 0) {
>>> + /*
>>> + * Return the first one that
>>> + * matched
>>> + */
>>> + return start->id;
>>> + }
>>> + } else {
>>> + /* linear search */
>>> + total = btf_type_cnt(btf);
>>> + for (id = btf->start_id; id < total; id++) {
>>> + t = btf_type_by_id(btf, id);
>>> + if (BTF_INFO_KIND(t->info) != kind)
>>> + continue;
>>> +
>>> + tname = btf_name_by_offset(btf, t->name_off);
>>> + if (!strcmp(tname, name))
>>> + return id;
>>> + }
>>> + }
>>>
>>> - tname = btf_name_by_offset(btf, t->name_off);
>>> - if (!strcmp(tname, name))
>>> - return i;
>>> - }
>>> + btf = btf->base_btf;
>>> + } while (btf);
>>>
>>> return -ENOENT;
>>> }
>>> @@ -1639,6 +1780,32 @@ static void btf_free_id(struct btf *btf)
>>> spin_unlock_irqrestore(&btf_idr_lock, flags);
>>> }
>>>
>>> +static void btf_destroy_id_name(struct btf *btf, int index)
>>> +{
>>> + struct btf_id_name_maps *maps = btf->id_name_maps;
>>> + struct btf_id_name_map *map = &maps->map[index];
>>> +
>>> + if (map->id_name) {
>>> + kvfree(map->id_name);
>>> + map->id_name = NULL;
>>> + map->count = 0;
>>> + }
>>> +}
>>> +
>>> +static void btf_destroy_id_name_map(struct btf *btf)
>>> +{
>>> + int i;
>>> +
>>> + if (!btf->id_name_maps)
>>> + return;
>>> +
>>> + for (i = 0; i < BTF_ID_NAME_MAX; i++)
>>> + btf_destroy_id_name(btf, i);
>>> +
>>> + kfree(btf->id_name_maps);
>>> + btf->id_name_maps = NULL;
>>> +}
>>> +
>>> static void btf_free_kfunc_set_tab(struct btf *btf)
>>> {
>>> struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
>>> @@ -1689,6 +1856,7 @@ static void btf_free_struct_meta_tab(struct btf *btf)
>>>
>>> static void btf_free(struct btf *btf)
>>> {
>>> + btf_destroy_id_name_map(btf);
>>> btf_free_struct_meta_tab(btf);
>>> btf_free_dtor_kfunc_tab(btf);
>>> btf_free_kfunc_set_tab(btf);
>>> @@ -5713,6 +5881,107 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
>>> return kctx_type_id;
>>> }
>>>
>>> +static int btf_compare_id_name(const void *a, const void *b, const void *priv)
>>> +{
>>> + const struct btf_id_name *ia = (const struct btf_id_name *)a;
>>> + const struct btf_id_name *ib = (const struct btf_id_name *)b;
>>> + const struct btf *btf = priv;
>>> + int ret;
>>> +
>>> + /*
>>> + * Sort names in ascending order, if the name is same, sort ids in
>>> + * ascending order.
>>> + */
>>> + ret = strcmp(btf_name_by_offset(btf, ia->name_off),
>>> + btf_name_by_offset(btf, ib->name_off));
>>> + if (!ret)
>>> + ret = ia->id - ib->id;
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int btf_create_id_name(struct btf *btf, int index)
>>> +{
>>> + struct btf_id_name_maps *maps = btf->id_name_maps;
>>> + struct btf_id_name_map *map = &maps->map[index];
>>> + const struct btf_type *t;
>>> + struct btf_id_name *id_name;
>>> + const char *name;
>>> + int i, j = 0;
>>> + u32 total, count = 0;
>>> + u8 kind;
>>> +
>>> + kind = btf_id_name_idx_to_kind(index);
>>> + if (kind == BTF_KIND_UNKN)
>>> + return -EINVAL;
>>> +
>>> + if (map->id_name || map->count != 0)
>>> + return -EINVAL;
>>> +
>>> + total = btf_type_cnt(btf);
>>> + for (i = btf->start_id; i < total; i++) {
>>> + t = btf_type_by_id(btf, i);
>>> + if (BTF_INFO_KIND(t->info) != kind)
>>> + continue;
>>> + name = btf_name_by_offset(btf, t->name_off);
>>> + if (str_is_empty(name))
>>> + continue;
>>> + count++;
>>> + }
>>> +
>>> + if (count == 0)
>>> + return 0;
>>> +
>>> + id_name = kvcalloc(count, sizeof(struct btf_id_name),
>>> + GFP_KERNEL);
>>> + if (!id_name)
>>> + return -ENOMEM;
>>> +
>>> + for (i = btf->start_id; i < total; i++) {
>>> + t = btf_type_by_id(btf, i);
>>> + if (BTF_INFO_KIND(t->info) != kind)
>>> + continue;
>>> + name = btf_name_by_offset(btf, t->name_off);
>>> + if (str_is_empty(name))
>>> + continue;
>>> +
>>> + id_name[j].id = i;
>>> + id_name[j].name_off = t->name_off;
>>> + j++;
>>> + }
>>> +
>>> + sort_r(id_name, count, sizeof(id_name[0]), btf_compare_id_name,
>>> + NULL, btf);
>>> +
>>> + map->id_name = id_name;
>>> + map->count = count;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int btf_create_id_name_map(struct btf *btf)
>>> +{
>>> + int err, i;
>>> + struct btf_id_name_maps *maps;
>>> +
>>> + if (btf->id_name_maps)
>>> + return -EBUSY;
>>> +
>>> + maps = kzalloc(sizeof(struct btf_id_name_maps), GFP_KERNEL);
>>> + if (!maps)
>>> + return -ENOMEM;
>>> +
>>> + btf->id_name_maps = maps;
>>> +
>>> + for (i = 0; i < BTF_ID_NAME_MAX; i++) {
>>> + err = btf_create_id_name(btf, i);
>>> + if (err < 0)
>>> + break;
>>> + }
>>> +
>>> + return err;
>>> +}
>>> +
>>> BTF_ID_LIST(bpf_ctx_convert_btf_id)
>>> BTF_ID(struct, bpf_ctx_convert)
>>>
>>> @@ -5760,6 +6029,10 @@ struct btf *btf_parse_vmlinux(void)
>>> if (err)
>>> goto errout;
>>>
>>> + err = btf_create_id_name_map(btf);
>>> + if (err)
>>> + goto errout;
>>> +
>>> /* btf_parse_vmlinux() runs under bpf_verifier_lock */
>>> bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
>>>
>>> @@ -5777,6 +6050,7 @@ struct btf *btf_parse_vmlinux(void)
>>> errout:
>>> btf_verifier_env_free(env);
>>> if (btf) {
>>> + btf_destroy_id_name_map(btf);
>>> kvfree(btf->types);
>>> kfree(btf);
>>> }
>>> @@ -5844,13 +6118,19 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
>>> if (err)
>>> goto errout;
>>>
>>> + err = btf_create_id_name_map(btf);
>>> + if (err)
>>> + goto errout;
>>> +
>>> btf_verifier_env_free(env);
>>> refcount_set(&btf->refcnt, 1);
>>> +
>>> return btf;
>>>
>>> errout:
>>> btf_verifier_env_free(env);
>>> if (btf) {
>>> + btf_destroy_id_name_map(btf);
>>> kvfree(btf->data);
>>> kvfree(btf->types);
>>> kfree(btf);
>>
>
>

2023-09-13 19:13:13

by Eduard Zingerman

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Wed, 2023-09-13 at 14:34 +0100, Alan Maguire wrote:
> On 13/09/2023 11:32, pengdonglin wrote:
> > On 2023/9/13 2:46, Alexei Starovoitov wrote:
> > > On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
> > > wrote:
> > > >
> > > > On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
> > > > > On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
> > > > > wrote:
> > > > > >
> > > > > > On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> > > > > > > On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> > > > > > > > Currently, we are only using the linear search method to find the
> > > > > > > > type id
> > > > > > > > by the name, which has a time complexity of O(n). This change
> > > > > > > > involves
> > > > > > > > sorting the names of btf types in ascending order and using
> > > > > > > > binary search,
> > > > > > > > which has a time complexity of O(log(n)). This idea was inspired
> > > > > > > > by the
> > > > > > > > following patch:
> > > > > > > >
> > > > > > > > 60443c88f3a8 ("kallsyms: Improve the performance of
> > > > > > > > kallsyms_lookup_name()").
> > > > > > > >
> > > > > > > > At present, this improvement is only for searching in vmlinux's and
> > > > > > > > module's BTFs, and the kind should only be BTF_KIND_FUNC or
> > > > > > > > BTF_KIND_STRUCT.
> > > > > > > >
> > > > > > > > Another change is the search direction, where we search the BTF
> > > > > > > > first and
> > > > > > > > then its base, the type id of the first matched btf_type will be
> > > > > > > > returned.
> > > > > > > >
> > > > > > > > Here is a time-consuming result that finding all the type ids of
> > > > > > > > 67,819 kernel
> > > > > > > > functions in vmlinux's BTF by their names:
> > > > > > > >
> > > > > > > > Before: 17000 ms
> > > > > > > > After:     10 ms
> > > > > > > >
> > > > > > > > The average lookup performance has improved about 1700x at the
> > > > > > > > above scenario.
> > > > > > > >
> > > > > > > > However, this change will consume more memory, for example,
> > > > > > > > 67,819 kernel
> > > > > > > > functions will allocate about 530KB memory.
> > > > > > >
> > > > > > > Hi Donglin,
> > > > > > >
> > > > > > > I think this is a good improvement. However, I wonder, why did you
> > > > > > > choose to have a separate name map for each BTF kind?
> > > > > > >
> > > > > > > I did some analysis for my local testing kernel config and got
> > > > > > > such numbers:
> > > > > > > - total number of BTF objects: 97350
> > > > > > > - number of FUNC and STRUCT objects: 51597
> > > > > > > - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
> > > > > > > objects: 56817
> > > > > > >    (these are all kinds for which lookup by name might make sense)
> > > > > > > - number of named objects: 54246
> > > > > > > - number of name collisions:
> > > > > > >    - unique names: 53985 counts
> > > > > > >    - 2 objects with the same name: 129 counts
> > > > > > >    - 3 objects with the same name: 3 counts
> > > > > > >
> > > > > > > So, it appears that having a single map for all named objects makes
> > > > > > > sense and would also simplify the implementation, what do you think?
> > > > > >
> > > > > > Some more numbers for my config:
> > > > > > - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> > > > > > - 43575 funcs, log2 43575 = 15.4
> > > > > > Thus, having separate map for types vs functions might save ~1.7
> > > > > > search iterations. Is this a significant slowdown in practice?
> > > > >
> > > > > What do you propose to do in case of duplicates ?
> > > > > func and struct can have the same name, but they will have two
> > > > > different
> > > > > btf_ids. How do we store them ?
> > > > > Also we might add global vars to BTF. Such request came up several
> > > > > times.
> > > > > So we need to make sure our search approach scales to
> > > > > func, struct, vars. I don't recall whether we search any other kinds.
> > > > > Separate arrays for different kinds seems ok.
> > > > > It's a bit of code complexity, but it's not an increase in memory.
> > > >
> > > > Binary search gives, say, lowest index of a thing with name A, then
> > > > increment index while name remains A looking for correct kind.
> > > > Given the name conflicts info from above, 99% of times there would be
> > > > no need to iterate and in very few cases there would a couple of
> > > > iterations.
> > > >
> > > > Same logic would be necessary with current approach if different BTF
> > > > kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
> > > > cohorts are mainly a way to split the tree for faster lookups, but
> > > > maybe that is not the main intent.
> > > >
> > > > > With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
> > > > > extra memory. That's quite a bit. Anything we can do to compress it?
> > > >
> > > > That's an interesting question, from the top of my head:
> > > > pre-sort in pahole (re-assign IDs so that increasing ID also would
> > > > mean "increasing" name), shouldn't be that difficult.
> > >
> > > That sounds great. kallsyms are pre-sorted at build time.
> > > We should do the same with BTF.
> > > I think GCC can emit BTF directly now and LLVM emits it for bpf progs
> > > too,
> > > but since vmlinux and kernel module BTFs will keep being processed
> > > through pahole we don't have to make gcc/llvm sort things right away.
> > > pahole will be enough. The kernel might do 'is it sorted' check
> > > during BTF validation and then use binary search or fall back to linear
> > > when not-sorted == old pahole.
> > >
> >
> > Yeah, I agree and will attempt to modify the pahole and perform a test.
> > Do we need
> > to introduce a new macro to control the behavior when the BTF is not
> > sorted? If
> > it is not sorted, we can use the method mentioned in this patch or use
> > linear
> > search.
> >
> >
>
> One challenge with pahole is that it often runs in parallel mode, so I
> suspect any sorting would have to be done after merging across threads.
> Perhaps BTF deduplication time might be a useful time to re-sort by
> name? BTF dedup happens after BTF has been merged, and a new "sorted"
> btf_dedup_opts option could be added and controlled by a pahole
> option. However dedup is pretty complicated already..

Hi Alan,

libbpf might be the right place to do this, however, I think that it is
also doable in pahole's btf_encoder__encode(), e.g. as follows:
- after a call to btf__dedup():
- create a sorted by name IDs ordering;
- create a new BTF object;
- add records to the new BTF according to the sorted ordering;
- remap id references while adding;
- use the new BTF object instead of old one to write BTF output.

I assume that implementation would be similar regardless whether it is
done in pahole or in libbpf.

Thanks,
Eduard

> One thing we should weigh up though is if there are benefits to the
> way BTF is currently laid out. It tends to start with base types,
> and often-encountered types end up being located towards the start
> of the BTF data. For example
>
>
> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
> [2] CONST '(anon)' type_id=1
> [3] VOLATILE '(anon)' type_id=1
> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
> [5] PTR '(anon)' type_id=8
> [6] CONST '(anon)' type_id=5
> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> [8] CONST '(anon)' type_id=7
> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
> [10] CONST '(anon)' type_id=9
> [11] TYPEDEF '__s8' type_id=12
> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> [13] TYPEDEF '__u8' type_id=14
>
> So often-used types will be found quickly, even under linear search
> conditions.
>
> When we look at how many lookups by id (which are O(1), since they are
> done via the btf->types[] array) versus by name, we see:
>
> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
> 120
> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
> 15
>
> I don't see a huge number of name-based lookups, and I think most are
> outside of the hotter codepaths, unless I'm missing some. All of which
> is to say it would be a good idea to have a clear sense of what will get
> faster with sorted-by-name BTF. Thanks!
>
> Alan

2023-09-13 21:16:07

by Alan Maguire

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 13/09/2023 11:32, pengdonglin wrote:
> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>> wrote:
>>>
>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
>>>> wrote:
>>>>>
>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>> Currently, we are only using the linear search method to find the
>>>>>>> type id
>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>> involves
>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>> binary search,
>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>> by the
>>>>>>> following patch:
>>>>>>>
>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>> kallsyms_lookup_name()").
>>>>>>>
>>>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>> BTF_KIND_STRUCT.
>>>>>>>
>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>> first and
>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>> returned.
>>>>>>>
>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>> 67,819 kernel
>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>
>>>>>>> Before: 17000 ms
>>>>>>> After:     10 ms
>>>>>>>
>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>> above scenario.
>>>>>>>
>>>>>>> However, this change will consume more memory, for example,
>>>>>>> 67,819 kernel
>>>>>>> functions will allocate about 530KB memory.
>>>>>>
>>>>>> Hi Donglin,
>>>>>>
>>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>
>>>>>> I did some analysis for my local testing kernel config and got
>>>>>> such numbers:
>>>>>> - total number of BTF objects: 97350
>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>> objects: 56817
>>>>>>    (these are all kinds for which lookup by name might make sense)
>>>>>> - number of named objects: 54246
>>>>>> - number of name collisions:
>>>>>>    - unique names: 53985 counts
>>>>>>    - 2 objects with the same name: 129 counts
>>>>>>    - 3 objects with the same name: 3 counts
>>>>>>
>>>>>> So, it appears that having a single map for all named objects makes
>>>>>> sense and would also simplify the implementation, what do you think?
>>>>>
>>>>> Some more numbers for my config:
>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>> search iterations. Is this a significant slowdown in practice?
>>>>
>>>> What do you propose to do in case of duplicates ?
>>>> func and struct can have the same name, but they will have two
>>>> different
>>>> btf_ids. How do we store them ?
>>>> Also we might add global vars to BTF. Such request came up several
>>>> times.
>>>> So we need to make sure our search approach scales to
>>>> func, struct, vars. I don't recall whether we search any other kinds.
>>>> Separate arrays for different kinds seems ok.
>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>
>>> Binary search gives, say, lowest index of a thing with name A, then
>>> increment index while name remains A looking for correct kind.
>>> Given the name conflicts info from above, 99% of times there would be
>>> no need to iterate and in very few cases there would a couple of
>>> iterations.
>>>
>>> Same logic would be necessary with current approach if different BTF
>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>>> cohorts are mainly a way to split the tree for faster lookups, but
>>> maybe that is not the main intent.
>>>
>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>>
>>> That's an interesting question, from the top of my head:
>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>> mean "increasing" name), shouldn't be that difficult.
>>
>> That sounds great. kallsyms are pre-sorted at build time.
>> We should do the same with BTF.
>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>> too,
>> but since vmlinux and kernel module BTFs will keep being processed
>> through pahole we don't have to make gcc/llvm sort things right away.
>> pahole will be enough. The kernel might do 'is it sorted' check
>> during BTF validation and then use binary search or fall back to linear
>> when not-sorted == old pahole.
>>
>
> Yeah, I agree and will attempt to modify the pahole and perform a test.
> Do we need
> to introduce a new macro to control the behavior when the BTF is not
> sorted? If
> it is not sorted, we can use the method mentioned in this patch or use
> linear
> search.
>
>

One challenge with pahole is that it often runs in parallel mode, so I
suspect any sorting would have to be done after merging across threads.
Perhaps BTF deduplication time might be a useful time to re-sort by
name? BTF dedup happens after BTF has been merged, and a new "sorted"
btf_dedup_opts option could be added and controlled by a pahole
option. However, dedup is pretty complicated already.

One thing we should weigh up though is if there are benefits to the
way BTF is currently laid out. It tends to start with base types,
and often-encountered types end up being located towards the start
of the BTF data. For example


[1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
[2] CONST '(anon)' type_id=1
[3] VOLATILE '(anon)' type_id=1
[4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
[5] PTR '(anon)' type_id=8
[6] CONST '(anon)' type_id=5
[7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[8] CONST '(anon)' type_id=7
[9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
[10] CONST '(anon)' type_id=9
[11] TYPEDEF '__s8' type_id=12
[12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
[13] TYPEDEF '__u8' type_id=14

So often-used types will be found quickly, even under linear search
conditions.

When we look at how many lookups by id (which are O(1), since they are
done via the btf->types[] array) versus by name, we see:

$ grep btf_type_by_id kernel/bpf/*.c|wc -l
120
$ grep btf_find_by_nam kernel/bpf/*.c|wc -l
15

I don't see a huge number of name-based lookups, and I think most are
outside of the hotter codepaths, unless I'm missing some. All of which
is to say it would be a good idea to have a clear sense of what will get
faster with sorted-by-name BTF. Thanks!

Alan

2023-09-13 22:30:18

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 2:46, Alexei Starovoitov wrote:
> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]> wrote:
>>
>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]> wrote:
>>>>
>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>> Currently, we are only using the linear search method to find the type id
>>>>>> by the name, which has a time complexity of O(n). This change involves
>>>>>> sorting the names of btf types in ascending order and using binary search,
>>>>>> which has a time complexity of O(log(n)). This idea was inspired by the
>>>>>> following patch:
>>>>>>
>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of kallsyms_lookup_name()").
>>>>>>
>>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or BTF_KIND_STRUCT.
>>>>>>
>>>>>> Another change is the search direction, where we search the BTF first and
>>>>>> then its base, the type id of the first matched btf_type will be returned.
>>>>>>
>>>>>> Here is a time-consuming result that finding all the type ids of 67,819 kernel
>>>>>> functions in vmlinux's BTF by their names:
>>>>>>
>>>>>> Before: 17000 ms
>>>>>> After: 10 ms
>>>>>>
>>>>>> The average lookup performance has improved about 1700x at the above scenario.
>>>>>>
>>>>>> However, this change will consume more memory, for example, 67,819 kernel
>>>>>> functions will allocate about 530KB memory.
>>>>>
>>>>> Hi Donglin,
>>>>>
>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>> choose to have a separate name map for each BTF kind?
>>>>>
>>>>> I did some analysis for my local testing kernel config and got such numbers:
>>>>> - total number of BTF objects: 97350
>>>>> - number of FUNC and STRUCT objects: 51597
>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC objects: 56817
>>>>> (these are all kinds for which lookup by name might make sense)
>>>>> - number of named objects: 54246
>>>>> - number of name collisions:
>>>>> - unique names: 53985 counts
>>>>> - 2 objects with the same name: 129 counts
>>>>> - 3 objects with the same name: 3 counts
>>>>>
>>>>> So, it appears that having a single map for all named objects makes
>>>>> sense and would also simplify the implementation, what do you think?
>>>>
>>>> Some more numbers for my config:
>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>> - 43575 funcs, log2 43575 = 15.4
>>>> Thus, having separate map for types vs functions might save ~1.7
>>>> search iterations. Is this a significant slowdown in practice?
>>>
>>> What do you propose to do in case of duplicates ?
>>> func and struct can have the same name, but they will have two different
>>> btf_ids. How do we store them ?
>>> Also we might add global vars to BTF. Such request came up several times.
>>> So we need to make sure our search approach scales to
>>> func, struct, vars. I don't recall whether we search any other kinds.
>>> Separate arrays for different kinds seems ok.
>>> It's a bit of code complexity, but it's not an increase in memory.
>>
>> Binary search gives, say, lowest index of a thing with name A, then
>> increment index while name remains A looking for correct kind.
>> Given the name conflicts info from above, 99% of times there would be
>> no need to iterate and in very few cases there would a couple of iterations.
>>
>> Same logic would be necessary with current approach if different BTF
>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>> cohorts are mainly a way to split the tree for faster lookups, but
>> maybe that is not the main intent.
>>
>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>
>> That's an interesting question, from the top of my head:
>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>> mean "increasing" name), shouldn't be that difficult.
>
> That sounds great. kallsyms are pre-sorted at build time.
> We should do the same with BTF.
> I think GCC can emit BTF directly now and LLVM emits it for bpf progs too,
> but since vmlinux and kernel module BTFs will keep being processed
> through pahole we don't have to make gcc/llvm sort things right away.
> pahole will be enough. The kernel might do 'is it sorted' check
> during BTF validation and then use binary search or fall back to linear
> when not-sorted == old pahole.
>

Yeah, I agree and will attempt to modify pahole and run a test. Do we need
to introduce a new macro to control the behavior when the BTF is not sorted? If
it is not sorted, we can use the method mentioned in this patch or fall back to
linear search.
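
For instance, a check along these lines at BTF load time could pick the search
strategy per BTF object, so perhaps no new macro is needed (only a sketch that
reuses helpers from this patch; where the result gets stored, e.g. a new flag
in struct btf, is left open):

static bool btf_names_sorted(const struct btf *btf)
{
	u32 i, total = btf_type_cnt(btf);
	const char *prev = "";

	for (i = btf->start_id; i < total; i++) {
		const struct btf_type *t = btf_type_by_id(btf, i);
		const char *name = btf_name_by_offset(btf, t->name_off);

		if (str_is_empty(name))
			continue;
		/* any named type out of order means: fall back to linear search */
		if (strcmp(prev, name) > 0)
			return false;
		prev = name;
	}
	return true;
}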

2023-09-14 10:41:53

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 21:34, Alan Maguire wrote:
> On 13/09/2023 11:32, pengdonglin wrote:
>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>> wrote:
>>>>
>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
>>>>> wrote:
>>>>>>
>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>> Currently, we are only using the linear search method to find the
>>>>>>>> type id
>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>> involves
>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>> binary search,
>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>> by the
>>>>>>>> following patch:
>>>>>>>>
>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>
>>>>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>
>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>> first and
>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>> returned.
>>>>>>>>
>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>> 67,819 kernel
>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>
>>>>>>>> Before: 17000 ms
>>>>>>>> After:     10 ms
>>>>>>>>
>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>> above scenario.
>>>>>>>>
>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>> 67,819 kernel
>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>
>>>>>>> Hi Donglin,
>>>>>>>
>>>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>
>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>> such numbers:
>>>>>>> - total number of BTF objects: 97350
>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>> objects: 56817
>>>>>>>    (these are all kinds for which lookup by name might make sense)
>>>>>>> - number of named objects: 54246
>>>>>>> - number of name collisions:
>>>>>>>    - unique names: 53985 counts
>>>>>>>    - 2 objects with the same name: 129 counts
>>>>>>>    - 3 objects with the same name: 3 counts
>>>>>>>
>>>>>>> So, it appears that having a single map for all named objects makes
>>>>>>> sense and would also simplify the implementation, what do you think?
>>>>>>
>>>>>> Some more numbers for my config:
>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>
>>>>> What do you propose to do in case of duplicates ?
>>>>> func and struct can have the same name, but they will have two
>>>>> different
>>>>> btf_ids. How do we store them ?
>>>>> Also we might add global vars to BTF. Such request came up several
>>>>> times.
>>>>> So we need to make sure our search approach scales to
>>>>> func, struct, vars. I don't recall whether we search any other kinds.
>>>>> Separate arrays for different kinds seems ok.
>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>
>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>> increment index while name remains A looking for correct kind.
>>>> Given the name conflicts info from above, 99% of times there would be
>>>> no need to iterate and in very few cases there would a couple of
>>>> iterations.
>>>>
>>>> Same logic would be necessary with current approach if different BTF
>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>> maybe that is not the main intent.
>>>>
>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>>>
>>>> That's an interesting question, from the top of my head:
>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>>> mean "increasing" name), shouldn't be that difficult.
>>>
>>> That sounds great. kallsyms are pre-sorted at build time.
>>> We should do the same with BTF.
>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>> too,
>>> but since vmlinux and kernel module BTFs will keep being processed
>>> through pahole we don't have to make gcc/llvm sort things right away.
>>> pahole will be enough. The kernel might do 'is it sorted' check
>>> during BTF validation and then use binary search or fall back to linear
>>> when not-sorted == old pahole.
>>>
>>
>> Yeah, I agree and will attempt to modify the pahole and perform a test.
>> Do we need
>> to introduce a new macro to control the behavior when the BTF is not
>> sorted? If
>> it is not sorted, we can use the method mentioned in this patch or use
>> linear
>> search.
>>
>>
>
> One challenge with pahole is that it often runs in parallel mode, so I
> suspect any sorting would have to be done after merging across threads.
> Perhaps BTF deduplication time might be a useful time to re-sort by
> name? BTF dedup happens after BTF has been merged, and a new "sorted"
> btf_dedup_opts option could be added and controlled by a pahole
> option. However dedup is pretty complicated already..
>
> One thing we should weigh up though is if there are benefits to the
> way BTF is currently laid out. It tends to start with base types,
> and often-encountered types end up being located towards the start
> of the BTF data. For example
>
>
> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
> [2] CONST '(anon)' type_id=1
> [3] VOLATILE '(anon)' type_id=1
> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
> [5] PTR '(anon)' type_id=8
> [6] CONST '(anon)' type_id=5
> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> [8] CONST '(anon)' type_id=7
> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
> [10] CONST '(anon)' type_id=9
> [11] TYPEDEF '__s8' type_id=12
> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> [13] TYPEDEF '__u8' type_id=14
>
> So often-used types will be found quickly, even under linear search
> conditions.

I found that there seems to be no code in the kernel that gets the ID of a
basic data type by calling btf_find_by_name_kind directly. The typical usage
of this function is to obtain the ID of a structure or function. Once we have
the ID of a structure or function, getting the IDs of its members or
parameters is O(1).

./kernel/trace/trace_probe.c:383: id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC);
./kernel/bpf/btf.c:3523: id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
./kernel/bpf/btf.c:5504: id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
./kernel/bpf/bpf_struct_ops.c:128: module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
./net/ipv4/bpf_tcp_ca.c:28: type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
./net/ipv4/bpf_tcp_ca.c:33: type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
./net/netfilter/nf_bpf_link.c:181: type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);

>
> When we look at how many lookups by id (which are O(1), since they are
> done via the btf->types[] array) versus by name, we see:
>
> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
> 120
> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
> 15
>
> I don't see a huge number of name-based lookups, and I think most are
> outside of the hotter codepaths, unless I'm missing some. All of which
> is to say it would be a good idea to have a clear sense of what will get
> faster with sorted-by-name BTF. Thanks!

The story goes like this.

I have added a new feature called "funcgraph_retval" to the function graph
tracer; here is the link:

https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/

With this feature we can obtain the return value of almost every function
executed in the kernel, which helps us analyze problems.

However, this feature has two main drawbacks.

1. Even if a function's return type is void, a return value will still be printed.

2. The printed return value may be incorrect when the width of the return type
is smaller than that of a general-purpose register.

I think if we can get the function's return type, then the drawbacks mentioned
above can be eliminated. btf_find_by_name_kind can be used to get the ID of the
kernel function, and from that we can easily get its return type. If the return
type is void, the recorded return value will not be printed. If the width of the
return type is smaller than a general-purpose register, the value stored in the
upper bits will be trimmed. I have written a demo and both drawbacks were resolved.
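
Roughly, the demo does something like the following when printing a return
value (only a sketch with error handling trimmed; the helper name and the way
the result is passed back are made up for illustration):

static u64 fgraph_fixup_retval(const struct btf *btf, const char *func_name,
			       u64 retval, bool *is_void)
{
	const struct btf_type *t;
	s32 id;

	*is_void = false;

	id = btf_find_by_name_kind(btf, func_name, BTF_KIND_FUNC);
	if (id <= 0)
		return retval;			/* unknown function, print as-is */

	/* FUNC -> FUNC_PROTO, whose ->type is the return type (0 means void) */
	t = btf_type_by_id(btf, btf_type_by_id(btf, id)->type);
	if (!t->type) {
		*is_void = true;
		return 0;
	}

	t = btf_type_skip_modifiers(btf, t->type, NULL);
	/* only scalar returns narrower than a register need trimming */
	if ((BTF_INFO_KIND(t->info) == BTF_KIND_INT ||
	     BTF_INFO_KIND(t->info) == BTF_KIND_ENUM) &&
	    t->size < sizeof(u64))
		retval &= (1ULL << (t->size * 8)) - 1;

	return retval;
}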

However, during my test I found that reading the trace log with this feature
enabled took too much time, because the trace log consists of 200,000 lines.
The majority of the time was consumed by btf_find_by_name_kind, which was
called 200,000 times.

So I think the performance of btf_find_by_name_kind may need to be improved.

>
> Alan
>

2023-09-14 12:31:59

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/13 21:45, Eduard Zingerman wrote:
> On Wed, 2023-09-13 at 14:34 +0100, Alan Maguire wrote:
>> On 13/09/2023 11:32, pengdonglin wrote:
>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>>> wrote:
>>>>>
>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>>> Currently, we are only using the linear search method to find the
>>>>>>>>> type id
>>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>>> involves
>>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>>> binary search,
>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>>> by the
>>>>>>>>> following patch:
>>>>>>>>>
>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>>
>>>>>>>>> At present, this improvement is only for searching in vmlinux's and
>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>>
>>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>>> first and
>>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>>> returned.
>>>>>>>>>
>>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>>> 67,819 kernel
>>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>>
>>>>>>>>> Before: 17000 ms
>>>>>>>>> After:     10 ms
>>>>>>>>>
>>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>>> above scenario.
>>>>>>>>>
>>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>>> 67,819 kernel
>>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>>
>>>>>>>> Hi Donglin,
>>>>>>>>
>>>>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>>
>>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>>> such numbers:
>>>>>>>> - total number of BTF objects: 97350
>>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>>> objects: 56817
>>>>>>>>    (these are all kinds for which lookup by name might make sense)
>>>>>>>> - number of named objects: 54246
>>>>>>>> - number of name collisions:
>>>>>>>>    - unique names: 53985 counts
>>>>>>>>    - 2 objects with the same name: 129 counts
>>>>>>>>    - 3 objects with the same name: 3 counts
>>>>>>>>
>>>>>>>> So, it appears that having a single map for all named objects makes
>>>>>>>> sense and would also simplify the implementation, what do you think?
>>>>>>>
>>>>>>> Some more numbers for my config:
>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>>
>>>>>> What do you propose to do in case of duplicates ?
>>>>>> func and struct can have the same name, but they will have two
>>>>>> different
>>>>>> btf_ids. How do we store them ?
>>>>>> Also we might add global vars to BTF. Such request came up several
>>>>>> times.
>>>>>> So we need to make sure our search approach scales to
>>>>>> func, struct, vars. I don't recall whether we search any other kinds.
>>>>>> Separate arrays for different kinds seems ok.
>>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>>
>>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>>> increment index while name remains A looking for correct kind.
>>>>> Given the name conflicts info from above, 99% of times there would be
>>>>> no need to iterate and in very few cases there would a couple of
>>>>> iterations.
>>>>>
>>>>> Same logic would be necessary with current approach if different BTF
>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>>> maybe that is not the main intent.
>>>>>
>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>>>>
>>>>> That's an interesting question, from the top of my head:
>>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>>>> mean "increasing" name), shouldn't be that difficult.
>>>>
>>>> That sounds great. kallsyms are pre-sorted at build time.
>>>> We should do the same with BTF.
>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>>> too,
>>>> but since vmlinux and kernel module BTFs will keep being processed
>>>> through pahole we don't have to make gcc/llvm sort things right away.
>>>> pahole will be enough. The kernel might do 'is it sorted' check
>>>> during BTF validation and then use binary search or fall back to linear
>>>> when not-sorted == old pahole.
>>>>
>>>
>>> Yeah, I agree and will attempt to modify the pahole and perform a test.
>>> Do we need
>>> to introduce a new macro to control the behavior when the BTF is not
>>> sorted? If
>>> it is not sorted, we can use the method mentioned in this patch or use
>>> linear
>>> search.
>>>
>>>
>>
>> One challenge with pahole is that it often runs in parallel mode, so I
>> suspect any sorting would have to be done after merging across threads.
>> Perhaps BTF deduplication time might be a useful time to re-sort by
>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
>> btf_dedup_opts option could be added and controlled by a pahole
>> option. However dedup is pretty complicated already..
>
> Hi Alan,
>
> libbpf might be the right place to do this, however, I think that it is
> also doable in pahole's btf_encoder__encode(), e.g. as follows:
> - after a call to btf__dedup():
> - create a sorted by name IDs ordering;
> - create a new BTF object;
> - add records to the new BTF according to the sorted ordering;
> - remap id references while adding;
> - use the new BTF object instead of old one to write BTF output.
>
> I assume that implementation would be similar regardless whether it is
> done in pahole or in libbpf.

Sounds good ;). We can place named objects at the start and may also need a
variable to keep track of their number.
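
A minimal sketch of the first step Eduard outlines above (an assumption, not
the actual pahole change): build a by-name ordering of type ids after
btf__dedup(), using the existing libbpf accessors btf__type_cnt(),
btf__type_by_id() and btf__name_by_offset(). Re-adding the types in this
order and remapping references is only indicated in the comments:

#include <stdlib.h>
#include <string.h>
#include <bpf/btf.h>

static const struct btf *sort_btf;	/* for the qsort() comparator */

static int cmp_by_name(const void *a, const void *b)
{
	const struct btf_type *ta = btf__type_by_id(sort_btf, *(const __u32 *)a);
	const struct btf_type *tb = btf__type_by_id(sort_btf, *(const __u32 *)b);

	/* anonymous types have an empty name and end up grouped first */
	return strcmp(btf__name_by_offset(sort_btf, ta->name_off),
		      btf__name_by_offset(sort_btf, tb->name_off));
}

/* order[i] is the old id of the type that becomes id i after sorting */
static __u32 *build_name_order(const struct btf *btf)
{
	__u32 n = btf__type_cnt(btf);	/* valid ids are 1 .. n-1 */
	__u32 *order = calloc(n, sizeof(*order));
	__u32 id;

	if (!order)
		return NULL;
	for (id = 1; id < n; id++)
		order[id] = id;
	sort_btf = btf;
	qsort(&order[1], n - 1, sizeof(*order), cmp_by_name);
	/* next: btf__new_empty(), btf__add_type() for each id in this
	 * order, then remap every type reference to the new ids
	 */
	return order;
}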

>
> Thanks,
> Eduard
>
>> One thing we should weigh up though is if there are benefits to the
>> way BTF is currently laid out. It tends to start with base types,
>> and often-encountered types end up being located towards the start
>> of the BTF data. For example
>>
>>
>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
>> [2] CONST '(anon)' type_id=1
>> [3] VOLATILE '(anon)' type_id=1
>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
>> [5] PTR '(anon)' type_id=8
>> [6] CONST '(anon)' type_id=5
>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>> [8] CONST '(anon)' type_id=7
>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
>> [10] CONST '(anon)' type_id=9
>> [11] TYPEDEF '__s8' type_id=12
>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>> [13] TYPEDEF '__u8' type_id=14
>>
>> So often-used types will be found quickly, even under linear search
>> conditions.
>>
>> When we look at how many lookups by id (which are O(1), since they are
>> done via the btf->types[] array) versus by name, we see:
>>
>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
>> 120
>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
>> 15
>>
>> I don't see a huge number of name-based lookups, and I think most are
>> outside of the hotter codepaths, unless I'm missing some. All of which
>> is to say it would be a good idea to have a clear sense of what will get
>> faster with sorted-by-name BTF. Thanks!
>>
>> Alan
>

2023-09-14 13:13:43

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/14 20:46, Alan Maguire wrote:
> On 14/09/2023 11:13, pengdonglin wrote:
>> On 2023/9/13 21:34, Alan Maguire wrote:
>>> On 13/09/2023 11:32, pengdonglin wrote:
>>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>>>> wrote:
>>>>>>
>>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>>>> Currently, we are only using the linear search method to find the
>>>>>>>>>> type id
>>>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>>>> involves
>>>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>>>> binary search,
>>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>>>> by the
>>>>>>>>>> following patch:
>>>>>>>>>>
>>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>>>
>>>>>>>>>> At present, this improvement is only for searching in vmlinux's
>>>>>>>>>> and
>>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>>>
>>>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>>>> first and
>>>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>>>> returned.
>>>>>>>>>>
>>>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>>>> 67,819 kernel
>>>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>>>
>>>>>>>>>> Before: 17000 ms
>>>>>>>>>> After:     10 ms
>>>>>>>>>>
>>>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>>>> above scenario.
>>>>>>>>>>
>>>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>>>> 67,819 kernel
>>>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>>>
>>>>>>>>> Hi Donglin,
>>>>>>>>>
>>>>>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>>>
>>>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>>>> such numbers:
>>>>>>>>> - total number of BTF objects: 97350
>>>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>>>> objects: 56817
>>>>>>>>>     (these are all kinds for which lookup by name might make sense)
>>>>>>>>> - number of named objects: 54246
>>>>>>>>> - number of name collisions:
>>>>>>>>>     - unique names: 53985 counts
>>>>>>>>>     - 2 objects with the same name: 129 counts
>>>>>>>>>     - 3 objects with the same name: 3 counts
>>>>>>>>>
>>>>>>>>> So, it appears that having a single map for all named objects makes
>>>>>>>>> sense and would also simplify the implementation, what do you
>>>>>>>>> think?
>>>>>>>>
>>>>>>>> Some more numbers for my config:
>>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>>>
>>>>>>> What do you propose to do in case of duplicates ?
>>>>>>> func and struct can have the same name, but they will have two
>>>>>>> different
>>>>>>> btf_ids. How do we store them ?
>>>>>>> Also we might add global vars to BTF. Such request came up several
>>>>>>> times.
>>>>>>> So we need to make sure our search approach scales to
>>>>>>> func, struct, vars. I don't recall whether we search any other kinds.
>>>>>>> Separate arrays for different kinds seems ok.
>>>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>>>
>>>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>>>> increment index while name remains A looking for correct kind.
>>>>>> Given the name conflicts info from above, 99% of times there would be
>>>>>> no need to iterate and in very few cases there would a couple of
>>>>>> iterations.
>>>>>>
>>>>>> Same logic would be necessary with current approach if different BTF
>>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>>>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>>>> maybe that is not the main intent.
>>>>>>
>>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>>>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>>>>>
>>>>>> That's an interesting question, from the top of my head:
>>>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>>>>> mean "increasing" name), shouldn't be that difficult.
>>>>>
>>>>> That sounds great. kallsyms are pre-sorted at build time.
>>>>> We should do the same with BTF.
>>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>>>> too,
>>>>> but since vmlinux and kernel module BTFs will keep being processed
>>>>> through pahole we don't have to make gcc/llvm sort things right away.
>>>>> pahole will be enough. The kernel might do 'is it sorted' check
>>>>> during BTF validation and then use binary search or fall back to linear
>>>>> when not-sorted == old pahole.
>>>>>
>>>>
>>>> Yeah, I agree and will attempt to modify the pahole and perform a test.
>>>> Do we need
>>>> to introduce a new macro to control the behavior when the BTF is not
>>>> sorted? If
>>>> it is not sorted, we can use the method mentioned in this patch or use
>>>> linear
>>>> search.
>>>>
>>>>
>>>
>>> One challenge with pahole is that it often runs in parallel mode, so I
>>> suspect any sorting would have to be done after merging across threads.
>>> Perhaps BTF deduplication time might be a useful time to re-sort by
>>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
>>> btf_dedup_opts option could be added and controlled by a pahole
>>> option. However dedup is pretty complicated already..
>>>
>>> One thing we should weigh up though is if there are benefits to the
>>> way BTF is currently laid out. It tends to start with base types,
>>> and often-encountered types end up being located towards the start
>>> of the BTF data. For example
>>>
>>>
>>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64
>>> encoding=(none)
>>> [2] CONST '(anon)' type_id=1
>>> [3] VOLATILE '(anon)' type_id=1
>>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
>>> [5] PTR '(anon)' type_id=8
>>> [6] CONST '(anon)' type_id=5
>>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>> [8] CONST '(anon)' type_id=7
>>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
>>> [10] CONST '(anon)' type_id=9
>>> [11] TYPEDEF '__s8' type_id=12
>>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>> [13] TYPEDEF '__u8' type_id=14
>>>
>>> So often-used types will be found quickly, even under linear search
>>> conditions.
>>
>> I found that there seems to be no code in the kernel that get the ID of the
>> basic data type by calling btf_find_by_name_kind directly. The general
>> usage
>> of this function is to obtain the ID of a structure or function. After
>> we got
>> the ID of a structure or function, it is O(1) to get the IDs of its members
>> or parameters.
>>
>> ./kernel/trace/trace_probe.c:383:       id = btf_find_by_name_kind(btf,
>> funcname, BTF_KIND_FUNC);
>> ./kernel/bpf/btf.c:3523:        id = btf_find_by_name_kind(btf,
>> value_type, BTF_KIND_STRUCT);
>> ./kernel/bpf/btf.c:5504:                id = btf_find_by_name_kind(btf,
>> alloc_obj_fields[i], BTF_KIND_STRUCT);
>> ./kernel/bpf/bpf_struct_ops.c:128:      module_id =
>> btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
>> ./net/ipv4/bpf_tcp_ca.c:28:     type_id = btf_find_by_name_kind(btf,
>> "sock", BTF_KIND_STRUCT);
>> ./net/ipv4/bpf_tcp_ca.c:33:     type_id = btf_find_by_name_kind(btf,
>> "tcp_sock", BTF_KIND_STRUCT);
>> ./net/netfilter/nf_bpf_link.c:181:      type_id =
>> btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
>>
>>>
>>> When we look at how many lookups by id (which are O(1), since they are
>>> done via the btf->types[] array) versus by name, we see:
>>>
>>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
>>> 120
>>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
>>> 15
>>>
>>> I don't see a huge number of name-based lookups, and I think most are
>>> outside of the hotter codepaths, unless I'm missing some. All of which
>>> is to say it would be a good idea to have a clear sense of what will get
>>> faster with sorted-by-name BTF. Thanks!
>>
>> The story goes like this.
>>
>> I have added a new feature to the function graph called "funcgraph_retval",
>> here is the link:
>>
>> https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/
>>
>> We can obtain the return values of almost every function during the
>> execution
>> of kernel through this feature, it can help us analyze problems.
>>
>
> It's a great feature!

Thanks.

>
>> However, this feature has two main drawbacks.
>>
>> 1. Even if a function's return type is void,  a return value will still
>> be printed.
>>
>> 2. The return value printed may be incorrect when the width of the
>> return type is
>> smaller than the generic register.
>>
>> I think if we can get this return type of the function, then the
>> drawbacks mentioned
>> above can be eliminated. The function btf_find_by_name_kind can be used
>> to get the ID of
>> the kernel function, then we can get its return type easily. If the
>> return type is
>> void, the return value recorded will not be printed. If the width of the
>> return type
>> is smaller than the generic register, then the value stored in the upper
>> bits will be
>> trimmed. I have written a demo and these drawbacks were resolved.
>>
>> However, during my test, I found that it took too much time when read
>> the trace log
>> with this feature enabled, because the trace log consists of 200,000
>> lines. The
>> majority of the time was consumed by the btf_find_by_name_kind, which is
>> called
>> 200,000 times.
>>
>> So I think the performance of btf_find_by_name_kind  may need to be
>> improved.
>>
>
> If I recall, Masami's work uses BTF ids, but can cache them since the
> user explicitly asks for specific fields in the trace output. I'm
> presuming that's not an option for you due to the fact funcgraph tracing
> enables everything (or at least everything under a filter predicate) and
> you have limited context to work with, is that right?

Yes, right.

>
> Looking at print_graph_entry_leaf() which I _think_ is where you'd need
> to print the retval from, you have access to the function address via
> call->func, and I presume you get the name from snprinting the symbol to
> a string or similar. So you're stuck in a context where you have the
> function address, and from that you can derive the function name. Is
> that correct? Thanks!

Yes, both print_graph_return and print_graph_entry_leaf call print_graph_retval
to print the return value. It calls sprint_symbol_no_offset with call->func to
get the function name, and then btf_find_by_name_kind to get the return type.
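
Roughly, the lookup path looks like this (a simplified sketch of the flow,
not the actual demo code; it assumes vmlinux BTF only and skips module
symbols and most error handling):

/* needs <linux/btf.h>, <linux/bpf.h>, <linux/kallsyms.h>, <linux/err.h> */
static const struct btf_type *retval_type(unsigned long addr)
{
	char name[KSYM_NAME_LEN];
	const struct btf_type *func, *proto;
	struct btf *btf = bpf_get_btf_vmlinux();
	s32 id;

	if (IS_ERR_OR_NULL(btf))
		return NULL;
	sprint_symbol_no_offset(name, addr);	/* address -> symbol name */
	id = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);	/* the slow part */
	if (id < 0)
		return NULL;
	func = btf_type_by_id(btf, id);		/* FUNC refers to its FUNC_PROTO */
	proto = btf_type_by_id(btf, func->type);
	/* proto->type == 0 means the function returns void */
	return proto->type ? btf_type_by_id(btf, proto->type) : NULL;
}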

>
> Alan
>

2023-09-14 20:29:51

by Alan Maguire

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 14/09/2023 14:05, pengdonglin wrote:
> On 2023/9/14 20:46, Alan Maguire wrote:
>> On 14/09/2023 11:13, pengdonglin wrote:
>>> On 2023/9/13 21:34, Alan Maguire wrote:
>>>> On 13/09/2023 11:32, pengdonglin wrote:
>>>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman
>>>>>>>> <[email protected]>
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>>>>> Currently, we are only using the linear search method to find
>>>>>>>>>>> the
>>>>>>>>>>> type id
>>>>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>>>>> involves
>>>>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>>>>> binary search,
>>>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>>>>> by the
>>>>>>>>>>> following patch:
>>>>>>>>>>>
>>>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>>>>
>>>>>>>>>>> At present, this improvement is only for searching in vmlinux's
>>>>>>>>>>> and
>>>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>>>>
>>>>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>>>>> first and
>>>>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>>>>> returned.
>>>>>>>>>>>
>>>>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>>>>> 67,819 kernel
>>>>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>>>>
>>>>>>>>>>> Before: 17000 ms
>>>>>>>>>>> After:     10 ms
>>>>>>>>>>>
>>>>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>>>>> above scenario.
>>>>>>>>>>>
>>>>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>>>>> 67,819 kernel
>>>>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>>>>
>>>>>>>>>> Hi Donglin,
>>>>>>>>>>
>>>>>>>>>> I think this is a good improvement. However, I wonder, why did
>>>>>>>>>> you
>>>>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>>>>
>>>>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>>>>> such numbers:
>>>>>>>>>> - total number of BTF objects: 97350
>>>>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>>>>> objects: 56817
>>>>>>>>>>      (these are all kinds for which lookup by name might make
>>>>>>>>>> sense)
>>>>>>>>>> - number of named objects: 54246
>>>>>>>>>> - number of name collisions:
>>>>>>>>>>      - unique names: 53985 counts
>>>>>>>>>>      - 2 objects with the same name: 129 counts
>>>>>>>>>>      - 3 objects with the same name: 3 counts
>>>>>>>>>>
>>>>>>>>>> So, it appears that having a single map for all named objects
>>>>>>>>>> makes
>>>>>>>>>> sense and would also simplify the implementation, what do you
>>>>>>>>>> think?
>>>>>>>>>
>>>>>>>>> Some more numbers for my config:
>>>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>>>>
>>>>>>>> What do you propose to do in case of duplicates ?
>>>>>>>> func and struct can have the same name, but they will have two
>>>>>>>> different
>>>>>>>> btf_ids. How do we store them ?
>>>>>>>> Also we might add global vars to BTF. Such request came up several
>>>>>>>> times.
>>>>>>>> So we need to make sure our search approach scales to
>>>>>>>> func, struct, vars. I don't recall whether we search any other
>>>>>>>> kinds.
>>>>>>>> Separate arrays for different kinds seems ok.
>>>>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>>>>
>>>>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>>>>> increment index while name remains A looking for correct kind.
>>>>>>> Given the name conflicts info from above, 99% of times there
>>>>>>> would be
>>>>>>> no need to iterate and in very few cases there would a couple of
>>>>>>> iterations.
>>>>>>>
>>>>>>> Same logic would be necessary with current approach if different BTF
>>>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that
>>>>>>> these
>>>>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>>>>> maybe that is not the main intent.
>>>>>>>
>>>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>>>>>> extra memory. That's quite a bit. Anything we can do to compress
>>>>>>>> it?
>>>>>>>
>>>>>>> That's an interesting question, from the top of my head:
>>>>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>>>>>> mean "increasing" name), shouldn't be that difficult.
>>>>>>
>>>>>> That sounds great. kallsyms are pre-sorted at build time.
>>>>>> We should do the same with BTF.
>>>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>>>>> too,
>>>>>> but since vmlinux and kernel module BTFs will keep being processed
>>>>>> through pahole we don't have to make gcc/llvm sort things right away.
>>>>>> pahole will be enough. The kernel might do 'is it sorted' check
>>>>>> during BTF validation and then use binary search or fall back to
>>>>>> linear
>>>>>> when not-sorted == old pahole.
>>>>>>
>>>>>
>>>>> Yeah, I agree and will attempt to modify the pahole and perform a
>>>>> test.
>>>>> Do we need
>>>>> to introduce a new macro to control the behavior when the BTF is not
>>>>> sorted? If
>>>>> it is not sorted, we can use the method mentioned in this patch or use
>>>>> linear
>>>>> search.
>>>>>
>>>>>
>>>>
>>>> One challenge with pahole is that it often runs in parallel mode, so I
>>>> suspect any sorting would have to be done after merging across threads.
>>>> Perhaps BTF deduplication time might be a useful time to re-sort by
>>>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
>>>> btf_dedup_opts option could be added and controlled by a pahole
>>>> option. However dedup is pretty complicated already..
>>>>
>>>> One thing we should weigh up though is if there are benefits to the
>>>> way BTF is currently laid out. It tends to start with base types,
>>>> and often-encountered types end up being located towards the start
>>>> of the BTF data. For example
>>>>
>>>>
>>>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64
>>>> encoding=(none)
>>>> [2] CONST '(anon)' type_id=1
>>>> [3] VOLATILE '(anon)' type_id=1
>>>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
>>>> [5] PTR '(anon)' type_id=8
>>>> [6] CONST '(anon)' type_id=5
>>>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>>> [8] CONST '(anon)' type_id=7
>>>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
>>>> [10] CONST '(anon)' type_id=9
>>>> [11] TYPEDEF '__s8' type_id=12
>>>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>>> [13] TYPEDEF '__u8' type_id=14
>>>>
>>>> So often-used types will be found quickly, even under linear search
>>>> conditions.
>>>
>>> I found that there seems to be no code in the kernel that get the ID
>>> of the
>>> basic data type by calling btf_find_by_name_kind directly. The general
>>> usage
>>> of this function is to obtain the ID of a structure or function. After
>>> we got
>>> the ID of a structure or function, it is O(1) to get the IDs of its
>>> members
>>> or parameters.
>>>
>>> ./kernel/trace/trace_probe.c:383:       id = btf_find_by_name_kind(btf,
>>> funcname, BTF_KIND_FUNC);
>>> ./kernel/bpf/btf.c:3523:        id = btf_find_by_name_kind(btf,
>>> value_type, BTF_KIND_STRUCT);
>>> ./kernel/bpf/btf.c:5504:                id = btf_find_by_name_kind(btf,
>>> alloc_obj_fields[i], BTF_KIND_STRUCT);
>>> ./kernel/bpf/bpf_struct_ops.c:128:      module_id =
>>> btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
>>> ./net/ipv4/bpf_tcp_ca.c:28:     type_id = btf_find_by_name_kind(btf,
>>> "sock", BTF_KIND_STRUCT);
>>> ./net/ipv4/bpf_tcp_ca.c:33:     type_id = btf_find_by_name_kind(btf,
>>> "tcp_sock", BTF_KIND_STRUCT);
>>> ./net/netfilter/nf_bpf_link.c:181:      type_id =
>>> btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
>>>
>>>>
>>>> When we look at how many lookups by id (which are O(1), since they are
>>>> done via the btf->types[] array) versus by name, we see:
>>>>
>>>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
>>>> 120
>>>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
>>>> 15
>>>>
>>>> I don't see a huge number of name-based lookups, and I think most are
>>>> outside of the hotter codepaths, unless I'm missing some. All of which
>>>> is to say it would be a good idea to have a clear sense of what will
>>>> get
>>>> faster with sorted-by-name BTF. Thanks!
>>>
>>> The story goes like this.
>>>
>>> I have added a new feature to the function graph called
>>> "funcgraph_retval",
>>> here is the link:
>>>
>>> https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/
>>>
>>> We can obtain the return values of almost every function during the
>>> execution
>>> of kernel through this feature, it can help us analyze problems.
>>>
>>
>> It's a great feature!
>
> Thanks.
>
>>
>>> However, this feature has two main drawbacks.
>>>
>>> 1. Even if a function's return type is void,  a return value will still
>>> be printed.
>>>
>>> 2. The return value printed may be incorrect when the width of the
>>> return type is
>>> smaller than the generic register.
>>>
>>> I think if we can get this return type of the function, then the
>>> drawbacks mentioned
>>> above can be eliminated. The function btf_find_by_name_kind can be used
>>> to get the ID of
>>> the kernel function, then we can get its return type easily. If the
>>> return type is
>>> void, the return value recorded will not be printed. If the width of the
>>> return type
>>> is smaller than the generic register, then the value stored in the upper
>>> bits will be
>>> trimmed. I have written a demo and these drawbacks were resolved.
>>>
>>> However, during my test, I found that it took too much time when read
>>> the trace log
>>> with this feature enabled, because the trace log consists of 200,000
>>> lines. The
>>> majority of the time was consumed by the btf_find_by_name_kind, which is
>>> called
>>> 200,000 times.
>>>
>>> So I think the performance of btf_find_by_name_kind  may need to be
>>> improved.
>>>
>>
>> If I recall, Masami's work uses BTF ids, but can cache them since the
>> user explicitly asks for specific fields in the trace output. I'm
>> presuming that's not an option for you due to the fact funcgraph tracing
>> enables everything (or at least everything under a filter predicate) and
>> you have limited context to work with, is that right?
>
> Yes, right.
>
>>
>> Looking at print_graph_entry_leaf() which I _think_ is where you'd need
>> to print the retval from, you have access to the function address via
>> call->func, and I presume you get the name from snprinting the symbol to
>> a string or similar. So you're stuck in a context where you have the
>> function address, and from that you can derive the function name. Is
>> that correct? Thanks!
>
> Yes, both print_graph_return and print_graph_entry_leaf will call
> print_graph_retval
> to print the return value. Then call sprint_symbol_no_offset with
> call->func to get
> the function name, then call btf_find_by_name_kind to get the return type.
>

So what you ultimately need is a way to map from the information
available (the function address) to the size of the return value
associated with that function.

On the BPF side we've been thinking a bit about the relationship between
kernel function addresses and their BTF representations; sometimes
knowing the BTF->address mapping is needed for cases where the same function
name has multiple inconsistent function signatures in BTF. We don't
represent function addresses yet in BTF but may end up having to.
The reason I mention this is that, in an ideal world, it would help to
populate kallsyms entries with their associated BTF ids; then a
function would only need to be looked up once in kallsyms. That lookup would
benefit from recent speedups, and if the entry carried the associated BTF id
we'd have an O(1) lookup from the BTF id to the function's type. Not sure if
that would be tenable from the kallsyms side, but I just wanted to mention
it, as from the above it seems an address-based lookup is a possible way
to solve the return value type lookup problem that you're facing.
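
Purely as a hypothetical illustration of that idea (kallsyms entries do not
carry BTF ids today, and lookup_kallsym_entry() below is made up), the
per-event lookup would conceptually shrink to:

struct kallsym_btf_entry {		/* hypothetical layout */
	unsigned long addr;
	u32 name_off;			/* into the kallsyms names blob */
	u32 btf_id;			/* 0 if the symbol has no BTF type */
};

static const struct btf_type *func_type_from_addr(unsigned long addr)
{
	/* hypothetical helper: the existing fast address-based kallsyms search */
	const struct kallsym_btf_entry *e = lookup_kallsym_entry(addr);

	if (!e || !e->btf_id)
		return NULL;
	return btf_type_by_id(bpf_get_btf_vmlinux(), e->btf_id);	/* O(1) */
}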

Cc'ed Jiri who had to wrestle with kallsyms for the kprobe multi stuff.
Would the above help do you think?

Thanks!

Alan

2023-09-14 22:58:33

by Alan Maguire

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 14/09/2023 11:13, pengdonglin wrote:
> On 2023/9/13 21:34, Alan Maguire wrote:
>> On 13/09/2023 11:32, pengdonglin wrote:
>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>>> wrote:
>>>>>
>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman <[email protected]>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>>> Currently, we are only using the linear search method to find the
>>>>>>>>> type id
>>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>>> involves
>>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>>> binary search,
>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>>> by the
>>>>>>>>> following patch:
>>>>>>>>>
>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>>
>>>>>>>>> At present, this improvement is only for searching in vmlinux's
>>>>>>>>> and
>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>>
>>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>>> first and
>>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>>> returned.
>>>>>>>>>
>>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>>> 67,819 kernel
>>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>>
>>>>>>>>> Before: 17000 ms
>>>>>>>>> After:     10 ms
>>>>>>>>>
>>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>>> above scenario.
>>>>>>>>>
>>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>>> 67,819 kernel
>>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>>
>>>>>>>> Hi Donglin,
>>>>>>>>
>>>>>>>> I think this is a good improvement. However, I wonder, why did you
>>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>>
>>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>>> such numbers:
>>>>>>>> - total number of BTF objects: 97350
>>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>>> objects: 56817
>>>>>>>>     (these are all kinds for which lookup by name might make sense)
>>>>>>>> - number of named objects: 54246
>>>>>>>> - number of name collisions:
>>>>>>>>     - unique names: 53985 counts
>>>>>>>>     - 2 objects with the same name: 129 counts
>>>>>>>>     - 3 objects with the same name: 3 counts
>>>>>>>>
>>>>>>>> So, it appears that having a single map for all named objects makes
>>>>>>>> sense and would also simplify the implementation, what do you
>>>>>>>> think?
>>>>>>>
>>>>>>> Some more numbers for my config:
>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>>
>>>>>> What do you propose to do in case of duplicates ?
>>>>>> func and struct can have the same name, but they will have two
>>>>>> different
>>>>>> btf_ids. How do we store them ?
>>>>>> Also we might add global vars to BTF. Such request came up several
>>>>>> times.
>>>>>> So we need to make sure our search approach scales to
>>>>>> func, struct, vars. I don't recall whether we search any other kinds.
>>>>>> Separate arrays for different kinds seems ok.
>>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>>
>>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>>> increment index while name remains A looking for correct kind.
>>>>> Given the name conflicts info from above, 99% of times there would be
>>>>> no need to iterate and in very few cases there would a couple of
>>>>> iterations.
>>>>>
>>>>> Same logic would be necessary with current approach if different BTF
>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that these
>>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>>> maybe that is not the main intent.
>>>>>
>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
>>>>>> extra memory. That's quite a bit. Anything we can do to compress it?
>>>>>
>>>>> That's an interesting question, from the top of my head:
>>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
>>>>> mean "increasing" name), shouldn't be that difficult.
>>>>
>>>> That sounds great. kallsyms are pre-sorted at build time.
>>>> We should do the same with BTF.
>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>>> too,
>>>> but since vmlinux and kernel module BTFs will keep being processed
>>>> through pahole we don't have to make gcc/llvm sort things right away.
>>>> pahole will be enough. The kernel might do 'is it sorted' check
>>>> during BTF validation and then use binary search or fall back to linear
>>>> when not-sorted == old pahole.
>>>>
>>>
>>> Yeah, I agree and will attempt to modify the pahole and perform a test.
>>> Do we need
>>> to introduce a new macro to control the behavior when the BTF is not
>>> sorted? If
>>> it is not sorted, we can use the method mentioned in this patch or use
>>> linear
>>> search.
>>>
>>>
>>
>> One challenge with pahole is that it often runs in parallel mode, so I
>> suspect any sorting would have to be done after merging across threads.
>> Perhaps BTF deduplication time might be a useful time to re-sort by
>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
>> btf_dedup_opts option could be added and controlled by a pahole
>> option. However dedup is pretty complicated already..
>>
>> One thing we should weigh up though is if there are benefits to the
>> way BTF is currently laid out. It tends to start with base types,
>> and often-encountered types end up being located towards the start
>> of the BTF data. For example
>>
>>
>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64
>> encoding=(none)
>> [2] CONST '(anon)' type_id=1
>> [3] VOLATILE '(anon)' type_id=1
>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
>> [5] PTR '(anon)' type_id=8
>> [6] CONST '(anon)' type_id=5
>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>> [8] CONST '(anon)' type_id=7
>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
>> [10] CONST '(anon)' type_id=9
>> [11] TYPEDEF '__s8' type_id=12
>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>> [13] TYPEDEF '__u8' type_id=14
>>
>> So often-used types will be found quickly, even under linear search
>> conditions.
>
> I found that there seems to be no code in the kernel that get the ID of the
> basic data type by calling btf_find_by_name_kind directly. The general
> usage
> of this function is to obtain the ID of a structure or function. After
> we got
> the ID of a structure or function, it is O(1) to get the IDs of its members
> or parameters.
>
> ./kernel/trace/trace_probe.c:383:       id = btf_find_by_name_kind(btf,
> funcname, BTF_KIND_FUNC);
> ./kernel/bpf/btf.c:3523:        id = btf_find_by_name_kind(btf,
> value_type, BTF_KIND_STRUCT);
> ./kernel/bpf/btf.c:5504:                id = btf_find_by_name_kind(btf,
> alloc_obj_fields[i], BTF_KIND_STRUCT);
> ./kernel/bpf/bpf_struct_ops.c:128:      module_id =
> btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
> ./net/ipv4/bpf_tcp_ca.c:28:     type_id = btf_find_by_name_kind(btf,
> "sock", BTF_KIND_STRUCT);
> ./net/ipv4/bpf_tcp_ca.c:33:     type_id = btf_find_by_name_kind(btf,
> "tcp_sock", BTF_KIND_STRUCT);
> ./net/netfilter/nf_bpf_link.c:181:      type_id =
> btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
>
>>
>> When we look at how many lookups by id (which are O(1), since they are
>> done via the btf->types[] array) versus by name, we see:
>>
>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
>> 120
>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
>> 15
>>
>> I don't see a huge number of name-based lookups, and I think most are
>> outside of the hotter codepaths, unless I'm missing some. All of which
>> is to say it would be a good idea to have a clear sense of what will get
>> faster with sorted-by-name BTF. Thanks!
>
> The story goes like this.
>
> I have added a new feature to the function graph called "funcgraph_retval",
> here is the link:
>
> https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/
>
> We can obtain the return values of almost every function during the
> execution
> of kernel through this feature, it can help us analyze problems.
>

It's a great feature!

> However, this feature has two main drawbacks.
>
> 1. Even if a function's return type is void,  a return value will still
> be printed.
>
> 2. The return value printed may be incorrect when the width of the
> return type is
> smaller than the generic register.
>
> I think if we can get this return type of the function, then the
> drawbacks mentioned
> above can be eliminated. The function btf_find_by_name_kind can be used
> to get the ID of
> the kernel function, then we can get its return type easily. If the
> return type is
> void, the return value recorded will not be printed. If the width of the
> return type
> is smaller than the generic register, then the value stored in the upper
> bits will be
> trimmed. I have written a demo and these drawbacks were resolved.
>
> However, during my test, I found that it took too much time when read
> the trace log
> with this feature enabled, because the trace log consists of 200,000
> lines. The
> majority of the time was consumed by the btf_find_by_name_kind, which is
> called
> 200,000 times.
>
> So I think the performance of btf_find_by_name_kind  may need to be
> improved.
>

If I recall, Masami's work uses BTF ids, but can cache them since the
user explicitly asks for specific fields in the trace output. I'm
presuming that's not an option for you due to the fact funcgraph tracing
enables everything (or at least everything under a filter predicate) and
you have limited context to work with, is that right?

Looking at print_graph_entry_leaf() which I _think_ is where you'd need
to print the retval from, you have access to the function address via
call->func, and I presume you get the name from snprinting the symbol to
a string or similar. So you're stuck in a context where you have the
function address, and from that you can derive the function name. Is
that correct? Thanks!

Alan

2023-09-15 02:37:40

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On Thu, Sep 14, 2023 at 10:14 AM Alan Maguire <[email protected]> wrote:
>
> On 14/09/2023 14:05, pengdonglin wrote:
> > On 2023/9/14 20:46, Alan Maguire wrote:
> >> On 14/09/2023 11:13, pengdonglin wrote:
> >>> On 2023/9/13 21:34, Alan Maguire wrote:
> >>>> On 13/09/2023 11:32, pengdonglin wrote:
> >>>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
> >>>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
> >>>>>> wrote:
> >>>>>>>
> >>>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
> >>>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman
> >>>>>>>> <[email protected]>
> >>>>>>>> wrote:
> >>>>>>>>>
> >>>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
> >>>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
> >>>>>>>>>>> Currently, we are only using the linear search method to find
> >>>>>>>>>>> the
> >>>>>>>>>>> type id
> >>>>>>>>>>> by the name, which has a time complexity of O(n). This change
> >>>>>>>>>>> involves
> >>>>>>>>>>> sorting the names of btf types in ascending order and using
> >>>>>>>>>>> binary search,
> >>>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
> >>>>>>>>>>> by the
> >>>>>>>>>>> following patch:
> >>>>>>>>>>>
> >>>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
> >>>>>>>>>>> kallsyms_lookup_name()").
> >>>>>>>>>>>
> >>>>>>>>>>> At present, this improvement is only for searching in vmlinux's
> >>>>>>>>>>> and
> >>>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
> >>>>>>>>>>> BTF_KIND_STRUCT.
> >>>>>>>>>>>
> >>>>>>>>>>> Another change is the search direction, where we search the BTF
> >>>>>>>>>>> first and
> >>>>>>>>>>> then its base, the type id of the first matched btf_type will be
> >>>>>>>>>>> returned.
> >>>>>>>>>>>
> >>>>>>>>>>> Here is a time-consuming result that finding all the type ids of
> >>>>>>>>>>> 67,819 kernel
> >>>>>>>>>>> functions in vmlinux's BTF by their names:
> >>>>>>>>>>>
> >>>>>>>>>>> Before: 17000 ms
> >>>>>>>>>>> After: 10 ms
> >>>>>>>>>>>
> >>>>>>>>>>> The average lookup performance has improved about 1700x at the
> >>>>>>>>>>> above scenario.
> >>>>>>>>>>>
> >>>>>>>>>>> However, this change will consume more memory, for example,
> >>>>>>>>>>> 67,819 kernel
> >>>>>>>>>>> functions will allocate about 530KB memory.
> >>>>>>>>>>
> >>>>>>>>>> Hi Donglin,
> >>>>>>>>>>
> >>>>>>>>>> I think this is a good improvement. However, I wonder, why did
> >>>>>>>>>> you
> >>>>>>>>>> choose to have a separate name map for each BTF kind?
> >>>>>>>>>>
> >>>>>>>>>> I did some analysis for my local testing kernel config and got
> >>>>>>>>>> such numbers:
> >>>>>>>>>> - total number of BTF objects: 97350
> >>>>>>>>>> - number of FUNC and STRUCT objects: 51597
> >>>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
> >>>>>>>>>> objects: 56817
> >>>>>>>>>> (these are all kinds for which lookup by name might make
> >>>>>>>>>> sense)
> >>>>>>>>>> - number of named objects: 54246
> >>>>>>>>>> - number of name collisions:
> >>>>>>>>>> - unique names: 53985 counts
> >>>>>>>>>> - 2 objects with the same name: 129 counts
> >>>>>>>>>> - 3 objects with the same name: 3 counts
> >>>>>>>>>>
> >>>>>>>>>> So, it appears that having a single map for all named objects
> >>>>>>>>>> makes
> >>>>>>>>>> sense and would also simplify the implementation, what do you
> >>>>>>>>>> think?
> >>>>>>>>>
> >>>>>>>>> Some more numbers for my config:
> >>>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
> >>>>>>>>> - 43575 funcs, log2 43575 = 15.4
> >>>>>>>>> Thus, having separate map for types vs functions might save ~1.7
> >>>>>>>>> search iterations. Is this a significant slowdown in practice?
> >>>>>>>>
> >>>>>>>> What do you propose to do in case of duplicates ?
> >>>>>>>> func and struct can have the same name, but they will have two
> >>>>>>>> different
> >>>>>>>> btf_ids. How do we store them ?
> >>>>>>>> Also we might add global vars to BTF. Such request came up several
> >>>>>>>> times.
> >>>>>>>> So we need to make sure our search approach scales to
> >>>>>>>> func, struct, vars. I don't recall whether we search any other
> >>>>>>>> kinds.
> >>>>>>>> Separate arrays for different kinds seems ok.
> >>>>>>>> It's a bit of code complexity, but it's not an increase in memory.
> >>>>>>>
> >>>>>>> Binary search gives, say, lowest index of a thing with name A, then
> >>>>>>> increment index while name remains A looking for correct kind.
> >>>>>>> Given the name conflicts info from above, 99% of times there
> >>>>>>> would be
> >>>>>>> no need to iterate and in very few cases there would a couple of
> >>>>>>> iterations.
> >>>>>>>
> >>>>>>> Same logic would be necessary with current approach if different BTF
> >>>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that
> >>>>>>> these
> >>>>>>> cohorts are mainly a way to split the tree for faster lookups, but
> >>>>>>> maybe that is not the main intent.
> >>>>>>>
> >>>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) that's 0.5 Mbyte
> >>>>>>>> extra memory. That's quite a bit. Anything we can do to compress
> >>>>>>>> it?
> >>>>>>>
> >>>>>>> That's an interesting question, from the top of my head:
> >>>>>>> pre-sort in pahole (re-assign IDs so that increasing ID also would
> >>>>>>> mean "increasing" name), shouldn't be that difficult.
> >>>>>>
> >>>>>> That sounds great. kallsyms are pre-sorted at build time.
> >>>>>> We should do the same with BTF.
> >>>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
> >>>>>> too,
> >>>>>> but since vmlinux and kernel module BTFs will keep being processed
> >>>>>> through pahole we don't have to make gcc/llvm sort things right away.
> >>>>>> pahole will be enough. The kernel might do 'is it sorted' check
> >>>>>> during BTF validation and then use binary search or fall back to
> >>>>>> linear
> >>>>>> when not-sorted == old pahole.
> >>>>>>
> >>>>>
> >>>>> Yeah, I agree and will attempt to modify the pahole and perform a
> >>>>> test.
> >>>>> Do we need
> >>>>> to introduce a new macro to control the behavior when the BTF is not
> >>>>> sorted? If
> >>>>> it is not sorted, we can use the method mentioned in this patch or use
> >>>>> linear
> >>>>> search.
> >>>>>
> >>>>>
> >>>>
> >>>> One challenge with pahole is that it often runs in parallel mode, so I
> >>>> suspect any sorting would have to be done after merging across threads.
> >>>> Perhaps BTF deduplication time might be a useful time to re-sort by
> >>>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
> >>>> btf_dedup_opts option could be added and controlled by a pahole
> >>>> option. However dedup is pretty complicated already..
> >>>>
> >>>> One thing we should weigh up though is if there are benefits to the
> >>>> way BTF is currently laid out. It tends to start with base types,
> >>>> and often-encountered types end up being located towards the start
> >>>> of the BTF data. For example
> >>>>
> >>>>
> >>>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64
> >>>> encoding=(none)
> >>>> [2] CONST '(anon)' type_id=1
> >>>> [3] VOLATILE '(anon)' type_id=1
> >>>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
> >>>> [5] PTR '(anon)' type_id=8
> >>>> [6] CONST '(anon)' type_id=5
> >>>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> >>>> [8] CONST '(anon)' type_id=7
> >>>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
> >>>> [10] CONST '(anon)' type_id=9
> >>>> [11] TYPEDEF '__s8' type_id=12
> >>>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
> >>>> [13] TYPEDEF '__u8' type_id=14
> >>>>
> >>>> So often-used types will be found quickly, even under linear search
> >>>> conditions.
> >>>
> >>> I found that there seems to be no code in the kernel that get the ID
> >>> of the
> >>> basic data type by calling btf_find_by_name_kind directly. The general
> >>> usage
> >>> of this function is to obtain the ID of a structure or function. After
> >>> we got
> >>> the ID of a structure or function, it is O(1) to get the IDs of its
> >>> members
> >>> or parameters.
> >>>
> >>> ./kernel/trace/trace_probe.c:383: id = btf_find_by_name_kind(btf,
> >>> funcname, BTF_KIND_FUNC);
> >>> ./kernel/bpf/btf.c:3523: id = btf_find_by_name_kind(btf,
> >>> value_type, BTF_KIND_STRUCT);
> >>> ./kernel/bpf/btf.c:5504: id = btf_find_by_name_kind(btf,
> >>> alloc_obj_fields[i], BTF_KIND_STRUCT);
> >>> ./kernel/bpf/bpf_struct_ops.c:128: module_id =
> >>> btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
> >>> ./net/ipv4/bpf_tcp_ca.c:28: type_id = btf_find_by_name_kind(btf,
> >>> "sock", BTF_KIND_STRUCT);
> >>> ./net/ipv4/bpf_tcp_ca.c:33: type_id = btf_find_by_name_kind(btf,
> >>> "tcp_sock", BTF_KIND_STRUCT);
> >>> ./net/netfilter/nf_bpf_link.c:181: type_id =
> >>> btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
> >>>
> >>>>
> >>>> When we look at how many lookups by id (which are O(1), since they are
> >>>> done via the btf->types[] array) versus by name, we see:
> >>>>
> >>>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
> >>>> 120
> >>>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
> >>>> 15
> >>>>
> >>>> I don't see a huge number of name-based lookups, and I think most are
> >>>> outside of the hotter codepaths, unless I'm missing some. All of which
> >>>> is to say it would be a good idea to have a clear sense of what will
> >>>> get
> >>>> faster with sorted-by-name BTF. Thanks!
> >>>
> >>> The story goes like this.
> >>>
> >>> I have added a new feature to the function graph called
> >>> "funcgraph_retval",
> >>> here is the link:
> >>>
> >>> https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/
> >>>
> >>> We can obtain the return values of almost every function during the
> >>> execution
> >>> of kernel through this feature, it can help us analyze problems.
> >>>
> >>
> >> It's a great feature!
> >
> > Thanks.
> >
> >>
> >>> However, this feature has two main drawbacks.
> >>>
> >>> 1. Even if a function's return type is void, a return value will still
> >>> be printed.
> >>>
> >>> 2. The return value printed may be incorrect when the width of the
> >>> return type is
> >>> smaller than the generic register.
> >>>
> >>> I think if we can get this return type of the function, then the
> >>> drawbacks mentioned
> >>> above can be eliminated. The function btf_find_by_name_kind can be used
> >>> to get the ID of
> >>> the kernel function, then we can get its return type easily. If the
> >>> return type is
> >>> void, the return value recorded will not be printed. If the width of the
> >>> return type
> >>> is smaller than the generic register, then the value stored in the upper
> >>> bits will be
> >>> trimmed. I have written a demo and these drawbacks were resolved.
> >>>
> >>> However, during my test, I found that it took too much time when read
> >>> the trace log
> >>> with this feature enabled, because the trace log consists of 200,000
> >>> lines. The
> >>> majority of the time was consumed by the btf_find_by_name_kind, which is
> >>> called
> >>> 200,000 times.
> >>>
> >>> So I think the performance of btf_find_by_name_kind may need to be
> >>> improved.
> >>>
> >>
> >> If I recall, Masami's work uses BTF ids, but can cache them since the
> >> user explicitly asks for specific fields in the trace output. I'm
> >> presuming that's not an option for you due to the fact funcgraph tracing
> >> enables everything (or at least everything under a filter predicate) and
> >> you have limited context to work with, is that right?
> >
> > Yes, right.
> >
> >>
> >> Looking at print_graph_entry_leaf() which I _think_ is where you'd need
> >> to print the retval from, you have access to the function address via
> >> call->func, and I presume you get the name from snprinting the symbol to
> >> a string or similar. So you're stuck in a context where you have the
> >> function address, and from that you can derive the function name. Is
> >> that correct? Thanks!
> >
> > Yes, both print_graph_return and print_graph_entry_leaf will call
> > print_graph_retval
> > to print the return value. Then call sprint_symbol_no_offset with
> > call->func to get
> > the function name, then call btf_find_by_name_kind to get the return type.
> >
>
> So what you ultimately need is a way to map from that information
> available to be able to figure out the size of the return value
> associated with a function.
>
> On the BPF side we've been thinking a bit about the relationship between
> kernel function addresses and their BTF representations; sometimes
> knowing BTF->address mapping is needed for cases where the same function
> name has multiple inconsistent function signatures in BTF. We don't
> represent function addresses yet in BTF but may end up having to.
> The reason I mention this is in an ideal world, it would benefit to
> populate kallsyms entries with their associated BTF ids;

I think it would be cleaner to keep addresses in BTF,
since we might have the same btf_id func with multiple addresses,
and funcs with the same name but different btf_ids and multiple
addresses too.
I suspect a one-to-one kallsyms-to-btf_id mapping won't cover all cases.

Also we search BTFs not only for vars/funcs, but for types too.
It's better to optimize btf_find_by_name_kind() so that it's fast
at finding structs by name too.

imo Eduard's proposal to sort all BTFs by name after dedup is the best option.
I don't think sorting will add a noticeable build time increase.
We don't need to add pahole flags either.
The kernel can handle sorted vs non-sorted BTFs transparently, e.g.
along the lines of the sketch below.
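
A rough sketch of that transparent handling (an assumption, not posted code):
once validation has established that a BTF's ids are name-sorted,
btf_find_by_name_kind() could binary-search for the lowest id with a matching
name and then walk forward over duplicates until the requested kind is found,
keeping the current linear scan as the fallback for unsorted BTF:

/* inside kernel/bpf/btf.c, so btf->start_id is visible; assumes the whole
 * id range [start_id, btf_type_cnt()) is sorted by name and that the caller
 * falls back to the existing linear search when the BTF is not sorted
 */
static s32 btf_find_sorted(const struct btf *btf, const char *name, u8 kind)
{
	u32 lo = btf->start_id, hi = btf_type_cnt(btf) - 1, id;
	const struct btf_type *t;

	while (lo < hi) {	/* lowest id whose name is >= "name" */
		u32 mid = lo + (hi - lo) / 2;

		t = btf_type_by_id(btf, mid);
		if (strcmp(btf_name_by_offset(btf, t->name_off), name) < 0)
			lo = mid + 1;
		else
			hi = mid;
	}
	for (id = lo; id < btf_type_cnt(btf); id++) {	/* scan duplicates */
		t = btf_type_by_id(btf, id);
		if (strcmp(btf_name_by_offset(btf, t->name_off), name))
			break;
		if (BTF_INFO_KIND(t->info) == kind)
			return id;
	}
	return -ENOENT;	/* not here; a split BTF would then try btf->base_btf */
}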

2023-09-17 09:12:02

by pengdonglin

[permalink] [raw]
Subject: Re: [RFC PATCH v2] bpf: Using binary search to improve the performance of btf_find_by_name_kind

On 2023/9/15 1:14, Alan Maguire wrote:
> On 14/09/2023 14:05, pengdonglin wrote:
>> On 2023/9/14 20:46, Alan Maguire wrote:
>>> On 14/09/2023 11:13, pengdonglin wrote:
>>>> On 2023/9/13 21:34, Alan Maguire wrote:
>>>>> On 13/09/2023 11:32, pengdonglin wrote:
>>>>>> On 2023/9/13 2:46, Alexei Starovoitov wrote:
>>>>>>> On Tue, Sep 12, 2023 at 10:03 AM Eduard Zingerman <[email protected]>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> On Tue, 2023-09-12 at 09:40 -0700, Alexei Starovoitov wrote:
>>>>>>>>> On Tue, Sep 12, 2023 at 7:19 AM Eduard Zingerman
>>>>>>>>> <[email protected]>
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> On Tue, 2023-09-12 at 16:51 +0300, Eduard Zingerman wrote:
>>>>>>>>>>> On Sat, 2023-09-09 at 02:16 -0700, Donglin Peng wrote:
>>>>>>>>>>>> Currently, we are only using the linear search method to find
>>>>>>>>>>>> the
>>>>>>>>>>>> type id
>>>>>>>>>>>> by the name, which has a time complexity of O(n). This change
>>>>>>>>>>>> involves
>>>>>>>>>>>> sorting the names of btf types in ascending order and using
>>>>>>>>>>>> binary search,
>>>>>>>>>>>> which has a time complexity of O(log(n)). This idea was inspired
>>>>>>>>>>>> by the
>>>>>>>>>>>> following patch:
>>>>>>>>>>>>
>>>>>>>>>>>> 60443c88f3a8 ("kallsyms: Improve the performance of
>>>>>>>>>>>> kallsyms_lookup_name()").
>>>>>>>>>>>>
>>>>>>>>>>>> At present, this improvement is only for searching in vmlinux's
>>>>>>>>>>>> and
>>>>>>>>>>>> module's BTFs, and the kind should only be BTF_KIND_FUNC or
>>>>>>>>>>>> BTF_KIND_STRUCT.
>>>>>>>>>>>>
>>>>>>>>>>>> Another change is the search direction, where we search the BTF
>>>>>>>>>>>> first and
>>>>>>>>>>>> then its base, the type id of the first matched btf_type will be
>>>>>>>>>>>> returned.
>>>>>>>>>>>>
>>>>>>>>>>>> Here is a time-consuming result that finding all the type ids of
>>>>>>>>>>>> 67,819 kernel
>>>>>>>>>>>> functions in vmlinux's BTF by their names:
>>>>>>>>>>>>
>>>>>>>>>>>> Before: 17000 ms
>>>>>>>>>>>> After:     10 ms
>>>>>>>>>>>>
>>>>>>>>>>>> The average lookup performance has improved about 1700x at the
>>>>>>>>>>>> above scenario.
>>>>>>>>>>>>
>>>>>>>>>>>> However, this change will consume more memory, for example,
>>>>>>>>>>>> 67,819 kernel
>>>>>>>>>>>> functions will allocate about 530KB memory.
>>>>>>>>>>>
>>>>>>>>>>> Hi Donglin,
>>>>>>>>>>>
>>>>>>>>>>> I think this is a good improvement. However, I wonder, why did
>>>>>>>>>>> you
>>>>>>>>>>> choose to have a separate name map for each BTF kind?
>>>>>>>>>>>
>>>>>>>>>>> I did some analysis for my local testing kernel config and got
>>>>>>>>>>> such numbers:
>>>>>>>>>>> - total number of BTF objects: 97350
>>>>>>>>>>> - number of FUNC and STRUCT objects: 51597
>>>>>>>>>>> - number of FUNC, STRUCT, UNION, ENUM, ENUM64, TYPEDEF, DATASEC
>>>>>>>>>>> objects: 56817
>>>>>>>>>>>      (these are all kinds for which lookup by name might make
>>>>>>>>>>> sense)
>>>>>>>>>>> - number of named objects: 54246
>>>>>>>>>>> - number of name collisions:
>>>>>>>>>>>      - unique names: 53985 counts
>>>>>>>>>>>      - 2 objects with the same name: 129 counts
>>>>>>>>>>>      - 3 objects with the same name: 3 counts
>>>>>>>>>>>
>>>>>>>>>>> So, it appears that having a single map for all named objects
>>>>>>>>>>> makes
>>>>>>>>>>> sense and would also simplify the implementation, what do you
>>>>>>>>>>> think?
>>>>>>>>>>
>>>>>>>>>> Some more numbers for my config:
>>>>>>>>>> - 13241 types (struct, union, typedef, enum), log2 13241 = 13.7
>>>>>>>>>> - 43575 funcs, log2 43575 = 15.4
>>>>>>>>>> Thus, having separate map for types vs functions might save ~1.7
>>>>>>>>>> search iterations. Is this a significant slowdown in practice?
>>>>>>>>>
>>>>>>>>> What do you propose to do in case of duplicates ?
>>>>>>>>> func and struct can have the same name, but they will have two
>>>>>>>>> different
>>>>>>>>> btf_ids. How do we store them ?
>>>>>>>>> Also we might add global vars to BTF. Such request came up several
>>>>>>>>> times.
>>>>>>>>> So we need to make sure our search approach scales to
>>>>>>>>> func, struct, vars. I don't recall whether we search any other
>>>>>>>>> kinds.
>>>>>>>>> Separate arrays for different kinds seems ok.
>>>>>>>>> It's a bit of code complexity, but it's not an increase in memory.
>>>>>>>>
>>>>>>>> Binary search gives, say, lowest index of a thing with name A, then
>>>>>>>> increment index while name remains A looking for correct kind.
>>>>>>>> Given the name conflicts info from above, 99% of times there
>>>>>>>> would be
>>>>>>>> no need to iterate and in very few cases there would a couple of
>>>>>>>> iterations.
>>>>>>>>
>>>>>>>> Same logic would be necessary with current approach if different BTF
>>>>>>>> kinds would be allowed in BTF_ID_NAME_* cohorts. I figured that
>>>>>>>> these
>>>>>>>> cohorts are mainly a way to split the tree for faster lookups, but
>>>>>>>> maybe that is not the main intent.
>>>>>>>>
>>>>>>>>> With 13k structs and 43k funcs it's 56k * (4 + 4) bytes, i.e. roughly
>>>>>>>>> 0.5 Mbyte of extra memory. That's quite a bit. Anything we can do to
>>>>>>>>> compress it?
>>>>>>>>
>>>>>>>> That's an interesting question; off the top of my head:
>>>>>>>> pre-sort in pahole (re-assign IDs so that an increasing ID would also
>>>>>>>> mean an "increasing" name), which shouldn't be that difficult.
>>>>>>>
>>>>>>> That sounds great. kallsyms are pre-sorted at build time.
>>>>>>> We should do the same with BTF.
>>>>>>> I think GCC can emit BTF directly now and LLVM emits it for bpf progs
>>>>>>> too,
>>>>>>> but since vmlinux and kernel module BTFs will keep being processed
>>>>>>> through pahole we don't have to make gcc/llvm sort things right away.
>>>>>>> pahole will be enough. The kernel might do an 'is it sorted' check
>>>>>>> during BTF validation and then use binary search, or fall back to
>>>>>>> linear search when not-sorted == old pahole.
>>>>>>>
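
A rough sketch of what such an "is it sorted" check could look like at BTF
load/validation time (the helper is hypothetical; it reuses btf_type_cnt()
from this patch and skips anonymous types):

/* Hypothetical: detect a name-sorted BTF once at load time, so lookups can
 * use binary search, while unsorted BTF from older pahole keeps using the
 * existing linear scan.
 */
static bool btf_names_sorted(const struct btf *btf)
{
        u32 i, cnt = btf_type_cnt(btf);
        const char *prev = "";

        for (i = btf->start_id; i < cnt; i++) {
                const struct btf_type *t = btf_type_by_id(btf, i);
                const char *name = btf_name_by_offset(btf, t->name_off);

                if (!*name)             /* skip anonymous types */
                        continue;
                if (strcmp(prev, name) > 0)
                        return false;
                prev = name;
        }
        return true;
}

btf_find_by_name_kind() could then branch on the cached result and fall back
to the current linear walk when the BTF turns out to be unsorted.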
>>>>>>
>>>>>> Yeah, I agree and will attempt to modify pahole and perform a test.
>>>>>> Do we need to introduce a new macro to control the behavior when the
>>>>>> BTF is not sorted? If it is not sorted, we can use the method mentioned
>>>>>> in this patch or use linear search.
>>>>>>
>>>>>>
>>>>>
>>>>> One challenge with pahole is that it often runs in parallel mode, so I
>>>>> suspect any sorting would have to be done after merging across threads.
>>>>> Perhaps BTF deduplication would be a useful time to re-sort by
>>>>> name? BTF dedup happens after BTF has been merged, and a new "sorted"
>>>>> btf_dedup_opts option could be added and controlled by a pahole
>>>>> option. However, dedup is pretty complicated already..
>>>>>
>>>>> One thing we should weigh up though is whether there are benefits to the
>>>>> way BTF is currently laid out. It tends to start with base types,
>>>>> and often-encountered types end up being located towards the start
>>>>> of the BTF data. For example:
>>>>>
>>>>>
>>>>> [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64
>>>>> encoding=(none)
>>>>> [2] CONST '(anon)' type_id=1
>>>>> [3] VOLATILE '(anon)' type_id=1
>>>>> [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
>>>>> [5] PTR '(anon)' type_id=8
>>>>> [6] CONST '(anon)' type_id=5
>>>>> [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>>>> [8] CONST '(anon)' type_id=7
>>>>> [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none)
>>>>> [10] CONST '(anon)' type_id=9
>>>>> [11] TYPEDEF '__s8' type_id=12
>>>>> [12] INT 'signed char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED
>>>>> [13] TYPEDEF '__u8' type_id=14
>>>>>
>>>>> So often-used types will be found quickly, even under linear search
>>>>> conditions.
>>>>
>>>> I found that there seems to be no code in the kernel that gets the ID of
>>>> a basic data type by calling btf_find_by_name_kind directly. The general
>>>> usage of this function is to obtain the ID of a structure or function.
>>>> Once we have the ID of a structure or function, it is O(1) to get the IDs
>>>> of its members or parameters.
>>>>
>>>> ./kernel/trace/trace_probe.c:383:       id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC);
>>>> ./kernel/bpf/btf.c:3523:        id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
>>>> ./kernel/bpf/btf.c:5504:                id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
>>>> ./kernel/bpf/bpf_struct_ops.c:128:      module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
>>>> ./net/ipv4/bpf_tcp_ca.c:28:     type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
>>>> ./net/ipv4/bpf_tcp_ca.c:33:     type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
>>>> ./net/netfilter/nf_bpf_link.c:181:      type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
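
All of these call sites follow the same pattern; a rough sketch (not taken
from any of the files above, "tcp_sock" is just the example from
bpf_tcp_ca.c):

static void dump_struct_members(const struct btf *btf)
{
        const struct btf_type *t;
        const struct btf_member *m;
        s32 id;
        u16 i;

        /* one name-based lookup ... */
        id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
        if (id < 0)
                return;

        /* ... then O(1) id-based access via the btf->types[] array */
        t = btf_type_by_id(btf, id);
        m = (const struct btf_member *)(t + 1);
        for (i = 0; i < BTF_INFO_VLEN(t->info); i++, m++)
                pr_info("member: %s\n",
                        btf_name_by_offset(btf, m->name_off));
}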
>>>>
>>>>>
>>>>> When we look at how many lookups by id (which are O(1), since they are
>>>>> done via the btf->types[] array) versus by name, we see:
>>>>>
>>>>> $ grep btf_type_by_id kernel/bpf/*.c|wc -l
>>>>> 120
>>>>> $ grep btf_find_by_nam kernel/bpf/*.c|wc -l
>>>>> 15
>>>>>
>>>>> I don't see a huge number of name-based lookups, and I think most are
>>>>> outside of the hotter codepaths, unless I'm missing some. All of which
>>>>> is to say it would be a good idea to have a clear sense of what will
>>>>> get
>>>>> faster with sorted-by-name BTF. Thanks!
>>>>
>>>> The story goes like this.
>>>>
>>>> I have added a new feature to the function graph tracer called
>>>> "funcgraph_retval"; here is the link:
>>>>
>>>> https://lore.kernel.org/all/1fc502712c981e0e6742185ba242992170ac9da8.1680954589.git.pengdonglin@sangfor.com.cn/
>>>>
>>>> With this feature we can obtain the return values of almost every
>>>> function during kernel execution, which can help us analyze problems.
>>>>
>>>
>>> It's a great feature!
>>
>> Thanks.
>>
>>>
>>>> However, this feature has two main drawbacks.
>>>>
>>>> 1. Even if a function's return type is void, a return value will still
>>>> be printed.
>>>>
>>>> 2. The return value printed may be incorrect when the width of the
>>>> return type is smaller than that of the generic register.
>>>>
>>>> I think if we can get the return type of the function, then the drawbacks
>>>> mentioned above can be eliminated. The function btf_find_by_name_kind can
>>>> be used to get the ID of the kernel function, and then we can get its
>>>> return type easily. If the return type is void, the recorded return value
>>>> will not be printed. If the width of the return type is smaller than that
>>>> of the generic register, then the value stored in the upper bits will be
>>>> trimmed. I have written a demo and these drawbacks were resolved.
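
Roughly, the demo does something like this (only a sketch, not the actual
demo code; sign extension for narrow signed return types is ignored, and the
function name is assumed to have already been resolved from the function
address):

static u64 fgraph_fixup_retval(const struct btf *btf, const char *funcname,
                               u64 retval, bool *print)
{
        const struct btf_type *t;
        s32 id;

        *print = true;

        id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC);
        if (id < 0)
                return retval;          /* no BTF info, print the raw value */

        /* FUNC -> FUNC_PROTO, whose ->type is the return type id */
        t = btf_type_by_id(btf, btf_type_by_id(btf, id)->type);
        if (!t->type) {                 /* return type is void */
                *print = false;
                return 0;
        }

        /* skip const/volatile/typedef to reach the real return type */
        t = btf_type_skip_modifiers(btf, t->type, NULL);
        if (btf_type_is_int(t) && t->size < sizeof(u64))
                retval &= (1ULL << (t->size * 8)) - 1;  /* drop stale upper bits */

        return retval;
}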
>>>>
>>>> However, during my test, I found that it took too much time to read the
>>>> trace log with this feature enabled, because the trace log consists of
>>>> 200,000 lines. The majority of the time was consumed by
>>>> btf_find_by_name_kind, which is called 200,000 times.
>>>>
>>>> So I think the performance of btf_find_by_name_kind may need to be
>>>> improved.
>>>>
>>>
>>> If I recall, Masami's work uses BTF ids, but can cache them since the
>>> user explicitly asks for specific fields in the trace output. I'm
>>> presuming that's not an option for you due to the fact that funcgraph
>>> tracing enables everything (or at least everything under a filter
>>> predicate) and you have limited context to work with; is that right?
>>
>> Yes, right.
>>
>>>
>>> Looking at print_graph_entry_leaf(), which I _think_ is where you'd need
>>> to print the retval from, you have access to the function address via
>>> call->func, and I presume you get the name by snprinting the symbol to
>>> a string or similar. So you're stuck in a context where you have the
>>> function address, and from that you can derive the function name. Is
>>> that correct? Thanks!
>>
>> Yes, both print_graph_return and print_graph_entry_leaf will call
>> print_graph_retval to print the return value. It then calls
>> sprint_symbol_no_offset with call->func to get the function name, and then
>> calls btf_find_by_name_kind to get the return type.
>>
>
> So what you ultimately need is a way to map from the information available
> there to the size of the return value associated with a function.
>
> On the BPF side we've been thinking a bit about the relationship between
> kernel function addresses and their BTF representations; sometimes
> knowing BTF->address mapping is needed for cases where the same function
> name has multiple inconsistent function signatures in BTF. We don't
> represent function addresses yet in BTF but may end up having to.
> The reason I mention this is that, in an ideal world, it would be
> beneficial to populate kallsyms entries with their associated BTF ids; then
> a function would need to be looked up only once in kallsyms; that lookup
> would benefit from recent speedups, and if the entry contained the
> associated BTF id we'd have an O(1) lookup from the BTF id -> function. Not
> sure if that would be tenable from the kallsyms side, but I just wanted to
> mention it, as from the above it seems an address-based lookup is a
> possibility to solve the return value type lookup problem that you're
> facing.
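
Just to make that concrete, an entirely hypothetical sketch (none of these
fields or helpers exist today):

        /* hypothetical: kallsyms keeps a BTF id next to each symbol */
        u32 btf_id = kallsyms_lookup_btf_id((unsigned long)call->func);

        if (btf_id) {
                /* FUNC -> FUNC_PROTO -> return type: all O(1) id lookups,
                 * no name-based search in BTF at all.
                 */
                const struct btf_type *proto =
                        btf_type_by_id(btf, btf_type_by_id(btf, btf_id)->type);
                /* ... use proto->type as the return type id ... */
        }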
>
> Cc'ed Jiri who had to wrestle with kallsyms for the kprobe multi stuff.
> Would the above help do you think?

Thank you, but I tend to agree with Alexei. Using kallsyms will consume extra
memory too, but it can only be used for functions. I have done a test using
kallsyms_lookup + the optimized btf_find_by_name_kind, and the time consumed
was not excessive: it took about 38 ms to find the IDs of 67,823 kernel
functions. When using a new map of function addresses and IDs instead, it
took about 5 ms.

>
> Thanks!
>
> Alan
>