Note: while prefixed with "bpf", this set is based on Linus' master branch.
This is v2 of [1], revised based on Peter's feedback. The original patch is
also split into 3 patches.
[1] https://lore.kernel.org/linux-mm/[email protected]/
Song Liu (3):
bpf: fill new bpf_prog_pack with illegal instructions
x86/alternative: introduce text_poke_set
bpf: introduce bpf_arch_text_invalidate for bpf_prog_pack
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/kernel/alternative.c | 70 ++++++++++++++++++++++++----
arch/x86/net/bpf_jit_comp.c | 5 ++
include/linux/bpf.h | 1 +
kernel/bpf/core.c | 18 +++++--
5 files changed, 81 insertions(+), 14 deletions(-)
--
2.30.2
Introduce a memset-like API for text_poke. This will be used to fill unused
RX (read-only, executable) memory with illegal instructions.
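A minimal usage sketch (rx_region and region_size are hypothetical names,
not part of this patch). Note that, unlike text_poke(), which asserts that
the caller holds text_mutex, text_poke_set() takes the mutex internally,
so the caller must not hold it:

	/* Stamp INT3 (0xcc) over a region of RX memory so that a stray
	 * jump into it traps instead of executing stale bytes.
	 */
	if (IS_ERR_OR_NULL(text_poke_set(rx_region, 0xcc, region_size)))
		pr_warn("failed to invalidate RX region\n");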
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Song Liu <[email protected]>
---
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/kernel/alternative.c | 70 ++++++++++++++++++++++++----
2 files changed, 61 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index d20ab0921480..1cc15528ce29 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d374cb3cf024..732814065389 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -994,7 +994,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
-static void *__text_poke(void *addr, const void *opcode, size_t len)
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(int *)src;
+
+ memset(dst, c, len);
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
@@ -1059,7 +1073,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
prev = use_temporary_mm(poking_mm);
kasan_disable_current();
- memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+ func((u8 *)poking_addr + offset_in_page(addr), src, len);
kasan_enable_current();
/*
@@ -1087,11 +1101,13 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
PAGE_SHIFT, false);
- /*
- * If the text does not match what we just wrote then something is
- * fundamentally screwy; there's nothing we can really do about that.
- */
- BUG_ON(memcmp(addr, opcode, len));
+ if (func == text_poke_memcpy) {
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, src, len));
+ }
local_irq_restore(flags);
pte_unmap_unlock(ptep, ptl);
@@ -1118,7 +1134,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
{
lockdep_assert_held(&text_mutex);
- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
}
/**
@@ -1137,7 +1153,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
*/
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
}
/**
@@ -1167,7 +1183,41 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
- __text_poke((void *)ptr, opcode + patched, s);
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
+}
+
+/**
+ * text_poke_set - memset into (an unused part of) RX memory
+ * @addr: address to modify
+ * @c: the byte to fill the area with
+ * @len: length to set, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to overwrite
+ * unused regions of RX memory with illegal instructions. Can be used
+ * in conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
patched += s;
}
mutex_unlock(&text_mutex);
--
2.30.2
bpf_prog_pack enables sharing huge pages among multiple BPF programs.
These pages are marked as executable before the JIT engine fills them with
BPF programs. To make these pages safe, fill the whole bpf_prog_pack with
illegal instructions before making it executable.
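On x86, the fill callback passed in here is the JIT's existing
jit_fill_hole() (its body appears as context in patch 3 below), which
memsets the area with 0xcc, the INT3 opcode, so any stray execution of the
filler traps:

	static void jit_fill_hole(void *area, unsigned int size)
	{
		/* fill the whole area with INT3 so stray execution traps */
		memset(area, 0xcc, size);
	}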
Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Signed-off-by: Song Liu <[email protected]>
---
kernel/bpf/core.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..132dfba389be 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void)
return size;
}
-static struct bpf_prog_pack *alloc_new_pack(void)
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
@@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
kfree(pack);
return NULL;
}
+ bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
list_add_tail(&pack->list, &pack_list);
@@ -895,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
return pack;
}
-static void *bpf_prog_pack_alloc(u32 size)
+static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
struct bpf_prog_pack *pack;
@@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size)
size = round_up(size, PAGE_SIZE);
ptr = module_alloc(size);
if (ptr) {
+ bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
@@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size)
goto found_free_area;
}
- pack = alloc_new_pack();
+ pack = alloc_new_pack(bpf_fill_ill_insns);
if (!pack)
goto out;
@@ -1102,7 +1104,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
if (bpf_jit_charge_modmem(size))
return NULL;
- ro_header = bpf_prog_pack_alloc(size);
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
if (!ro_header) {
bpf_jit_uncharge_modmem(size);
return NULL;
--
2.30.2
Introduce bpf_arch_text_invalidate and use it to fill the freed portion of
bpf_prog_pack with illegal instructions when a BPF program is freed.
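The hook follows the usual __weak override pattern: kernel/bpf/core.c
carries a stub returning -ENOTSUPP for architectures without a suitable
text-poking API, and x86 overrides it with text_poke_set(). Condensed from
the hunks below:

	/* generic stub in kernel/bpf/core.c */
	int __weak bpf_arch_text_invalidate(void *dst, size_t len)
	{
		return -ENOTSUPP;
	}

	/* x86 override: 0xcc is INT3, the same filler jit_fill_hole() uses */
	int bpf_arch_text_invalidate(void *dst, size_t len)
	{
		return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
	}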
Signed-off-by: Song Liu <[email protected]>
---
arch/x86/net/bpf_jit_comp.c | 5 +++++
include/linux/bpf.h | 1 +
kernel/bpf/core.c | 8 ++++++++
3 files changed, 14 insertions(+)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 16b6efacf7c6..9ce65570264c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -228,6 +228,11 @@ static void jit_fill_hole(void *area, unsigned int size)
memset(area, 0xcc, size);
}
+int bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
+}
+
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bdb5298735ce..e884a1f39023 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2381,6 +2381,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
void *bpf_arch_text_copy(void *dst, void *src, size_t len);
+int bpf_arch_text_invalidate(void *dst, size_t len);
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 132dfba389be..0f9a16f7b2a8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -968,6 +968,9 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+ WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
bitmap_clear(pack->bitmap, pos, nbits);
if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
bpf_prog_chunk_count(), 0) == 0) {
@@ -2729,6 +2732,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
return ERR_PTR(-ENOTSUPP);
}
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return -ENOTSUPP;
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
--
2.30.2
Hi Linus,
> On Apr 25, 2022, at 1:39 PM, Song Liu <[email protected]> wrote:
>
> Note: while prefixed with "bpf", this set is based on Linus' master branch.
>
> This is v2 of [1]. It is revised based on Peter's feedback. The patch is
> also split into 3.
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/
Could you please share your suggestions on this set? Shall we ship it
with 5.18?
Thanks,
Song
>
> Song Liu (3):
> bpf: fill new bpf_prog_pack with illegal instructions
> x86/alternative: introduce text_poke_set
> bpf: introduce bpf_arch_text_invalidate for bpf_prog_pack
>
> arch/x86/include/asm/text-patching.h | 1 +
> arch/x86/kernel/alternative.c | 70 ++++++++++++++++++++++++----
> arch/x86/net/bpf_jit_comp.c | 5 ++
> include/linux/bpf.h | 1 +
> kernel/bpf/core.c | 18 +++++--
> 5 files changed, 81 insertions(+), 14 deletions(-)
>
> --
> 2.30.2
On Wed, Apr 27, 2022 at 3:24 PM Song Liu <[email protected]> wrote:
>
> Could you please share your suggestions on this set? Shall we ship it
> with 5.18?
I'd personally prefer to just not do the prog_pack thing at all, since
I don't think it was actually in a "ready to ship" state for this
merge window, and the hugepage mapping protection games I'm still
leery of.
Yes, the hugepage protection things probably do work from what I saw
when I looked through them, but that x86 vmalloc hugepage code was
really designed for another use (non-refcounted device pages), so the
fact that it all actually seems surprisingly ok certainly wasn't
because the code was designed to do that new case.
Does the prog_pack thing work with small pages?
Yes. But that wasn't what it was designed for or its selling point, so
it all is a bit suspect to me.
And I'm looking at those set_memory_xyz() calls, and I'm going "yeah,
I think it works on x86, but on ppc I look at it and I see
static inline int set_memory_ro(unsigned long addr, int numpages)
{
        return change_memory_attr(addr, numpages, SET_MEMORY_RO);
}
and then in change_memory_attr() it does
        if (WARN_ON_ONCE(is_vmalloc_or_module_addr((void *)addr) &&
                         is_vm_area_hugepages((void *)addr)))
                return -EINVAL;
and I'm "this is all supposedly generic code, but I'm not seeing how
it works cross-architecture".
I *think* it's actually because this is all basically x86-specific due
to x86 being the only architecture that implements
bpf_arch_text_copy(), and everybody else then ends up erroring out and
not using the prog_pack thing after all.
And then one of the two places that use bpf_arch_text_copy() doesn't
even check the returned error code.
So this all ends up just depending on "only x86 will actually succeed
in bpf_jit_binary_pack_finalize(), everybody else will fail after
having done all the common setup".
End result: it all seems a bit broken right now. The "generic" code
only works on x86, and on other architectures it goes through the
motions and then fails at the end. And even on x86 I worry about
actually enabling it fully.
I'm not having the warm and fuzzies about this all, in other words.
Maybe people can convince me otherwise, but I think you need to work at it.
And even for 5.19+ kind of timeframes, I'd actually like the x86
people who maintain arch/x86/mm/pat/set_memory.c also sign off on
using that code for hugepage vmalloc mappings too.
I *think* it does. But that code has various subtle things going on.
I see PeterZ is cc'd (presumably because of the text_poke() stuff, not
because of the whole "call set_memory_ro() on virtually mapped kernel
largepage memory").
Did people even talk to x86 people about this, or did the whole "it
works, except it turns out set_vm_flush_reset_perms() doesn't work"
mean that the authors of that code never got involved?
Linus
Hi Linus,
Thanks for your thorough analysis of the situation, which makes a lot of
sense.
> On Apr 27, 2022, at 6:45 PM, Linus Torvalds <[email protected]> wrote:
>
> On Wed, Apr 27, 2022 at 3:24 PM Song Liu <[email protected]> wrote:
>>
>> Could you please share your suggestions on this set? Shall we ship it
>> with 5.18?
>
> I'd personally prefer to just not do the prog_pack thing at all, since
> I don't think it was actually in a "ready to ship" state for this
> merge window, and the hugepage mapping protection games I'm still
> leery of.
>
> Yes, the hugepage protection things probably do work from what I saw
> when I looked through them, but that x86 vmalloc hugepage code was
> really designed for another use (non-refcounted device pages), so the
> fact that it all actually seems surprisingly ok certainly wasn't
> because the code was designed to do that new case.
>
> Does the prog_pack thing work with small pages?
>
> Yes. But that wasn't what it was designed for or its selling point, so
> it all is a bit suspect to me.
prog_pack on small pages can also reduce direct map fragmentation. This is
because libbpf uses tiny BPF programs to probe kernel features, and before
prog_pack each of these programs got its own pages, any of which could
fragment the direct map. For example, runqslower (tools/bpf/runqslower/)
loads 7 BPF programs in total (3 actual programs and 4 tiny probe
programs). With prog_pack, OTOH, these BPF programs would fit in a single
page (or even share pages with other tools).
>
> And I'm looking at those set_memory_xyz() calls, and I'm going "yeah,
> I think it works on x86, but on ppc I look at it and I see
>
> static inline int set_memory_ro(unsigned long addr, int numpages)
> {
>         return change_memory_attr(addr, numpages, SET_MEMORY_RO);
> }
>
> and then in change_memory_attr() it does
>
>         if (WARN_ON_ONCE(is_vmalloc_or_module_addr((void *)addr) &&
>                          is_vm_area_hugepages((void *)addr)))
>                 return -EINVAL;
>
> and I'm "this is all supposedly generic code, but I'm not seeing how
> it works cross-architecture".
AFAICT, we have spent the time and effort to design bpf_prog_pack to
work cross-architecture. However, since prog_pack requires relatively
new building blocks in multiple layers (vmalloc, set_memory_XXX,
bpf_jit, etc.), non-x86 architectures have more missing pieces.
The fact that we cannot rely on set_vm_flush_reset_perms() for huge
pages on x86 does leak some architectural details into the generic code.
But I guess we don't really need that hack (skipping
set_vm_flush_reset_perms() and calling the set_memory_*() helpers
manually) for prog_pack on small pages?
>
> I *think* it's actually because this is all basically x86-specific due
> to x86 being the only architecture that implements
> bpf_arch_text_copy(), and everybody else then ends up erroring out and
> not using the prog_pack thing after all.
>
> And then one of the two places that use bpf_arch_text_copy() doesn't
> even check the returned error code.
>
> So this all ends up just depending on "only x86 will actually succeed
> in bpf_jit_binary_pack_finalize(), everybody else will fail after
> having done all the common setup".
>
> End result: it all seems a bit broken right now. The "generic" code
> only works on x86, and on other architectures it goes through the
> motions and then fails at the end. And even on x86 I worry about
> actually enabling it fully.
>
> I'm not having the warm and fuzzies about this all, in other words.
>
> Maybe people can convince me otherwise, but I think you need to work at it.
>
> And even for 5.19+ kind of timeframes, I'd actually like the x86
> people who maintain arch/x86/mm/pat/set_memory.c also sign off on
> using that code for hugepage vmalloc mappings too.
>
> I *think* it does. But that code has various subtle things going on.
>
> I see PeterZ is cc'd (presumably because of the text_poke() stuff, not
> because of the whole "call set_memory_ro() on virtually mapped kernel
> largepage memory").
>
> Did people even talk to x86 people about this, or did the whole "it
> works, except it turns out set_vm_flush_reset_perms() doesn't work"
> mean that the authors of that code never got involved?
We have CC'ed x86 folks (at least on some versions), but we didn't get
much feedback until recently. We should definitely do better with this
in the future.
set_vm_flush_reset_perms() is clearly a problem here. But if we don't
use huge pages (current upstream + this set), we shouldn't need that
workaround.
Overall, we do hope to eliminate (or at least reduce) the system slowdown
caused by direct map fragmentation. Ideally, we want to achieve this with
a small number of huge pages; if huge pages don't work out, small pages
would also help. Given that we already call set_memory_*() on BPF programs
(in bpf_jit_binary_lock_ro), I think the risk with small pages is pretty
low. And we should still see a non-trivial performance improvement from
4kB bpf_prog_pack (I don't have full benchmark results at the moment).
Thanks again,
Song
> On Apr 27, 2022, at 11:48 PM, Song Liu <[email protected]> wrote:
>
> Hi Linus,
>
> Thanks for your thorough analysis of the situation, which makes a lot of
> sense.
>
>> On Apr 27, 2022, at 6:45 PM, Linus Torvalds <[email protected]> wrote:
>>
>> On Wed, Apr 27, 2022 at 3:24 PM Song Liu <[email protected]> wrote:
>>>
>>> Could you please share your suggestions on this set? Shall we ship it
>>> with 5.18?
>>
>> I'd personally prefer to just not do the prog_pack thing at all, since
>> I don't think it was actually in a "ready to ship" state for this
>> merge window, and the hugepage mapping protection games I'm still
>> leery of.
>>
>> Yes, the hugepage protection things probably do work from what I saw
>> when I looked through them, but that x86 vmalloc hugepage code was
>> really designed for another use (non-refcounted device pages), so the
>> fact that it all actually seems surprisingly ok certainly wasn't
>> because the code was designed to do that new case.
>>
>> Does the prog_pack thing work with small pages?
>>
>> Yes. But that wasn't what it was designed for or its selling point, so
>> it all is a bit suspect to me.
>
> prog_pack on small pages can also reduce direct map fragmentation. This is
> because libbpf uses tiny BPF programs to probe kernel features, and before
> prog_pack each of these programs got its own pages, any of which could
> fragment the direct map. For example, runqslower (tools/bpf/runqslower/)
> loads 7 BPF programs in total (3 actual programs and 4 tiny probe
> programs). With prog_pack, OTOH, these BPF programs would fit in a single
> page (or even share pages with other tools).
Here are some performance data from our web service production benchmark,
which is the biggest service in our fleet. We compare 3 kernels:
nopack: no bpf_prog_pack; IOW, the same behavior as 5.17
4kpack: use bpf_prog_pack on 4kB pages (same as 5.18-rc5)
2mpack: use bpf_prog_pack on 2MB pages
The benchmark measures system throughput under latency constraints.
4kpack provides 0.5% to 0.7% more throughput than nopack.
2mpack provides 0.6% to 0.9% more throughput than nopack.
So the data has confirmed:
1. Direct map fragmentation has a non-trivial impact on system performance;
2. While 2MB pages are preferred, bpf_prog_pack on 4kB pages also gives
significant performance improvements.
Thanks,
Song
> On May 6, 2022, at 11:50 PM, Song Liu <[email protected]> wrote:
>
>
>
>> On Apr 27, 2022, at 11:48 PM, Song Liu <[email protected]> wrote:
>>
>> Hi Linus,
>>
>> Thanks for your thorough analysis of the situation, which makes a lot of
>> sense.
>>
>>> On Apr 27, 2022, at 6:45 PM, Linus Torvalds <[email protected]> wrote:
>>>
>>> On Wed, Apr 27, 2022 at 3:24 PM Song Liu <[email protected]> wrote:
>>>>
>>>> Could you please share your suggestions on this set? Shall we ship it
>>>> with 5.18?
>>>
>>> I'd personally prefer to just not do the prog_pack thing at all, since
>>> I don't think it was actually in a "ready to ship" state for this
>>> merge window, and the hugepage mapping protection games I'm still
>>> leery of.
>>>
>>> Yes, the hugepage protection things probably do work from what I saw
>>> when I looked through them, but that x86 vmalloc hugepage code was
>>> really designed for another use (non-refcounted device pages), so the
>>> fact that it all actually seems surprisingly ok certainly wasn't
>>> because the code was designed to do that new case.
>>>
>>> Does the prog_pack thing work with small pages?
>>>
>>> Yes. But that wasn't what it was designed for or its selling point, so
>>> it all is a bit suspect to me.
>>
>> prog_pack on small pages can also reduce direct map fragmentation. This is
>> because libbpf uses tiny BPF programs to probe kernel features, and before
>> prog_pack each of these programs got its own pages, any of which could
>> fragment the direct map. For example, runqslower (tools/bpf/runqslower/)
>> loads 7 BPF programs in total (3 actual programs and 4 tiny probe
>> programs). With prog_pack, OTOH, these BPF programs would fit in a single
>> page (or even share pages with other tools).
>
> Here are some performance data from our web service production benchmark,
> which is the biggest service in our fleet. We compare 3 kernels:
>
> nopack: no bpf_prog_pack; IOW, the same behavior as 5.17
> 4kpack: use bpf_prog_pack on 4kB pages (same as 5.18-rc5)
> 2mpack: use bpf_prog_pack on 2MB pages
>
> The benchmark measures system throughput under latency constraints.
> 4kpack provides 0.5% to 0.7% more throughput than nopack.
> 2mpack provides 0.6% to 0.9% more throughput than nopack.
>
> So the data has confirmed:
> 1. Direct map fragmentation has a non-trivial impact on system performance;
> 2. While 2MB pages are preferred, bpf_prog_pack on 4kB pages also gives
> significant performance improvements.
Please note that 0.5% is a huge improvement for our fleet. I believe this
is also significant for other companies with many thousands of servers.
Thanks,
Song