2022-04-22 10:28:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack

> > On Apr 21, 2022, at 3:30 PM, Linus Torvalds <[email protected]> wrote:

> > I actually think bpf_arch_text_copy() is another horribly badly done thing.
> >
> > It seems only implemented on x86 (I'm not sure how anything else is
> > supposed to work, I didn't go look), and there it is horribly badly
> > done, using __text_poke() that does all these magical things just to
> > make it atomic wrt concurrent code execution.
> >
> > None of which is *AT*ALL* relevant for this case, since concurrent
> > code execution simply isn't a thing (and if it were, you would already
> > have lost).
> >
> > And if that wasn't pointless enough, it does all that magic "map the
> > page writably at a different virtual address using poking_addr in
> > poking_mm" and a different address space entirely.
> >
> > All of that is required for REAL KERNEL CODE.
> >
> > But the thing is, for bpf_prog_pack, all of that is just completely
> > pointless and stupid complexity.

I think the point is that this hole will likely share a page with active
code, and as such there should not be a writable mapping to it,
necessitating the whole __text_poke() mess.

That said; it does seem somewhat silly to have a whole page worth of int3
around just for this.

Perhaps we can do something like the completely untested below?

---
arch/x86/kernel/alternative.c | 48 +++++++++++++++++++++++++++++++++++++------
1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d374cb3cf024..60afa9105307 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -994,7 +994,20 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

-static void *__text_poke(void *addr, const void *opcode, size_t len)
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(int *)src;
+ memset(dst, c, len);
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
@@ -1059,7 +1072,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
prev = use_temporary_mm(poking_mm);

kasan_disable_current();
- memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+ func((void *)poking_addr + offset_in_page(addr), src, len);
kasan_enable_current();

/*
@@ -1091,7 +1104,8 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
* If the text does not match what we just wrote then something is
* fundamentally screwy; there's nothing we can really do about that.
*/
- BUG_ON(memcmp(addr, opcode, len));
+ if (func == text_poke_memcpy)
+ BUG_ON(memcmp(addr, src, len));

local_irq_restore(flags);
pte_unmap_unlock(ptep, ptl);
@@ -1118,7 +1132,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
{
lockdep_assert_held(&text_mutex);

- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
@@ -1137,7 +1151,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
*/
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
- return __text_poke(addr, opcode, len);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
}

/**
@@ -1167,7 +1181,29 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)

s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

- __text_poke((void *)ptr, opcode + patched, s);
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
+}
+
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
patched += s;
}
mutex_unlock(&text_mutex);


2022-04-23 07:52:17

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH bpf] bpf: invalidate unused part of bpf_prog_pack

On Fri, Apr 22, 2022 at 12:31 AM Peter Zijlstra <[email protected]> wrote:
>
> > > On Apr 21, 2022, at 3:30 PM, Linus Torvalds <[email protected]> wrote:
>
> > > I actually think bpf_arch_text_copy() is another horribly badly done thing.
> > >
> > > It seems only implemented on x86 (I'm not sure how anything else is
> > > supposed to work, I didn't go look), and there it is horribly badly
> > > done, using __text_poke() that does all these magical things just to
> > > make it atomic wrt concurrent code execution.
> > >
> > > None of which is *AT*ALL* relevant for this case, since concurrent
> > > code execution simply isn't a thing (and if it were, you would already
> > > have lost).
> > >
> > > And if that wasn't pointless enough, it does all that magic "map the
> > > page writably at a different virtual address using poking_addr in
> > > poking_mm" and a different address space entirely.
> > >
> > > All of that is required for REAL KERNEL CODE.
> > >
> > > But the thing is, for bpf_prog_pack, all of that is just completely
> > > pointless and stupid complexity.
>
> I think the point is that this hole will likely share a page with active
> code, and as such there should not be a writable mapping to it,
> necessitating the whole __text_poke() mess.
>
> That said; it does seem somewhat silly to have a whole page worth of int3
> around just for this.
>
> Perhaps we can do something like the completely untested below?

Yeah, this looks like a better approach. I will draft v2 based on this.

Thanks,
Song

>
> ---
> arch/x86/kernel/alternative.c | 48 +++++++++++++++++++++++++++++++++++++------
> 1 file changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index d374cb3cf024..60afa9105307 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -994,7 +994,20 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
> __ro_after_init struct mm_struct *poking_mm;
> __ro_after_init unsigned long poking_addr;
>
> -static void *__text_poke(void *addr, const void *opcode, size_t len)
> +static void text_poke_memcpy(void *dst, const void *src, size_t len)
> +{
> + memcpy(dst, src, len);
> +}
> +
> +static void text_poke_memset(void *dst, const void *src, size_t len)
> +{
> + int c = *(int *)src;
> + memset(dst, c, len);
> +}
> +
> +typedef void text_poke_f(void *dst, const void *src, size_t len);
> +
> +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
> {
> bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
> struct page *pages[2] = {NULL};
> @@ -1059,7 +1072,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
> prev = use_temporary_mm(poking_mm);
>
> kasan_disable_current();
> - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
> + func((void *)poking_addr + offset_in_page(addr), src, len);
> kasan_enable_current();
>
> /*
> @@ -1091,7 +1104,8 @@ static void *__text_poke(void *addr, const void *opcode, size_t len)
> * If the text does not match what we just wrote then something is
> * fundamentally screwy; there's nothing we can really do about that.
> */
> - BUG_ON(memcmp(addr, opcode, len));
> + if (func == text_poke_memcpy)
> + BUG_ON(memcmp(addr, src, len));
>
> local_irq_restore(flags);
> pte_unmap_unlock(ptep, ptl);
> @@ -1118,7 +1132,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
> {
> lockdep_assert_held(&text_mutex);
>
> - return __text_poke(addr, opcode, len);
> + return __text_poke(text_poke_memcpy, addr, opcode, len);
> }
>
> /**
> @@ -1137,7 +1151,7 @@ void *text_poke(void *addr, const void *opcode, size_t len)
> */
> void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
> {
> - return __text_poke(addr, opcode, len);
> + return __text_poke(text_poke_memcpy, addr, opcode, len);
> }
>
> /**
> @@ -1167,7 +1181,29 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
>
> s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
>
> - __text_poke((void *)ptr, opcode + patched, s);
> + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
> + patched += s;
> + }
> + mutex_unlock(&text_mutex);
> + return addr;
> +}
> +
> +void *text_poke_set(void *addr, int c, size_t len)
> +{
> + unsigned long start = (unsigned long)addr;
> + size_t patched = 0;
> +
> + if (WARN_ON_ONCE(core_kernel_text(start)))
> + return NULL;
> +
> + mutex_lock(&text_mutex);
> + while (patched < len) {
> + unsigned long ptr = start + patched;
> + size_t s;
> +
> + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
> +
> + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
> patched += s;
> }
> mutex_unlock(&text_mutex);