2019-08-27 18:15:27

by Peter Zijlstra

Subject: [PATCH 3/3] x86/ftrace: Use text_poke()

Move ftrace over to using the generic x86 text_poke functions; this
avoids having a second/different copy of that code around.

This also avoids ftrace violating the (new) W^X rule and avoids
fragmenting the kernel text page-tables, due to no longer having to
toggle them RW.

Cc: Steven Rostedt <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/ftrace.h | 2
arch/x86/kernel/alternative.c | 4
arch/x86/kernel/ftrace.c | 630 ++++++------------------------------------
arch/x86/kernel/traps.c | 9
4 files changed, 93 insertions(+), 552 deletions(-)

--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -35,8 +35,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};

-int ftrace_int3_handler(struct pt_regs *regs);
-
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR

#endif /* CONFIG_DYNAMIC_FTRACE */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -941,7 +941,7 @@ static struct bp_patching_desc {
int nr_entries;
} bp_patching;

-static int patch_cmp(const void *key, const void *elt)
+static int notrace patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;

@@ -953,7 +953,7 @@ static int patch_cmp(const void *key, co
}
NOKPROBE_SYMBOL(patch_cmp);

-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
void *ip;
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -43,16 +43,12 @@ int ftrace_arch_code_modify_prepare(void
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
- set_kernel_text_rw();
- set_all_modules_text_rw();
return 0;
}

int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
- set_all_modules_text_ro();
- set_kernel_text_ro();
mutex_unlock(&text_mutex);
return 0;
}
@@ -60,67 +56,34 @@ int ftrace_arch_code_modify_post_process
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- unsigned char op;
+ char op;
int offset;
} __attribute__((packed));
};

-static int ftrace_calc_offset(long ip, long addr)
-{
- return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_text_replace(char op, unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;

- calc.op = op;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+ calc.op = op;
+ calc.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE));

return calc.code;
}

-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
- return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
-}
-
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_nop_replace(void)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only, so we use
- * the kernel identity mapping instead of the kernel text mapping
- * to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return ip;
+ return ideal_nops[NOP_ATOMIC5];
}

-static const unsigned char *ftrace_nop_replace(void)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- return ideal_nops[NOP_ATOMIC5];
+ return ftrace_text_replace(CALL_INSN_OPCODE, ip, addr);
}

-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- ftrace_expected = old_code;
+ char cur_code[MCOUNT_INSN_SIZE];

/*
* Note:
@@ -129,31 +92,38 @@ ftrace_modify_code_direct(unsigned long
* Carefully read and modify the code with probe_kernel_*(), and make
* sure what we read is what we expected it to be before modifying it.
*/
-
/* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
return -EFAULT;
+ }

/* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ON(1);
return -EINVAL;
+ }

- ip = text_ip_addr(ip);
-
- /* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
- return -EPERM;
+ return 0;
+}

- sync_core();
+static int
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
+{
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;

+ /* replace the text with the new text */
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}

-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;

old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
@@ -167,19 +137,20 @@ int ftrace_make_nop(struct module *mod,
* just modify the code directly.
*/
if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
-
- ftrace_expected = NULL;
+ return ftrace_modify_code_direct(ip, old, new);

- /* Normal cases use add_brk_on_nop */
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
}

int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;

old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
@@ -189,43 +160,6 @@ int ftrace_make_call(struct dyn_ftrace *
}

/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -237,452 +171,84 @@ int ftrace_modify_call(struct dyn_ftrace
unsigned long addr)
{
WARN_ON(1);
- ftrace_expected = NULL;
return -EINVAL;
}

-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
- unsigned char old[MCOUNT_INSN_SIZE];
- int ret;
-
- memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
- ftrace_update_func = ip;
- /* Make sure the breakpoints see the ftrace_update_func update */
- smp_wmb();
-
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
- ret = ftrace_modify_code(ip, old, new);
-
- atomic_dec(&modifying_ftrace_code);
-
- return ret;
-}
-
int ftrace_update_ftrace_func(ftrace_func_t func)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char *new;
- int ret;
-
- ftrace_update_func_call = (unsigned long)func;
-
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
-
- /* Also update the regs callback function */
- if (!ret) {
- ip = (unsigned long)(&ftrace_regs_call);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- }
-
- return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
- if (ip == ftrace_update_func)
- return 1;
-
- return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
unsigned long ip;
+ const char *new;

- if (WARN_ON_ONCE(!regs))
- return 0;
-
- ip = regs->ip - INT3_INSN_SIZE;
-
- if (ftrace_location(ip)) {
- int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
- return 1;
- } else if (is_ftrace_caller(ip)) {
- if (!ftrace_update_func_call) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
- int3_emulate_call(regs, ftrace_update_func_call);
- return 1;
- }
-
- return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- ip = text_ip_addr(ip);
-
- if (probe_kernel_write((void *)ip, val, size))
- return -EPERM;
-
- return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- ftrace_expected = old;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ftrace_addr = ftrace_get_addr_curr(rec);
-
- ret = ftrace_test_record(rec, enable);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return 0;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = ftrace_get_addr_new(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = ftrace_get_addr_curr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- ftrace_expected = nop;
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
- /* skip breakpoint */
- ip++;
- new++;
- return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
- return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
- }
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);

- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);

return 0;
}

-static void do_sync_core(void *data)
-{
- sync_core();
-}
-
-static void run_sync(void)
-{
- int enable_irqs;
-
- /* No need to sync if there's only one CPU */
- if (num_online_cpus() == 1)
- return;
-
- enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disabled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
-}
-
void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ const char *new, *old;
int ret;

for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);

- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
-
- report = "updating code";
- count = 0;
-
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
-
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ };
+
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return;
+ }
}

- run_sync();
-
- report = "removing breakpoints";
- count = 0;
-
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);

- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
-
- return;
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ };

- remove_breakpoints:
- pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- /*
- * Breakpoints are handled only when this function is in
- * progress. The system could not work with them.
- */
- if (remove_breakpoint(rec))
- BUG();
+ text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
}
- run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- /*
- * The breakpoint is handled only when this function is in progress.
- * The system could not work if we could not remove it.
- */
- BUG_ON(ret);
- out:
- run_sync();
- return ret;
-
- fail_update:
- /* Also here the system could not work with the breakpoint */
- if (ftrace_write(ip, old_code, 1))
- BUG();
- goto out;
+ text_poke_finish();
}

void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}

int __init ftrace_dyn_arch_init(void)
@@ -828,11 +394,7 @@ create_trampoline(struct ftrace_ops *ops

set_vm_flush_reset_perms(trampoline);

- /*
- * Module allocation needs to be completed by making the page
- * executable. The page is still writable, which is a security hazard,
- * but anyhow ftrace breaks W^X completely.
- */
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -859,11 +421,10 @@ static unsigned long calc_trampoline_cal
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
- unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret, npages;
+ const char *new;

if (ops->trampoline) {
/*
@@ -872,14 +433,11 @@ void arch_ftrace_update_trampoline(struc
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
- set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
- npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}

offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -887,15 +445,11 @@ void arch_ftrace_update_trampoline(struc

func = ftrace_ops_get_func(ops);

- ftrace_update_func_call = (unsigned long)func;
-
+ mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- set_memory_ro(ops->trampoline, npages);
-
- /* The update should never fail */
- WARN_ON(ret);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}

/* Return the address of the function the trampoline calls */
@@ -981,19 +535,18 @@ void arch_ftrace_trampoline_free(struct
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);

-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- return ftrace_text_replace(0xe9, ip, addr);
+ return ftrace_text_replace(JMP32_INSN_OPCODE, ip, addr);
}

static int ftrace_mod_jmp(unsigned long ip, void *func)
{
- unsigned char *new;
+ const char *new;

- ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
-
- return update_ftrace_func(ip, new);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); // BATCH
+ return 0;
}

int ftrace_enable_ftrace_graph_caller(void)
@@ -1019,10 +572,9 @@ int ftrace_disable_ftrace_graph_caller(v
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
int faulted;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;

/*
* When resuming from suspend-to-ram, this function can be indirectly
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -568,15 +568,6 @@ NOKPROBE_SYMBOL(do_general_protection);

dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;




Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On 27/08/2019 20:06, Peter Zijlstra wrote:
> Move ftrace over to using the generic x86 text_poke functions; this
> avoids having a second/different copy of that code around.
>
> This also avoids ftrace violating the (new) W^X rule and avoids
> fragmenting the kernel text page-tables, due to no longer having to
> toggle them RW.

I tested this patch, and it works... but it generates more IPIs than the
previous one.

> Cc: Steven Rostedt <[email protected]>
> Cc: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/x86/include/asm/ftrace.h | 2
> arch/x86/kernel/alternative.c | 4
> arch/x86/kernel/ftrace.c | 630 ++++++------------------------------------
> arch/x86/kernel/traps.c | 9
> 4 files changed, 93 insertions(+), 552 deletions(-)
>
> --- a/arch/x86/include/asm/ftrace.h
> +++ b/arch/x86/include/asm/ftrace.h

[ ... jumping to the point ...]

> -
> void ftrace_replace_code(int enable)
> {
> struct ftrace_rec_iter *iter;
> struct dyn_ftrace *rec;
> - const char *report = "adding breakpoints";
> - int count = 0;
> + const char *new, *old;
> int ret;
>
> for_ftrace_rec_iter(iter) {
> rec = ftrace_rec_iter_record(iter);
>
> - ret = add_breakpoints(rec, enable);
> - if (ret)
> - goto remove_breakpoints;
> - count++;
> - }
> -
> - run_sync();

ftrace was already batching the updates, for instance, causing 3 IPIs to enable
all functions. The text_poke() batching also works. But because of the limited
buffer [ see the reply to the patch 2/3 ], it is flushing the buffer during the
operation, causing more IPIs than the previous code. Using the 5.4-rc1 in a VM,
when enabling the function tracer, I see 250+ intermediate text_poke_finish()
because of a full buffer...
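
[ For illustration only, a minimal user-space sketch of why the bounded
buffer turns into intermediate flushes -- TP_VEC_MAX = 256 and ~64000
ftrace records are made-up numbers here, and the real tp_vec/queue logic
lives in patch 2/3: ]

/*
 * Toy simulation, not kernel code: count how many times a fixed-size
 * batch vector has to be flushed while queueing every patch site.
 */
#include <stdio.h>

#define TP_VEC_MAX 256          /* assumed size of the static vector */

static int tp_vec_nr;           /* entries currently queued */
static int nr_flushes;          /* text_poke_finish()-like flushes */

static void poke_flush(void)
{
        if (tp_vec_nr) {
                nr_flushes++;   /* each flush is another round of IPIs */
                tp_vec_nr = 0;
        }
}

static void poke_queue(void)
{
        if (tp_vec_nr == TP_VEC_MAX)    /* full: forced intermediate flush */
                poke_flush();
        tp_vec_nr++;
}

int main(void)
{
        int nr_recs = 64000;            /* assumed number of ftrace records */

        for (int i = 0; i < nr_recs; i++)
                poke_queue();
        poke_flush();                   /* the final text_poke_finish() */

        printf("%d flushes for %d records\n", nr_flushes, nr_recs);
        return 0;
}

This prints "250 flushes for 64000 records", which is in the same ballpark
as the 250+ intermediate flushes above; a single buffer sized to the whole
update would bring that back down to one flush.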

Would this be the case of trying to use a dynamically allocated buffer?

Thoughts?

-- Daniel

2019-10-02 18:24:06

by Peter Zijlstra

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, Oct 02, 2019 at 06:35:26PM +0200, Daniel Bristot de Oliveira wrote:

> ftrace was already batching the updates, for instance, causing 3 IPIs to enable
> all functions. The text_poke() batching also works. But because of the limited
> buffer [ see the reply to the patch 2/3 ], it is flushing the buffer during the
> operation, causing more IPIs than the previous code. Using the 5.4-rc1 in a VM,
> when enabling the function tracer, I see 250+ intermediate text_poke_finish()
> because of a full buffer...
>
> Would this be the case of trying to use a dynamically allocated buffer?
>
> Thoughts?

Is it a problem? I tried growing the buffer (IIRC I made it 10 times
bigger) and didn't see any performance improvements because of it.

2019-10-03 06:26:22

by Masami Hiramatsu

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, 2 Oct 2019 18:35:26 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> ftrace was already batching the updates, for instance, causing 3 IPIs to enable
> all functions. The text_poke() batching also works. But because of the limited
> buffer [ see the reply to the patch 2/3 ], it is flushing the buffer during the
> operation, causing more IPIs than the previous code. Using the 5.4-rc1 in a VM,
> when enabling the function tracer, I see 250+ intermediate text_poke_finish()
> because of a full buffer...

Would you have any performance numbers of the previous code and applying this?

Thank you,


--
Masami Hiramatsu <[email protected]>

2019-10-04 07:22:11

by Steven Rostedt

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, 2 Oct 2019 20:21:06 +0200
Peter Zijlstra <[email protected]> wrote:

> On Wed, Oct 02, 2019 at 06:35:26PM +0200, Daniel Bristot de Oliveira wrote:
>
> > ftrace was already batching the updates, for instance, causing 3 IPIs to enable
> > all functions. The text_poke() batching also works. But because of the limited
> > buffer [ see the reply to the patch 2/3 ], it is flushing the buffer during the
> > operation, causing more IPIs than the previous code. Using the 5.4-rc1 in a VM,
> > when enabling the function tracer, I see 250+ intermediate text_poke_finish()
> > because of a full buffer...
> >
> > Would this be the case of trying to use a dynamically allocated buffer?
> >
> > Thoughts?
>
> Is it a problem? I tried growing the buffer (IIRC I made it 10 times
> bigger) and didn't see any performance improvements because of it.

I'm just worried if people are going to complain about the IPI burst.
Although, I just tried it out before applying this patch, and there's
still a bit of a burst. Not sure why. I did:

# cat /proc/interrupts > /tmp/before; echo function > /debug/tracing/current_tracer; cat /proc/interrupts > /tmp/after
# cat /proc/interrupts > /tmp/before1; echo nop > /debug/tracing/current_tracer; cat /proc/interrupts > /tmp/after1

Before this patch:

# diff /tmp/before /tmp/after
< CAL: 2342 2347 2116 2175 2446 2030 2416 2222 Function call interrupts
---
> CAL: 2462 2467 2236 2295 2446 2150 2536 2342 Function call interrupts

(Just showing the function call interrupts)

There appear to be 120 IPIs sent to all CPUs for enabling the function tracer.

# diff /tmp/before1 /tmp/after1
< CAL: 2462 2467 2236 2295 2446 2150 2536 2342 Function call interrupts
---
> CAL: 2577 2582 2351 2410 2446 2265 2651 2457 Function call interrupts

And 151 IPIs for disabling it.

After applying this patch:

# diff /tmp/before /tmp/after
< CAL: 66070 46620 59955 59236 68707 63397 61644 62742 Function call interrupts
---
> CAL: 66727 47277 59955 59893 69364 64054 62301 63399 Function call interrupts

# diff /tmp/before1 /tmp/after1
< CAL: 66727 47277 59955 59893 69364 64054 62301 63399 Function call interrupts
---
> CAL: 67358 47938 59985 60554 70025 64715 62962 64060 Function call interrupts


We get 657 IPIs for enabling function tracer, and 661 for disabling it.
Funny how it's more on the disable than the enable with the patch but
the other way without it.

But still, we are going from 120 to 660 IPIs for every CPU. Not saying
it's a problem, but something that we should note. Someone (those that
don't like kernel interference) may complain.

-- Steve

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On 04/10/2019 00:10, Steven Rostedt wrote:
> On Wed, 2 Oct 2019 20:21:06 +0200
> Peter Zijlstra <[email protected]> wrote:
>
>> On Wed, Oct 02, 2019 at 06:35:26PM +0200, Daniel Bristot de Oliveira wrote:
>>
>>> ftrace was already batching the updates, for instance, causing 3 IPIs to enable
>>> all functions. The text_poke() batching also works. But because of the limited
>>> buffer [ see the reply to the patch 2/3 ], it is flushing the buffer during the
>>> operation, causing more IPIs than the previous code. Using the 5.4-rc1 in a VM,
>>> when enabling the function tracer, I see 250+ intermediate text_poke_finish()
>>> because of a full buffer...
>>>
>>> Would this be the case of trying to use a dynamically allocated buffer?
>>>
>>> Thoughts?
>>
>> Is it a problem? I tried growing the buffer (IIRC I made it 10 times
>> bigger) and didn't see any performance improvements because of it.
>
> I'm just worried if people are going to complain about the IPI burst.
> Although, I just tried it out before applying this patch, and there's
> still a bit of a burst. Not sure why. I did:
>
> # cat /proc/interrupts > /tmp/before; echo function > /debug/tracing/current_tracer; cat /proc/interrupts > /tmp/after
> # cat /proc/interrupts > /tmp/before1; echo nop > /debug/tracing/current_tracer; cat /proc/interrupts > /tmp/after1
>
> Before this patch:
>
> # diff /tmp/before /tmp/after
> < CAL: 2342 2347 2116 2175 2446 2030 2416 2222 Function call interrupts
> ---
>> CAL: 2462 2467 2236 2295 2446 2150 2536 2342 Function call interrupts
>
> (Just showing the function call interrupts)
>
> There appears to be 120 IPIs sent to all CPUS for enabling function tracer.
>
> # diff /tmp/before1 /tmp/after1
> < CAL: 2462 2467 2236 2295 2446 2150 2536 2342 Function call interrupts
> ---
>> CAL: 2577 2582 2351 2410 2446 2265 2651 2457 Function call interrupts
>
> And 151 IPIs for disabling it.
>
> After applying this patch:
>
> # diff /tmp/before /tmp/after
> < CAL: 66070 46620 59955 59236 68707 63397 61644 62742 Function call interrupts
> ---
>> CAL: 66727 47277 59955 59893 69364 64054 62301 63399 Function call interrupts
>
> # diff /tmp/before1 /tmp/after1
> < CAL: 66727 47277 59955 59893 69364 64054 62301 63399 Function call interrupts
> ---
>> CAL: 67358 47938 59985 60554 70025 64715 62962 64060 Function call interrupts
>
>
> We get 657 IPIs for enabling function tracer, and 661 for disabling it.
> Funny how it's more on the disable than the enable with the patch but
> the other way without it.
>
> But still, we are going from 120 to 660 IPIs for every CPU. Not saying
> it's a problem, but something that we should note. Someone (those that
> don't like kernel interference) may complain.

That is the point I was raising.

When enabling ftrace, we have three different paths:

1) the enabling/disabling ftrace path
2) the int3 path - if a thread/irq is running a kernel function
3) the IPI - that affects all CPUs, even those that are not "hitting" trace
code, e.g., user-space.

The first one is for sure a cold-path. The second one is a hot-path: any task
running kernel functions will hit it. But IMHO, the hottest one is the IPIs,
because they run on all CPUs, e.g., even on isolated CPUs that are running in
user-space.

Currently, ftrace does:

    for_ftrace_rec:
        Install all breakpoints
    send IPI

    for_ftrace_rec:
        write the end of the instruction
    send IPI

    for_ftrace_rec:
        write the first byte of the instruction
    send IPI

And that is the same thing we do with the batch mode, and so it would be better
to integrate both.

The problem is that, considering patch 2/3, the number of entries we can
batch in text_poke is limited, so we batch in chunks of TP_VEC_MAX
entries. So, rather than doing 3 IPIs to change the code, we do:

(ftrace_rec count/TP_VEC_MAX) * 3 IPIs.
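
With made-up numbers, e.g. ~64000 ftrace_rec entries and a TP_VEC_MAX of 256,
that is:

    64000 / 256 = 250 batches
    250 * 3     = 750 rounds of IPIs to all CPUs, instead of the 3 rounds
                  ftrace does today.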

One possible solution for this would be to allocate a buffer with "number of
ftrace_rec" and use it in the text_poke batch mode.

But to do it, we should keep the old interface (the one that the 2/3 is
changing). Well, at least using a per-use-case buffer.

[ In addition ]

Currently, ftrace_rec entries are ordered inside the group of functions, but
"groups of function" are not ordered. So, the current int3 handler does a (*):

    for_each_group_of_functions:
        check if the ip is in the range   ----> n by the number of groups.
        do a bsearch.                     ----> log(n) by the number of
                                                entries in the group.

If, instead, it uses an ordered vector, the complexity would be log(n) by the
total number of entries, which is better. So, how bad is the idea of:

    in the enabling ftrace code path, it:
        discover the number of entries
        alloc a buffer
        discover the order of the groups
        for each group in the correct order
            queue the entry in the buffer
        apply the changes using the text_poke...

In this way we would optimize the two hot-paths:
    int3 will be log(n)
    IPIs bounded to 3.
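
[ A user-space sketch of the int3-handler side of that idea -- a single
vector sorted by ip, searched with one bsearch(); struct patch_site and all
the addresses here are made up: ]

#include <stdio.h>
#include <stdlib.h>

/* One flat vector of patch sites, kept sorted by ip. */
struct patch_site {
        unsigned long ip;       /* address of the patched instruction */
        /* ... new instruction bytes, length, ... */
};

static int site_cmp(const void *key, const void *elt)
{
        unsigned long ip = *(const unsigned long *)key;
        const struct patch_site *s = elt;

        return (ip > s->ip) - (ip < s->ip);
}

int main(void)
{
        /* Pretend these are all the queued patch sites, already sorted. */
        struct patch_site sites[] = { {0x1000}, {0x1040}, {0x2000}, {0x2480} };
        unsigned long trap_ip = 0x2000; /* ip the int3 handler would see */
        struct patch_site *s;

        /* O(log n) over the total number of entries, no per-group walk. */
        s = bsearch(&trap_ip, sites, sizeof(sites) / sizeof(sites[0]),
                    sizeof(sites[0]), site_cmp);
        printf("0x%lx -> %s\n", trap_ip, s ? "patch site" : "not ours");
        return 0;
}

This is essentially what the poke_int3_handler() in patch 2/3 already does
for its tp_vec, just applied to all the ftrace entries at once.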

I am not saying we need to do it now, as Steve said, not sure if this is a big
problem, but... those that don't like kernel interference may complain. But if
we leave the per-use-case vector in the text_poke_batch interface, things will
be easier to fix.

NOTE: the other IPIs are generated by hooking the tracepoints and switching the
code to RO/RW...

* as far as I understood ftrace_location_range().

-- Daniel


2019-10-04 12:01:38

by Peter Zijlstra

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Thu, Oct 03, 2019 at 06:10:45PM -0400, Steven Rostedt wrote:
> But still, we are going from 120 to 660 IPIs for every CPU. Not saying
> it's a problem, but something that we should note. Someone (those that
> don't like kernel interference) may complain.

It is machine wide function tracing, interference is going to happen..
:-)

Anyway, we can grow the batch size if sufficient benefit can be shown to
exist.

2019-10-04 13:43:42

by Steven Rostedt

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, 4 Oct 2019 13:22:37 +0200
Peter Zijlstra <[email protected]> wrote:

> On Thu, Oct 03, 2019 at 06:10:45PM -0400, Steven Rostedt wrote:
> > But still, we are going from 120 to 660 IPIs for every CPU. Not saying
> > it's a problem, but something that we should note. Someone (those that
> > don't like kernel interference) may complain.
>
> It is machine wide function tracing, interference is going to happen..
> :-)
>
> Anyway, we can grow the batch size if sufficient benefit can be shown to
> exist.

Yeah, I was thinking that we just go with these patches and then fix
the IPI issue when someone starts complaining ;-)

Anyway, is this series ready to go? I can pull them in (I guess I
should get an ack from Thomas or Ingo as they are x86 specific). I'm
currently working on code that affects the same code paths as this
patch, and would like to build my changes on top of this, instead of
monkeying around with major conflicts.

-- Steve

2019-10-04 13:43:53

by Steven Rostedt

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, 4 Oct 2019 10:10:47 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> [ In addition ]
>
> Currently, ftrace_rec entries are ordered inside the group of functions, but
> "groups of function" are not ordered. So, the current int3 handler does a (*):
>
> for_each_group_of_functions:
> check if the ip is in the range ----> n by the number of groups.
> do a bsearch. ----> log(n) by the numbers of entry
> in the group.
>
> If, instead, it uses an ordered vector, the complexity would be log(n) by the
> total number of entries, which is better. So, how bad is the idea of:

BTW, I'm currently rewriting the grouping of the vectors, in order to
shrink the size of each dyn_ftrace_rec (as we discussed at Kernel
Recipes). I can make the groups all sorted in doing so, thus we can
load them sorted if that's needed, without doing anything special.

>
> in the enabling ftrace code path, it:
> discover the number of entries
> alloc a buffer
> discover the order of the groups
> for each group in the correct order
> queue the entry in the buffer
> apply the changes using the text_poke...
>
> In this way we would optimize the two hot-paths:
> int3 will be log(n)
> IPIs bounded to 3.
>
> I am not saying we need to do it now, as Steve said, not sure if this is a big
> problem, but... those that don't like kernel interference may complain. But if
> we leave the per-use-case vector in the text_poke_batch interface, things will
> be easier to fix.
>
> NOTE: the other IPIs are generated by hooking the tracepoints and switching the
> code to RO/RW...

Yeah, I did a trace of who is doing the on_each_cpu() call, and saw it
coming from the RO/RW changes, which this patch series removes.

-- Steve

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On 04/10/2019 15:40, Steven Rostedt wrote:
> On Fri, 4 Oct 2019 10:10:47 +0200
> Daniel Bristot de Oliveira <[email protected]> wrote:
>
>> [ In addition ]
>>
>> Currently, ftrace_rec entries are ordered inside the group of functions, but
>> "groups of function" are not ordered. So, the current int3 handler does a (*):
>>
>> for_each_group_of_functions:
>> check if the ip is in the range ----> n by the number of groups.
>> do a bsearch. ----> log(n) by the numbers of entry
>> in the group.
>>
>> If, instead, it uses an ordered vector, the complexity would be log(n) by the
>> total number of entries, which is better. So, how bad is the idea of:
> BTW, I'm currently rewriting the grouping of the vectors, in order to
> shrink the size of each dyn_ftrace_rec (as we discussed at Kernel
> Recipes). I can make the groups all sorted in doing so, thus we can
> load the sorted if that's needed, without doing anything special.
>

Good! If you make them sorted and store the number of entries in a variable, we
can have things ready for a future "optimized" version.

-- Daniel

2019-10-04 15:16:52

by Steven Rostedt

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, 4 Oct 2019 16:44:35 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> On 04/10/2019 15:40, Steven Rostedt wrote:
> > On Fri, 4 Oct 2019 10:10:47 +0200
> > Daniel Bristot de Oliveira <[email protected]> wrote:
> >
> >> [ In addition ]
> >>
> >> Currently, ftrace_rec entries are ordered inside the group of functions, but
> >> "groups of function" are not ordered. So, the current int3 handler does a (*):
> >>
> >> for_each_group_of_functions:
> >> check if the ip is in the range ----> n by the number of groups.
> >> do a bsearch. ----> log(n) by the numbers of entry
> >> in the group.
> >>
> >> If, instead, it uses an ordered vector, the complexity would be log(n) by the
> >> total number of entries, which is better. So, how bad is the idea of:
> > BTW, I'm currently rewriting the grouping of the vectors, in order to
> > shrink the size of each dyn_ftrace_rec (as we discussed at Kernel
> > Recipes). I can make the groups all sorted in doing so, thus we can
> > load the sorted if that's needed, without doing anything special.
> >
>
> Good! If you make them sorted and store the number of entries in a variable, we
> can have things ready for a future "optimized" version.
>

You mean something like this?

-- Steve


From 9b85c08d796dc953cf9b9331d1aed4c3052b09b9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <[email protected]>
Date: Tue, 1 Oct 2019 14:38:07 -0400
Subject: [PATCH] ftrace: Add information on number of page groups allocated

When looking for ways to shrink the size of the dyn_ftrace structure, knowing
how many pages are allocated and how many groups of those pages there are is
useful in working out the best ways to save on memory.

This adds one info print on how many groups of pages were used to allocate
the ftrace dyn_ftrace structures, and also shows the number of pages and
groups in the dyn_ftrace_total_info (which is used for debugging).

Signed-off-by: Steven Rostedt (VMware) <[email protected]>
---
kernel/trace/ftrace.c | 14 ++++++++++++++
kernel/trace/trace.c | 21 +++++++++++++++------
kernel/trace/trace.h | 2 ++
3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c4cc048eb594..244e2d9083a6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2860,6 +2860,8 @@ static void ftrace_shutdown_sysctl(void)

static u64 ftrace_update_time;
unsigned long ftrace_update_tot_cnt;
+unsigned long ftrace_number_of_pages;
+unsigned long ftrace_number_of_groups;

static inline int ops_traces_mod(struct ftrace_ops *ops)
{
@@ -2984,6 +2986,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count)
goto again;
}

+ ftrace_number_of_pages += 1 << order;
+ ftrace_number_of_groups++;
+
cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
pg->size = cnt;

@@ -3039,6 +3044,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
start_pg = pg->next;
kfree(pg);
pg = start_pg;
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
pr_info("ftrace: FAILED to allocate memory for functions\n");
return NULL;
@@ -5788,6 +5795,8 @@ void ftrace_release_mod(struct module *mod)
free_pages((unsigned long)pg->records, order);
tmp_page = pg->next;
kfree(pg);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
}
}

@@ -6129,6 +6138,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
+ ftrace_number_of_pages -= 1 << order;
+ ftrace_number_of_groups--;
kfree(pg);
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
@@ -6184,6 +6195,9 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);

+ pr_info("ftrace: allocated %ld pages with %ld groups\n",
+ ftrace_number_of_pages, ftrace_number_of_groups);
+
set_ftrace_early_filters();

return;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e917aa783675..bb41a70c5189 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7548,14 +7548,23 @@ static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- unsigned long *p = filp->private_data;
- char buf[64]; /* Not too big for a shallow stack */
+ unsigned long *p = ftrace_update_tot_cnt;
+ ssize_t ret;
+ char *buf;
int r;

- r = scnprintf(buf, 63, "%ld", *p);
- buf[r++] = '\n';
+ r = snprintf(NULL, 0, "%ld pages:%d groups: %d\n");
+ r++;
+ buf = kmalloc(r, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ r = scnprintf(buf, r, "%ld pages:%d groups: %d\n",
+ ftrace_update_tot_cnt,
+ ftrace_number_of_pages,
+ ftrace_number_of_groups);

- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

static const struct file_operations tracing_dyn_info_fops = {
@@ -8747,7 +8756,7 @@ static __init int tracer_init_tracefs(void)

#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
- &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+ NULL, &tracing_dyn_info_fops);
#endif

create_trace_instances(d_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f801d154ff6a..f6f92a6c5cec 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -802,6 +802,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);

#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+extern unsigned long ftrace_number_of_pages;
+extern unsigned long ftrace_number_of_groups;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
--
2.20.1

2019-10-07 08:11:00

by Peter Zijlstra

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, Oct 04, 2019 at 10:10:47AM +0200, Daniel Bristot de Oliveira wrote:
> 1) the enabling/disabling ftrace path
> 2) the int3 path - if a thread/irq is running a kernel function
> 3) the IPI - that affects all CPUs, even those that are not "hitting" trace
> code, e.g., user-space.
>
> The first one is for sure a cold-path. The second one is a hot-path: any task
> running kernel functions will hit it. But IMHO, the hottest one is the IPIs,
> because it will run on all CPUs, e.g., even isolated CPUs that are running in
> user-space.

Well, we can fix that, just like RCU. In fact, I suspect RCU has all the
bits required to cure this.

For NOHZ_FULL CPUs you can forgo the IPI and delay it until the next kernel
entry, just like RCU.
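
[ A toy model of that idea -- none of these names are kernel APIs; CPUs that
are in user space just get a per-CPU flag set and serialize on their next
kernel entry instead of taking the IPI: ]

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

enum ctx { IN_KERNEL, IN_USER };

static enum ctx cpu_ctx[NR_CPUS] = { IN_KERNEL, IN_USER, IN_USER, IN_KERNEL };
static bool sync_pending[NR_CPUS];

static void send_sync_ipi(int cpu)
{
        printf("IPI -> cpu%d (in kernel, must sync now)\n", cpu);
}

static void text_poke_sync(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu_ctx[cpu] == IN_KERNEL)
                        send_sync_ipi(cpu);
                else
                        sync_pending[cpu] = true;  /* defer to kernel entry */
        }
}

static void kernel_entry(int cpu)
{
        cpu_ctx[cpu] = IN_KERNEL;
        if (sync_pending[cpu]) {
                sync_pending[cpu] = false;
                printf("cpu%d serializes on kernel entry instead\n", cpu);
        }
}

int main(void)
{
        text_poke_sync();       /* after patching the text */
        kernel_entry(1);        /* cpu1 eventually enters the kernel */
        return 0;
}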

2019-10-11 07:02:33

by Peter Zijlstra

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, Oct 04, 2019 at 10:10:47AM +0200, Daniel Bristot de Oliveira wrote:
> Currently, ftrace_rec entries are ordered inside the group of functions, but
> "groups of function" are not ordered. So, the current int3 handler does a (*):

We can insert a sort() of the vector right before doing
text_poke_bp_batch() of course...
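
[ Sketch of what that would look like, in user space -- the entries are
reduced to just an address, and qsort() stands in for the kernel's sort(): ]

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Reduced stand-in for struct text_poke_loc: just the patch address. */
struct text_poke_loc {
        void *addr;
};

static int tp_cmp(const void *a, const void *b)
{
        uintptr_t pa = (uintptr_t)((const struct text_poke_loc *)a)->addr;
        uintptr_t pb = (uintptr_t)((const struct text_poke_loc *)b)->addr;

        return (pa > pb) - (pa < pb);
}

int main(void)
{
        /* Entries queued in whatever order ftrace walked its pages. */
        struct text_poke_loc tp_vec[] = {
                { (void *)0x4000 }, { (void *)0x1000 }, { (void *)0x3000 },
        };
        int nr = sizeof(tp_vec) / sizeof(tp_vec[0]);

        /*
         * Sort by address right before handing the vector to the batch,
         * so the int3 handler can bsearch() it.
         */
        qsort(tp_vec, nr, sizeof(tp_vec[0]), tp_cmp);

        for (int i = 0; i < nr; i++)
                printf("%p\n", tp_vec[i].addr);
        return 0;
}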

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On 11/10/2019 09:01, Peter Zijlstra wrote:
> On Fri, Oct 04, 2019 at 10:10:47AM +0200, Daniel Bristot de Oliveira wrote:
>> Currently, ftrace_rec entries are ordered inside the group of functions, but
>> "groups of function" are not ordered. So, the current int3 handler does a (*):
> We can insert a sort() of the vector right before doing
> text_poke_bp_batch() of course...

I agree!

What I tried to do earlier this week was to order the ftrace_pages at
insertion [1], so that, while sequentially reading the pages with
do_for_each_ftrace_rec(), we would already see the "ip"s in order.

As ftrace_pages are inserted only at boot and when a module is loaded, this
would push the ordering to a very, very slow path.

It works! But it is based on the assumption that the addresses of functions in
a module do not intersect with the addresses of other modules or the kernel,
e.g.:

    kernel:            module A:      module B:
    [ 1, 2, 3, 4 ]     [ 7, 8, 9 ]    [ 15, 16, 19 ]

But this does not happen in practice, as I saw things like:

    kernel:            module A:      module B:
    [ 1, 2, 3, 4 ]     [ 7, 8, 18 ]   [ 15, 16, 19 ]
                               ^^ <--- greater than the first of the next

Is this expected?

At this point, I stopped working on it to give my brain some time to think
about a better solution, and also because Steve is reworking ftrace_pages to
save some space. So, it was better to wait.

But, yes, we will need [ as an optimization ] to sort the addresses right before
inserting them in the batch. Still, having the ftrace_pages ordered seems to be
a good thing, as in many cases, the ftrace_pages are disjoint sets.

[1] see ftrace_pages_insert() and ftrace_pages_start.
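
[ Roughly what that ordered insert could look like; struct ftrace_page is
reduced to what the example needs, and ftrace_pages_insert() here is a
stand-in for the experimental helper from [1], not an existing kernel
function: ]

#include <stdio.h>
#include <stdlib.h>

struct ftrace_page {
        struct ftrace_page *next;
        unsigned long first_ip;         /* ip of the first record in the group */
};

static struct ftrace_page *ftrace_pages_start;

/* Insert a page group sorted by its first ip, so a sequential walk
 * already sees the addresses in order. */
static void ftrace_pages_insert(struct ftrace_page *pg)
{
        struct ftrace_page **p = &ftrace_pages_start;

        while (*p && (*p)->first_ip < pg->first_ip)
                p = &(*p)->next;
        pg->next = *p;
        *p = pg;
}

int main(void)
{
        unsigned long ips[] = { 0x3000, 0x1000, 0x2000 };  /* e.g. kernel + two modules */
        struct ftrace_page *pg;

        for (int i = 0; i < 3; i++) {
                pg = calloc(1, sizeof(*pg));
                pg->first_ip = ips[i];
                ftrace_pages_insert(pg);
        }
        for (pg = ftrace_pages_start; pg; pg = pg->next)
                printf("group starting at 0x%lx\n", pg->first_ip);
        return 0;
}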

-- Daniel

2019-10-11 11:01:06

by Peter Zijlstra

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, Oct 11, 2019 at 09:37:10AM +0200, Daniel Bristot de Oliveira wrote:
> On 11/10/2019 09:01, Peter Zijlstra wrote:
> > On Fri, Oct 04, 2019 at 10:10:47AM +0200, Daniel Bristot de Oliveira wrote:
> >> Currently, ftrace_rec entries are ordered inside the group of functions, but
> >> "groups of function" are not ordered. So, the current int3 handler does a (*):
> > We can insert a sort() of the vector right before doing
> > text_poke_bp_batch() of course...
>
> I agree!
>
> What I tried to do earlier this week was to order the ftrace_pages in the
> insertion [1], and so, while sequentially reading the pages with
> do_for_each_ftrace_rec() we would already see the "ip"s in order.
>
> As ftrace_pages are inserted only at boot and during a load of a module, this
> would push the ordering for a very very slow path.
>
> It works! But under the assumption that the address of functions in a module
> does not intersect with the address of other modules/kernel, e.g.:
>
> kernel: module A: module B:
> [ 1, 2, 3, 4 ] [ 7, 8, 9 ] [ 15, 16, 19 ]
>
> But this does not happen in practice, as I saw things like:
>
> kernel: module A: module B:
> [ 1, 2, 3, 4 ] [ 7, 8, 18 ] [ 15, 16, 19 ]
> ^^ <--- greater than the first of the next
>
> Is this expected?

Expected is I think a big word. That is, I would certainly not expect
that, but I think I can see how it can happen.

Suppose that init sections are placed at the end (didn't verify, but
makes sense), then suppose that A's 18 is in an init section, which gets
freed once the module load is completed. We then proceed to load B,
which then overlaps with what used to be A's init.

If this were the case, then while we could retain the patch location for
A's 18, we would never actually modify it, because it's gotten marked
freed or something (that's how jump_labels work, I didn't check what
ftrace actually does here).

So while the above contains some assumptions, it is a semi-plausible
explanation for what you've described.

2019-10-11 13:14:49

by Steven Rostedt

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, 11 Oct 2019 09:37:10 +0200
Daniel Bristot de Oliveira <[email protected]> wrote:

> But, yes, we will need [ as an optimization ] to sort the address right before
> inserting them in the batch. Still, having the ftrace_pages ordered seems to be
> a good thing, as in many cases, the ftrace_pages are disjoint sets.

I think it would be fine if we run the batches according to the ftrace
page groups, which are guaranteed to be sorted. I try to keep the
groups to a minimum, so there shouldn't be too many IPI bursts.

Although, my new work may actually make more page groups by trying to
cut down on the dyn_ftrace size by using offsets. If a function extends
outside the offset range, a new page group will need to be created.
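
[ For the sake of illustration, one way such an offset encoding could look --
sizes and field names are made up, not the actual rework: ]

#include <stdio.h>
#include <stdint.h>

/* Roughly today's layout on x86-64: a full pointer-sized ip per record. */
struct dyn_ftrace_now {
        unsigned long ip;
        unsigned long flags;
};

/*
 * Offset idea: each record stores a 32-bit offset from a per-group base,
 * so ip == group->base + rec->offset.  A function more than 4GB past the
 * base can't be encoded and has to start a new page group.
 */
struct ftrace_group {
        unsigned long base;             /* first ip covered by this group */
        /* ... the records[] for this group ... */
};

struct dyn_ftrace_small {
        uint32_t offset;
        uint32_t flags;
};

int main(void)
{
        printf("per-record size: %zu -> %zu bytes\n",
               sizeof(struct dyn_ftrace_now),
               sizeof(struct dyn_ftrace_small));
        return 0;
}

So each record can shrink, at the cost of more groups when functions are
spread far apart, which is the trade-off described above.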

-- Steve

2019-10-22 00:40:36

by Alexei Starovoitov

Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Fri, Oct 4, 2019 at 6:45 AM Steven Rostedt <[email protected]> wrote:
>
> On Fri, 4 Oct 2019 13:22:37 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, Oct 03, 2019 at 06:10:45PM -0400, Steven Rostedt wrote:
> > > But still, we are going from 120 to 660 IPIs for every CPU. Not saying
> > > it's a problem, but something that we should note. Someone (those that
> > > don't like kernel interference) may complain.
> >
> > It is machine wide function tracing, interference is going to happen..
> > :-)
> >
> > Anyway, we can grow the batch size if sufficient benefit can be shown to
> > exist.
>
> Yeah, I was thinking that we just go with these patches and then fix
> the IPI issue when someone starts complaining ;-)
>
> Anyway, is this series ready to go? I can pull them in (I guess I
> should get an ack from Thomas or Ingo as they are x86 specific). I'm
> currently working on code that affects the same code paths as this
> patch, and would like to build my changes on top of this, instead of
> monkeying around with major conflicts.

What is the status of this set ?
Steven, did you apply it ?

2019-10-22 00:46:18

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, 21 Oct 2019 17:36:54 -0700
Alexei Starovoitov <[email protected]> wrote:


> What is the status of this set ?
> Steven, did you apply it ?

There's still bugs to figure out.

-- Steve

2019-10-22 03:19:35

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, 21 Oct 2019 20:10:09 -0700
Alexei Starovoitov <[email protected]> wrote:

> On Mon, Oct 21, 2019 at 5:43 PM Steven Rostedt <[email protected]> wrote:
> >
> > On Mon, 21 Oct 2019 17:36:54 -0700
> > Alexei Starovoitov <[email protected]> wrote:
> >
> >
> > > What is the status of this set ?
> > > Steven, did you apply it ?
> >
> > There's still bugs to figure out.
>
> what bugs you're seeing?
> The IPI frequency that was mentioned in this thread or something else?
> I'm hacking ftrace+bpf stuff in the same spot and would like to
> base my work on the latest and greatest.

There's real bugs (crashes), talked about in later versions of the
patch set:

https://lore.kernel.org/r/[email protected]

And there was another crash on Peter's latest I just reported:

https://lore.kernel.org/r/[email protected]

-- Steve

2019-10-22 03:21:58

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, 21 Oct 2019 23:16:30 -0400
Steven Rostedt <[email protected]> wrote:

> > what bugs you're seeing?
> > The IPI frequency that was mentioned in this thread or something else?
> > I'm hacking ftrace+bpf stuff in the same spot and would like to
> > base my work on the latest and greatest.

I'm also going to be touching some of this code too, as I'm waiting for
Peter's code to settle down. What are you touching? I'm going to be
working on making the dyn_ftrace records smaller, and this is going to
change the way the iterations work on modifying the code.

This is what I discussed on stage at Kernel Recipes with Willy Tarreau.

-- Steve

2019-10-22 03:22:38

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, Oct 21, 2019 at 5:43 PM Steven Rostedt <[email protected]> wrote:
>
> On Mon, 21 Oct 2019 17:36:54 -0700
> Alexei Starovoitov <[email protected]> wrote:
>
>
> > What is the status of this set ?
> > Steven, did you apply it ?
>
> There's still bugs to figure out.

what bugs you're seeing?
The IPI frequency that was mentioned in this thread or something else?
I'm hacking ftrace+bpf stuff in the same spot and would like to
base my work on the latest and greatest.

2019-10-22 03:59:05

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, Oct 21, 2019 at 11:16:30PM -0400, Steven Rostedt wrote:
> On Mon, 21 Oct 2019 20:10:09 -0700
> Alexei Starovoitov <[email protected]> wrote:
>
> > On Mon, Oct 21, 2019 at 5:43 PM Steven Rostedt <[email protected]> wrote:
> > >
> > > On Mon, 21 Oct 2019 17:36:54 -0700
> > > Alexei Starovoitov <[email protected]> wrote:
> > >
> > >
> > > > What is the status of this set ?
> > > > Steven, did you apply it ?
> > >
> > > There's still bugs to figure out.
> >
> > what bugs you're seeing?
> > The IPI frequency that was mentioned in this thread or something else?
> > I'm hacking ftrace+bpf stuff in the same spot and would like to
> > base my work on the latest and greatest.
>
> There's real bugs (crashes), talked about in later versions of the
> patch set:
>
> https://lore.kernel.org/r/[email protected]
>
> And there was another crash on Peter's latest I just reported:
>
> https://lore.kernel.org/r/[email protected]

ahh. I missed the whole thread. Thanks for pointers.

2019-10-22 04:09:47

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, Oct 21, 2019 at 11:19:04PM -0400, Steven Rostedt wrote:
> On Mon, 21 Oct 2019 23:16:30 -0400
> Steven Rostedt <[email protected]> wrote:
>
> > > what bugs you're seeing?
> > > The IPI frequency that was mentioned in this thread or something else?
> > > I'm hacking ftrace+bpf stuff in the same spot and would like to
> > > base my work on the latest and greatest.
>
> I'm also going to be touching some of this code too, as I'm waiting for
> Peter's code to settle down. What are you touching? I'm going to be
> working on making the dyn_ftrace records smaller, and this is going to
> change the way the iterations work on modifying the code.

I'm not touching dyn_ftrace.
Actually calling my stuff ftrace+bpf is probably not correct either.
I'm reusing code patching of nop into call that ftrace does. That's it.
Turned out I cannot use 99% of ftrace facilities.
ftrace_caller, ftrace_call, ftrace_ops_list_func and the whole ftrace api
with ip, parent_ip and pt_regs cannot be used for this part of the work.
bpf prog needs to access raw function arguments. To achieve that I'm
generating code on the fly. Just like bpf jits do.
As soon as I have something reviewable I'll share it.
That's the stuff I mentioned to you at KR.
First nop of a function will be replaced with a call into bpf.
Very similar to what existing kprobe+bpf does, but faster and safer.
Second part of calling real ftrace from bpf is on todo list.

2019-10-22 13:22:14

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Mon, 21 Oct 2019 21:05:33 -0700
Alexei Starovoitov <[email protected]> wrote:

> On Mon, Oct 21, 2019 at 11:19:04PM -0400, Steven Rostedt wrote:
> > On Mon, 21 Oct 2019 23:16:30 -0400
> > Steven Rostedt <[email protected]> wrote:
> >
> > > > what bugs you're seeing?
> > > > The IPI frequency that was mentioned in this thread or something else?
> > > > I'm hacking ftrace+bpf stuff in the same spot and would like to
> > > > base my work on the latest and greatest.
> >
> > I'm also going to be touching some of this code too, as I'm waiting for
> > Peter's code to settle down. What are you touching? I'm going to be
> > working on making the dyn_ftrace records smaller, and this is going to
> > change the way the iterations work on modifying the code.
>
> I'm not touching dyn_ftrace.
> Actually calling my stuff ftrace+bpf is probably not correct either.
> I'm reusing code patching of nop into call that ftrace does. That's it.
> Turned out I cannot use 99% of ftrace facilities.
> ftrace_caller, ftrace_call, ftrace_ops_list_func and the whole ftrace api
> with ip, parent_ip and pt_regs cannot be used for this part of the work.
> bpf prog needs to access raw function arguments. To achieve that I'm

You can do that today with the ftrace facility, just like live patching
does. You register a ftrace_ops with the flag FTRACE_OPS_FL_IPMODIFY,
and your func will set the regs->ip to your bpf handler. When the
ftrace_ops->func returns, instead of going back to the called
function, it can jump to your bpf_handler. You can create a shadow stack
(like function graph tracer does) to save the return address for where
your bpf handler needs to return to. As your bpf_handler needs raw
access to the parameters, it may not even need the shadow stack because
it should know the function it is reading the parameters from.

If you still want the return address without the shadow stack, it
wouldn't be hard to add another ftrace_ops flag, that allows the
ftrace_ops to inject a return address to simulate a call function
before it jumps to the modified IP.

> generating code on the fly. Just like bpf jits do.
> As soon as I have something reviewable I'll share it.
> That's the stuff I mentioned to you at KR.
> First nop of a function will be replaced with a call into bpf.
> Very similar to what existing kprobe+bpf does, but faster and safer.
> Second part of calling real ftrace from bpf is on todo list.

Having two different infrastructures modifying the first nop is going
to cause a huge nightmare to maintain. This is why live patching didn't
do it. I strongly suggested that you not have bpf build its own
infrastructure.

-- Steve

2019-10-22 14:06:32

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 07:19:56 -0400
Steven Rostedt <[email protected]> wrote:

> > I'm not touching dyn_ftrace.
> > Actually calling my stuff ftrace+bpf is probably not correct either.
> > I'm reusing code patching of nop into call that ftrace does. That's it.
> > Turned out I cannot use 99% of ftrace facilities.
> > ftrace_caller, ftrace_call, ftrace_ops_list_func and the whole ftrace api
> > with ip, parent_ip and pt_regs cannot be used for this part of the work.
> > bpf prog needs to access raw function arguments. To achieve that I'm
>
> You can do that today with the ftrace facility, just like live patching
> does. You register a ftrace_ops with the flag FTRACE_OPS_FL_IPMODIFY,
> and your func will set the regs->ip to your bpf handler. When the
> ftrace_ops->func returns, instead of going back to the called
> function, it can jump to your bpf_handler. You can create a shadow stack
> (like function graph tracer does) to save the return address for where
> you bpf handler needs to return to. As your bpf_handler needs raw
> access to the parameters, it may not even need the shadow stack because
> it should know the function it is reading the parameters from.

To show just how easy this is, I wrote up a quick hack that hijacks the
wake_up_process() function and adds a trace_printk() to see what was
woken up. My output from the trace is this:

<idle>-0 [007] ..s1 68.517276: my_wake_up: We are waking up rcu_preempt:10
<...>-1240 [001] .... 68.517727: my_wake_up: We are waking up kthreadd:2
<...>-1240 [001] d..1 68.517973: my_wake_up: We are waking up kworker/1:0:17
bash-1188 [003] d..2 68.519020: my_wake_up: We are waking up kworker/u16:3:140
bash-1188 [003] d..2 68.519138: my_wake_up: We are waking up kworker/u16:3:140
sshd-1187 [005] d.s2 68.519295: my_wake_up: We are waking up kworker/5:2:517
<idle>-0 [007] ..s1 68.522293: my_wake_up: We are waking up rcu_preempt:10
<idle>-0 [007] ..s1 68.526309: my_wake_up: We are waking up rcu_preempt:10

I added the code to the trace-event-sample.c sample module, and got the
above when I loaded that module (modprobe trace-event-sample).

It's mostly non-arch-specific (that is, you can do this with any
arch that supports the IPMODIFY flag). The only part that would need
arch-specific code is the regs->ip compare. The pip check can also be
done less "hacky". But this shows you how easy this can be done today.
Not sure what is missing that you need.

Here's the patch:

diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 1a72b7d95cdc..526a6098c811 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -11,6 +11,41 @@
#define CREATE_TRACE_POINTS
#include "trace-events-sample.h"

+#include <linux/ftrace.h>
+
+int wake_up_process(struct task_struct *p);
+
+int x;
+
+static int my_wake_up(struct task_struct *p)
+{
+ int ret;
+
+ trace_printk("We are waking up %s:%d\n", p->comm, p->pid);
+ ret = wake_up_process(p);
+ /* Force not having a tail call */
+ if (!x)
+ return ret;
+ return 0;
+}
+
+static void my_hijack_func(unsigned long ip, unsigned long pip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ unsigned long this_func = (unsigned long)my_wake_up;
+
+ if (pip >= this_func && pip <= this_func + 0x10000)
+ return;
+
+ regs->ip = (unsigned long)my_wake_up;
+}
+
+static struct ftrace_ops my_ops = {
+ .func = my_hijack_func,
+ .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_SAVE_REGS,
+};
+
static const char *random_strings[] = {
"Mother Goose",
"Snoopy",
@@ -115,6 +150,11 @@ void foo_bar_unreg(void)

static int __init trace_event_init(void)
{
+ int ret;
+
+ ret = ftrace_set_filter_ip(&my_ops, (unsigned long)wake_up_process, 0, 0);
+ if (!ret)
+ register_ftrace_function(&my_ops);
simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
if (IS_ERR(simple_tsk))
return -1;
@@ -124,6 +164,7 @@ static int __init trace_event_init(void)

static void __exit trace_event_exit(void)
{
+ unregister_ftrace_function(&my_ops);
kthread_stop(simple_tsk);
mutex_lock(&thread_mutex);
if (simple_tsk_fn)


-- Steve

2019-10-22 17:53:22

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 09:44:55AM -0400, Steven Rostedt wrote:
> On Tue, 22 Oct 2019 07:19:56 -0400
> Steven Rostedt <[email protected]> wrote:
>
> > > I'm not touching dyn_ftrace.
> > > Actually calling my stuff ftrace+bpf is probably not correct either.
> > > I'm reusing code patching of nop into call that ftrace does. That's it.
> > > Turned out I cannot use 99% of ftrace facilities.
> > > ftrace_caller, ftrace_call, ftrace_ops_list_func and the whole ftrace api
> > > with ip, parent_ip and pt_regs cannot be used for this part of the work.
> > > bpf prog needs to access raw function arguments. To achieve that I'm
> >
> > You can do that today with the ftrace facility, just like live patching
> > does. You register a ftrace_ops with the flag FTRACE_OPS_FL_IPMODIFY,
> > and your func will set the regs->ip to your bpf handler. When the
> > ftrace_ops->func returns, instead of going back to the called
> > function, it can jump to your bpf_handler. You can create a shadow stack
> > (like function graph tracer does) to save the return address for where
> > you bpf handler needs to return to. As your bpf_handler needs raw
> > access to the parameters, it may not even need the shadow stack because
> > it should know the function it is reading the parameters from.
>
> To show just how easy this is, I wrote up a quick hack that hijacks the
> wake_up_process() function and adds a trace_printk() to see what was
> woken up. My output from the trace is this:
>
> <idle>-0 [007] ..s1 68.517276: my_wake_up: We are waking up rcu_preempt:10
> <...>-1240 [001] .... 68.517727: my_wake_up: We are waking up kthreadd:2
> <...>-1240 [001] d..1 68.517973: my_wake_up: We are waking up kworker/1:0:17
> bash-1188 [003] d..2 68.519020: my_wake_up: We are waking up kworker/u16:3:140
> bash-1188 [003] d..2 68.519138: my_wake_up: We are waking up kworker/u16:3:140
> sshd-1187 [005] d.s2 68.519295: my_wake_up: We are waking up kworker/5:2:517
> <idle>-0 [007] ..s1 68.522293: my_wake_up: We are waking up rcu_preempt:10
> <idle>-0 [007] ..s1 68.526309: my_wake_up: We are waking up rcu_preempt:10
>
> I added the code to the trace-event-sample.c sample module, and got the
> above when I loaded that module (modprobe trace-event-sample).
>
> It's mostly non arch specific (that is, you can do this with any
> arch that supports the IPMODIFY flag). The only parts that would need
> arch specific code is the regs->ip compare. The pip check can also be
> done less "hacky". But this shows you how easy this can be done today.
> Not sure what is missing that you need.

yes. I've studied all these possibilites, tried a bunch ways to extend
and reuse ftrace_ops, but every time found something fundamental to
the existing ftrace design that doesn't work for bpf usage.
See below:

> Here's the patch:
>
> diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
> index 1a72b7d95cdc..526a6098c811 100644
> --- a/samples/trace_events/trace-events-sample.c
> +++ b/samples/trace_events/trace-events-sample.c
> @@ -11,6 +11,41 @@
> #define CREATE_TRACE_POINTS
> #include "trace-events-sample.h"
>
> +#include <linux/ftrace.h>
> +
> +int wake_up_process(struct task_struct *p);
> +
> +int x;
> +
> +static int my_wake_up(struct task_struct *p)
> +{
> + int ret;
> +
> + trace_printk("We are waking up %s:%d\n", p->comm, p->pid);
> + ret = wake_up_process(p);
> + /* Force not having a tail call */
> + if (!x)
> + return ret;
> + return 0;
> +}
> +
> +static void my_hijack_func(unsigned long ip, unsigned long pip,
> + struct ftrace_ops *ops, struct pt_regs *regs)

1.
To pass regs into the callback ftrace_regs_caller() has huge amount
of stores to do. Saving selector registers is not fast. pushf/popf are even slower.
~200 bytes of stack is being used for save/restore.
This is pure overhead that bpf execution cannot afford.
bpf is different from traditional ftrace and other tracing, since
it's often active 24/7. Every nanosecond counts.
So for bpf I'm generating assembler trampoline tailored to specific kernel
function that has its fentry nop replaced with a call to that trampoline.
Instead of 20+ register save I save only arguments of that kernel function.
For example the trampoline that attaches to kfree_skb() will save only two registers
(rbp and rdi on x86) and will use 16 bytes of stack.

2.
The common ftrace callback api allows ftrace infra to use generic ftrace_ops_list_func()
that works for all ftracers, but it doesn't scale.
We see different teams writing bpf services that attach to the same function.
In addition there are 30+ kprobes active in other places, so for every
fentry the ftrace_ops_list_func() walks long linked list and does hash
lookup for each. That's not efficient and we see this slowdown in practice.
Because there is a unique trampoline for every kernel function, a single
generic list caller is not possible.
Instead, the generated unique trampoline handles all attached bpf programs
for this particular kernel function in a sequence of calls.
No link lists to walk, no hash tables to lookup.
All overhead is gone.

3.
The existing kprobe bpf progs are using pt_regs to read arguments. Due to
that ugliness all of them are written for single architecture (x86-64).
Porting them to arm64 is not that hard, but porting to 32-bit arch is close
to impossible. With custom generated trampoline we'll have bpf progs that
work as-is on all archs. raw_tracepoint bpf progs already demonstrated
that such portability is possible. This new kprobe++ progs will be similar.

4.
Due to uniqueness of bpf trampoline sharing trampoline between ftracers
and bpf progs is not possible, so users would have to choose whether to
ftrace that particular kernel function or attach bpf to it.
Attach is not going to stomp on each other. I'm reusing ftrace_make_call/nop
approach that checks that it's a 'nop' being replaced.

There probably will be some gotchas and unforeseen issues, since prototype
is very rough and not in reviewable form yet. Will share when it's ready.

2019-10-22 18:13:17

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 10:50:56 -0700
Alexei Starovoitov <[email protected]> wrote:

> > +static void my_hijack_func(unsigned long ip, unsigned long pip,
> > + struct ftrace_ops *ops, struct pt_regs *regs)
>
> 1.
> To pass regs into the callback ftrace_regs_caller() has huge amount
> of stores to do. Saving selector registers is not fast. pushf/popf are even slower.
> ~200 bytes of stack is being used for save/restore.
> This is pure overhead that bpf execution cannot afford.
> bpf is different from traditional ftrace and other tracing, since
> it's often active 24/7. Every nanosecond counts.

Live patching is the same as what you have. If not even more critical.

Note, it would be easy to also implement a "just give me IP regs" flag,
or have that be the default if IPMODIFY is set and REGS is not.


> So for bpf I'm generating assembler trampoline tailored to specific kernel
> function that has its fentry nop replaced with a call to that trampoline.
> Instead of 20+ register save I save only arguments of that kernel function.
> For example the trampoline that attaches to kfree_skb() will save only two registers
> (rbp and rdi on x86) and will use 16 bytes of stack.
>
> 2.
> The common ftrace callback api allows ftrace infra to use generic ftrace_ops_list_func()
> that works for all ftracers, but it doesn't scale.

That only happens if you have more than one callback to the same
function. Otherwise you get a dedicated trampoline.


> We see different teams writing bpf services that attach to the same function.
> In addition there are 30+ kprobes active in other places, so for every
> fentry the ftrace_ops_list_func() walks long linked list and does hash
> lookup for each. That's not efficient and we see this slowdown in practice.
> Because of unique trampoline for every kernel function single
> generic list caller is not possible.
> Instead generated unique trampoline handles all attached bpf program
> for this particular kernel function in a sequence of calls.

Why not have a single ftrace_ops() that calls this utility and do the
multiplexing then?

> No link lists to walk, no hash tables to lookup.
> All overhead is gone.
>
> 3.
> The existing kprobe bpf progs are using pt_regs to read arguments. Due to

That was done because kprobes in general work off of int3. And the
saving of pt_regs was to reuse the code and allow kprobes to work both
with or without a ftrace helper.

> that ugliness all of them are written for single architecture (x86-64).
> Porting them to arm64 is not that hard, but porting to 32-bit arch is close
> to impossible. With custom generated trampoline we'll have bpf progs that
> work as-is on all archs. raw_tracepoint bpf progs already demonstrated
> that such portability is possible. This new kprobe++ progs will be similar.
>
> 4.
> Due to uniqueness of bpf trampoline sharing trampoline between ftracers
> and bpf progs is not possible, so users would have to choose whether to
> ftrace that particular kernel function or attach bpf to it.
> Attach is not going to stomp on each other. I'm reusing ftrace_make_call/nop
> approach that checks that its a 'nop' being replaced.

What about the approach I showed here? Just register a ftrace_ops with
ip modify set, and then call you unique trampoline directly.

It would keep the modification all in one place instead of having
multiple implementations of it. We can make ftrace call your trampoline
just like it was called directly, without writing a whole new
infrastructure.
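
For illustration, a minimal sketch of that shape (the dispatch struct
and lookup helper are hypothetical, only the ftrace_ops callback
signature and flags are real, and how control eventually gets back
into the traced function is glossed over):

#include <linux/ftrace.h>

struct bpf_dispatch {
        unsigned long trampoline;       /* generated per-function trampoline */
};

/* hypothetical: map a traced function's ip to its dispatch data */
static struct bpf_dispatch *bpf_dispatch_lookup(unsigned long ip)
{
        return NULL;                    /* placeholder */
}

static void bpf_mux_func(unsigned long ip, unsigned long parent_ip,
                         struct ftrace_ops *ops, struct pt_regs *regs)
{
        struct bpf_dispatch *d = bpf_dispatch_lookup(ip);

        if (d)
                regs->ip = d->trampoline;       /* hand control to the trampoline */
}

static struct ftrace_ops bpf_mux_ops = {
        .func   = bpf_mux_func,
        .flags  = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_SAVE_REGS,
};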

-- Steve


>
> There probably will be some gotchas and unforeseen issues, since prototype
> is very rough and not in reviewable form yet. Will share when it's ready.

2019-10-22 23:57:21

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 13:46:23 -0700
Alexei Starovoitov <[email protected]> wrote:

> On Tue, Oct 22, 2019 at 02:10:21PM -0400, Steven Rostedt wrote:
> > On Tue, 22 Oct 2019 10:50:56 -0700
> > Alexei Starovoitov <[email protected]> wrote:
> >
> > > > +static void my_hijack_func(unsigned long ip, unsigned long pip,
> > > > + struct ftrace_ops *ops, struct pt_regs *regs)
> > >
> > > 1.
> > > To pass regs into the callback ftrace_regs_caller() has huge amount
> > > of stores to do. Saving selector registers is not fast. pushf/popf are even slower.
> > > ~200 bytes of stack is being used for save/restore.
> > > This is pure overhead that bpf execution cannot afford.
> > > bpf is different from traditional ftrace and other tracing, since
> > > it's often active 24/7. Every nanosecond counts.
> >
> > Live patching is the same as what you have. If not even more critical.
> >
> > Note, it would be easy to also implement a "just give me IP regs" flag,
> > or have that be the default if IPMODIFY is set and REGS is not.
>
> And that will reduce overhead from 20+ regs save/restore into 3-4 ?

Huge difference.

> Say, it's only two regs (rbp and rip). Why bpf side has to pay this runtime
> penalty? I see no good technical reason.

Because we have existing infrastructure, and we don't need to rewrite
the world from scratch. Let's try this out first, and then you show me
how much of an overhead this is to prove that bpf deserves to create a
new infrastructure.

>
> >
> > > So for bpf I'm generating assembler trampoline tailored to specific kernel
> > > function that has its fentry nop replaced with a call to that trampoline.
> > > Instead of 20+ register save I save only arguments of that kernel function.
> > > For example the trampoline that attaches to kfree_skb() will save only two registers
> > > (rbp and rdi on x86) and will use 16 bytes of stack.
> > >
> > > 2.
> > > The common ftrace callback api allows ftrace infra to use generic ftrace_ops_list_func()
> > > that works for all ftracers, but it doesn't scale.
> >
> > That only happens if you have more than one callback to a same
> > function. Otherwise you get a dedicated trampoline.
>
> That's exactly what I tried to explain below. We have production use case
> with 2 kprobes (backed by ftrace) at the same function.
> fwiw the function is tcp_retransmit_skb.

As I state below, you can merge it into a single ftrace_ops.


>
> > > We see different teams writing bpf services that attach to the same function.
> > > In addition there are 30+ kprobes active in other places, so for every
> > > fentry the ftrace_ops_list_func() walks long linked list and does hash
> > > lookup for each. That's not efficient and we see this slowdown in practice.
> > > Because of unique trampoline for every kernel function single
> > > generic list caller is not possible.
> > > Instead generated unique trampoline handles all attached bpf program
> > > for this particular kernel function in a sequence of calls.
> >
> > Why not have a single ftrace_ops() that calls this utility and do the
> > multiplexing then?
>
> because it's not an acceptable overhead.

Prove it! (with the non regs case)

>
> > > No link lists to walk, no hash tables to lookup.
> > > All overhead is gone.
> > >
> > > 3.
> > > The existing kprobe bpf progs are using pt_regs to read arguments. Due to
> >
> > That was done because kprobes in general work off of int3. And the
> > saving of pt_regs was to reuse the code and allow kprobes to work both
> > with or without a ftrace helper.
>
> sure. that makes sense. That's why ftrace-based kprobes are the best
> and fastest kernel infra today for bpf to attach to.
> But after using it all in prod we see that it's too slow for ever growing
> demands which are bpf specific. All the optimizations that went
> into kprobe handling do help. No doubt about that. But here I'm talking
> about removing _all_ overhead. Not just making kprobe 2-3 times faster.

You said yourself that the reg saving overhead is too much. The
pushf/popf is hard too. And I agree that that is unacceptable overhead.
That's why we have two function callers: One with and one without
saving regs. Because I knew that was expensive and we didn't need to
cause everyone to suffer just to have kprobes work with ftrace.

I gave a solution for this. And that is to add another flag to allow
for just the minimum to change the ip. And we can even add another flag
to allow for changing the stack if needed (to emulate a call with the
same parameters).

The world is not just bpf. Does Facebook now own the Linux kernel?

By doing this work, live kernel patching will also benefit. Because it
is also dealing with the unnecessary overhead of saving regs.

And we could possibly even have kprobes benefit from this if a kprobe
doesn't need full regs.

>
> > > that ugliness all of them are written for single architecture (x86-64).
> > > Porting them to arm64 is not that hard, but porting to 32-bit arch is close
> > > to impossible. With custom generated trampoline we'll have bpf progs that
> > > work as-is on all archs. raw_tracepoint bpf progs already demonstrated
> > > that such portability is possible. This new kprobe++ progs will be similar.
> > >
> > > 4.
> > > Due to uniqueness of bpf trampoline sharing trampoline between ftracers
> > > and bpf progs is not possible, so users would have to choose whether to
> > > ftrace that particular kernel function or attach bpf to it.
> > > Attach is not going to stomp on each other. I'm reusing ftrace_make_call/nop
> > > approach that checks that its a 'nop' being replaced.
> >
> > What about the approach I showed here? Just register a ftrace_ops with
> > ip modify set, and then call you unique trampoline directly.
>
> It's 100% unnecessary overhead.

Prove it!

The only benchmarks you are dealing with use kprobes, which are
known to add overhead. But you are not even considering using
something that can handle it without the full regs.

I could probably whip up a POC patch set to get this working in a
day or two.

>
> > It would keep the modification all in one place instead of having
> > multiple implementations of it. We can make ftrace call your trampoline
> > just like it was called directly, without writing a whole new
> > infrastructure.
>
> That is not true at all.
> I haven't posted the code yet, but you're already arguing about
> hypothetical code duplication.
> There is none so far. I'm not reinventing ftrace.
> There will be no FTRACE_OPS_FL_* flags, no ftrace_ops equivalent.
> None of it applies.
> Since Peter is replacing ftrace specific nop->int3->call patching
> with text_poke() I can just use that.


But you said that you can't have this and trace the functions at the
same time. Which also means you can't do live kernel patching on these
functions either.

-- Steve

2019-10-23 00:04:35

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
>
> I gave a solution for this. And that is to add another flag to allow
> for just the minimum to change the ip. And we can even add another flag
> to allow for changing the stack if needed (to emulate a call with the
> same parameters).

your solution is to reduce the overhead.
my solution is to remove it completely. See the difference?

> By doing this work, live kernel patching will also benefit. Because it
> is also dealing with the unnecessary overhead of saving regs.
>
> And we could possibly even have kprobes benefit from this if a kprobe
> doesn't need full regs.

Neither of those two statements is true. The per-function generated trampoline
I'm talking about is bpf specific. For a function with two arguments it's just:
push rbp
mov rbp, rsp
push rdi
push rsi
lea rdi,[rbp-0x10]
call jited_bpf_prog
pop rsi
pop rdi
leave
ret

fentry's nop is replaced with call to the above.
That's it.
kprobe and live patching has no use out of it.

> But you said that you can't have this and trace the functions at the
> same time. Which also means you can't do live kernel patching on these
> functions either.

I don't think it's a real use case, but to avoid further arguing
I'll add one nop to the front of generated bpf trampoline so that
ftrace and livepatch can use it.

2019-10-23 00:09:47

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 14:58:43 -0700
Alexei Starovoitov <[email protected]> wrote:

> On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
> >
> > I gave a solution for this. And that is to add another flag to allow
> > for just the minimum to change the ip. And we can even add another flag
> > to allow for changing the stack if needed (to emulate a call with the
> > same parameters).
>
> your solution is to reduce the overhead.
> my solution is to remove it competely. See the difference?

You're just trimming it down. I'm curious what overhead you save by
not saving all parameter registers and handling it on a case-by-case basis?

>
> > By doing this work, live kernel patching will also benefit. Because it
> > is also dealing with the unnecessary overhead of saving regs.
> >
> > And we could possibly even have kprobes benefit from this if a kprobe
> > doesn't need full regs.
>
> Neither of two statements are true. The per-function generated trampoline
> I'm talking about is bpf specific. For a function with two arguments it's just:
> push rbp
> mov rbp, rsp
> push rdi
> push rsi
> lea rdi,[rbp-0x10]
> call jited_bpf_prog

What exactly does the jited_bpf_prog do? Does it modify context?
or is it for monitoring only.

Do only GPL BPF programs get this access?

> pop rsi
> pop rdi
> leave
> ret
>
> fentry's nop is replaced with call to the above.
> That's it.
> kprobe and live patching has no use out of it.
>
> > But you said that you can't have this and trace the functions at the
> > same time. Which also means you can't do live kernel patching on these
> > functions either.
>
> I don't think it's a real use case, but to avoid further arguing
> I'll add one nop to the front of generated bpf trampoline so that
> ftrace and livepatch can use it.

And how does this nop get accounted for? It needs to update the ftrace
dyn_ftrace array that stores all the function locations.

-- Steve

2019-10-23 00:18:57

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()



>> On Oct 22, 2019, at 2:58 PM, Alexei Starovoitov <[email protected]> wrote:
>>
>> On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
>> I gave a solution for this. And that is to add another flag to allow
>> for just the minimum to change the ip. And we can even add another flag
>> to allow for changing the stack if needed (to emulate a call with the
>> same parameters).
>
> your solution is to reduce the overhead.
> my solution is to remove it competely. See the difference?
>
>> By doing this work, live kernel patching will also benefit. Because it
>> is also dealing with the unnecessary overhead of saving regs.
>> And we could possibly even have kprobes benefit from this if a kprobe
>> doesn't need full regs.
>
> Neither of two statements are true. The per-function generated trampoline
> I'm talking about is bpf specific. For a function with two arguments it's just:
> push rbp
> mov rbp, rsp
> push rdi
> push rsi
> lea rdi,[rbp-0x10]
> call jited_bpf_prog
> pop rsi
> pop rdi
> leave
> ret

Why are you saving rsi? You said upthread that you’re saving the args, but rsi is already available in rsi.

Just how custom is this bpf program? It seems to clobber no regs (except args), and it doesn’t return anything. Is it entirely specific to the probed function? If so, why not just call it directly?

In any event, I think you can’t *just* use text_poke. Something needs to coordinate to ensure that, if you bpf trace an already-kprobed function, the right thing happens.

FWIW, if you are going to use a trampoline like this, consider using r11 for the caller frame instead of rsi. You won’t need to restore it. But I’m wondering whether the bpf jitted code could just directly access the frame instead of indirecting through a register. (Or am I entirely misunderstanding what rdi is for in your example?)

2019-10-23 00:21:44

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 15:45:26 -0700
Andy Lutomirski <[email protected]> wrote:

> >> On Oct 22, 2019, at 2:58 PM, Alexei Starovoitov <[email protected]> wrote:
> >>
> >> On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
> >> I gave a solution for this. And that is to add another flag to allow
> >> for just the minimum to change the ip. And we can even add another flag
> >> to allow for changing the stack if needed (to emulate a call with the
> >> same parameters).
> >
> > your solution is to reduce the overhead.
> > my solution is to remove it competely. See the difference?
> >
> >> By doing this work, live kernel patching will also benefit. Because it
> >> is also dealing with the unnecessary overhead of saving regs.
> >> And we could possibly even have kprobes benefit from this if a kprobe
> >> doesn't need full regs.
> >
> > Neither of two statements are true. The per-function generated trampoline
> > I'm talking about is bpf specific. For a function with two arguments it's just:
> > push rbp
> > mov rbp, rsp
> > push rdi
> > push rsi
> > lea rdi,[rbp-0x10]
> > call jited_bpf_prog
> > pop rsi
> > pop rdi
> > leave
> > ret
>
> Why are you saving rsi? You said upthread that you’re saving the
> args, but rsi is already available in rsi.

The above is for two parameters, and is being called before the
function with those two parameters. The jited_bpf_prog will be called
with those two parameters as well, but it may also clobber them. Then
we need to restore those two parameters before calling the original
function.

>
> Just how custom is this bpf program? It seems to clobber no regs
> (except args), and it doesn’t return anything. Is it entirely
> specific to the probed function? If so, why not just call it
> directly?

It's injecting the jited_bpf_prog to be called when the probed function
is called, with the same parameters as the probed function.

my_probed_function
call trampoline


trampoline
save parameters
call jited_bpf_prog (with same parameters)
restore parameters
ret

Jumps back to my_probed_function, which is clueless that it was just
interrupted.

No need to save any clobbered registers other than the parameters, as
the jited_bpf_prog saves any registers it clobbers, just like
my_probed_function saves its own.

-- Steve

2019-10-23 01:45:06

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 02:10:21PM -0400, Steven Rostedt wrote:
> On Tue, 22 Oct 2019 10:50:56 -0700
> Alexei Starovoitov <[email protected]> wrote:
>
> > > +static void my_hijack_func(unsigned long ip, unsigned long pip,
> > > + struct ftrace_ops *ops, struct pt_regs *regs)
> >
> > 1.
> > To pass regs into the callback ftrace_regs_caller() has huge amount
> > of stores to do. Saving selector registers is not fast. pushf/popf are even slower.
> > ~200 bytes of stack is being used for save/restore.
> > This is pure overhead that bpf execution cannot afford.
> > bpf is different from traditional ftrace and other tracing, since
> > it's often active 24/7. Every nanosecond counts.
>
> Live patching is the same as what you have. If not even more critical.
>
> Note, it would be easy to also implement a "just give me IP regs" flag,
> or have that be the default if IPMODIFY is set and REGS is not.

And that will reduce the overhead from a 20+ register save/restore to 3-4?
Say it's only two regs (rbp and rip). Why does the bpf side have to pay this
runtime penalty? I see no good technical reason.

>
> > So for bpf I'm generating assembler trampoline tailored to specific kernel
> > function that has its fentry nop replaced with a call to that trampoline.
> > Instead of 20+ register save I save only arguments of that kernel function.
> > For example the trampoline that attaches to kfree_skb() will save only two registers
> > (rbp and rdi on x86) and will use 16 bytes of stack.
> >
> > 2.
> > The common ftrace callback api allows ftrace infra to use generic ftrace_ops_list_func()
> > that works for all ftracers, but it doesn't scale.
>
> That only happens if you have more than one callback to a same
> function. Otherwise you get a dedicated trampoline.

That's exactly what I tried to explain below. We have production use case
with 2 kprobes (backed by ftrace) at the same function.
fwiw the function is tcp_retransmit_skb.

> > We see different teams writing bpf services that attach to the same function.
> > In addition there are 30+ kprobes active in other places, so for every
> > fentry the ftrace_ops_list_func() walks long linked list and does hash
> > lookup for each. That's not efficient and we see this slowdown in practice.
> > Because of unique trampoline for every kernel function single
> > generic list caller is not possible.
> > Instead generated unique trampoline handles all attached bpf program
> > for this particular kernel function in a sequence of calls.
>
> Why not have a single ftrace_ops() that calls this utility and do the
> multiplexing then?

because it's not an acceptable overhead.

> > No link lists to walk, no hash tables to lookup.
> > All overhead is gone.
> >
> > 3.
> > The existing kprobe bpf progs are using pt_regs to read arguments. Due to
>
> That was done because kprobes in general work off of int3. And the
> saving of pt_regs was to reuse the code and allow kprobes to work both
> with or without a ftrace helper.

sure. that makes sense. That's why ftrace-based kprobes are the best
and fastest kernel infra today for bpf to attach to.
But after using it all in prod we see that it's too slow for ever growing
demands which are bpf specific. All the optimizations that went
into kprobe handling do help. No doubt about that. But here I'm talking
about removing _all_ overhead. Not just making kprobe 2-3 times faster.

> > that ugliness all of them are written for single architecture (x86-64).
> > Porting them to arm64 is not that hard, but porting to 32-bit arch is close
> > to impossible. With custom generated trampoline we'll have bpf progs that
> > work as-is on all archs. raw_tracepoint bpf progs already demonstrated
> > that such portability is possible. This new kprobe++ progs will be similar.
> >
> > 4.
> > Due to uniqueness of bpf trampoline sharing trampoline between ftracers
> > and bpf progs is not possible, so users would have to choose whether to
> > ftrace that particular kernel function or attach bpf to it.
> > Attach is not going to stomp on each other. I'm reusing ftrace_make_call/nop
> > approach that checks that its a 'nop' being replaced.
>
> What about the approach I showed here? Just register a ftrace_ops with
> ip modify set, and then call you unique trampoline directly.

It's 100% unnecessary overhead.

> It would keep the modification all in one place instead of having
> multiple implementations of it. We can make ftrace call your trampoline
> just like it was called directly, without writing a whole new
> infrastructure.

That is not true at all.
I haven't posted the code yet, but you're already arguing about
hypothetical code duplication.
There is none so far. I'm not reinventing ftrace.
There will be no FTRACE_OPS_FL_* flags, no ftrace_ops equivalent.
None of it applies.
Since Peter is replacing ftrace specific nop->int3->call patching
with text_poke() I can just use that.

2019-10-23 02:55:00

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 03:45:26PM -0700, Andy Lutomirski wrote:
>
>
> >> On Oct 22, 2019, at 2:58 PM, Alexei Starovoitov <[email protected]> wrote:
> >>
> >> On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
> >> I gave a solution for this. And that is to add another flag to allow
> >> for just the minimum to change the ip. And we can even add another flag
> >> to allow for changing the stack if needed (to emulate a call with the
> >> same parameters).
> >
> > your solution is to reduce the overhead.
> > my solution is to remove it competely. See the difference?
> >
> >> By doing this work, live kernel patching will also benefit. Because it
> >> is also dealing with the unnecessary overhead of saving regs.
> >> And we could possibly even have kprobes benefit from this if a kprobe
> >> doesn't need full regs.
> >
> > Neither of two statements are true. The per-function generated trampoline
> > I'm talking about is bpf specific. For a function with two arguments it's just:
> > push rbp
> > mov rbp, rsp
> > push rdi
> > push rsi
> > lea rdi,[rbp-0x10]
> > call jited_bpf_prog
> > pop rsi
> > pop rdi
> > leave
> > ret
>
> Why are you saving rsi? You said upthread that you’re saving the args, but rsi is already available in rsi.

because rsi is caller saved. The above example is for probing something
like tcp_set_state(struct sock *sk, int state) that everyone used to
kprobe until we got a tracepoint there.
The main bpf prog has only one argument R1 == rdi on x86,
but it's allowed to clobber all caller saved regs.
Just like x86 function that accepts one argument in rdi can clobber rsi and others.
So it's essential to save 'sk' and 'state' for tcp_set_state()
to continue as if nothing happened.

> But I’m wondering whether the bpf jitted code could just directly access the frame instead of indirecting through a register.

That's an excellent question!
We've debated a ton whether to extend main prog from R1 to all R1-R5
like bpf subprograms allow. The problem is a lot of existing infra
assume single R1==ctx. Passing 6th argument is not defined either.
But it's nice to see all arguments of the kernel function.
Also bpf is 64-bit ISA. Even when it's running on 32-bit arch.
Just taking values from registers doesn't work there.
Whereas when args are indirectly passed as a bunch of u64s in the stack
the bpf prog becomes portable across architectures
(not 100% of course, but close).

> Is it entirely specific to the probed function?

yes. It is specific to the probed function. The verifier makes sure
that only first two arguments are accessed as read-only
via *(u64*)(r1 + 0) and *(u64*)(r1 + 8)
But the program is allowed to use r2-r5 without saving them.
r6-r10 are saved in implicit program prologue.
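
In other words, a hypothetical restricted-C view of such a program
(not an actual API, just the shape described above):

/* r1/ctx points at the u64 slots the trampoline spilled */
static int tcp_set_state_prog(unsigned long long *ctx)
{
        unsigned long long sk = ctx[0]; /* struct sock *sk, read-only */
        int state = (int)ctx[1];        /* int state */

        /* ... use sk and state; r2-r5 may be clobbered freely ... */
        return 0;
}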

> In any event, I think you can’t *just* use text_poke. Something needs to coordinate to ensure that, if you bpf trace an already-kprobed function, the right thing happens.

Right. Not _just_. I'm looking at Peter's patches and as a minimum I need to grab
text_mutex and make sure it's nop being replaced.

> FWIW, if you are going to use a trampoline like this, consider using r11 for the caller frame instead of rsi.

r11 can be used by jited code that does divide and multiply.
It's possible to refactor that code and free it up.
I prefer to save such micro-optimization for later.
Dealing with the interpreter is also enough of a pain that I'm considering
not supporting it for these new things.
Folks should be using CONFIG_BPF_JIT_ALWAYS_ON anyway.

2019-10-23 03:34:01

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 18:17:40 -0400
Steven Rostedt <[email protected]> wrote:

> > your solution is to reduce the overhead.
> > my solution is to remove it competely. See the difference?
>
> You're just trimming it down. I'm curious what overhead you save by
> not saving all parameter registers and handling it on a case-by-case basis?

I played with adding a ftrace_ip_caller, that only modifies the ip, and
passes a regs with bare minimum saved (allowing the callback to modify
the regs->ip).

I modified the test-events-sample to just hijack the do_raw_spin_lock()
function, which is a very hot path, as it is called by most of the
spin_lock code.

I then ran:

# perf stat -r 20 /work/c/hackbench 50

vanilla, as nothing is happening.

I then enabled the test-events-sample with the new ftrace ip
modification only.

I then had it do the hijack of do_raw_spin_lock() with the current
ftrace_regs_caller that kprobes and live kernel patching use.

The improvement was quite noticeable.

Baseline: (no tracing)

# perf stat -r 20 /work/c/hackbench 50
Time: 1.146
Time: 1.120
Time: 1.102
Time: 1.099
Time: 1.136
Time: 1.123
Time: 1.128
Time: 1.115
Time: 1.111
Time: 1.192
Time: 1.160
Time: 1.156
Time: 1.135
Time: 1.144
Time: 1.150
Time: 1.120
Time: 1.121
Time: 1.120
Time: 1.106
Time: 1.127

Performance counter stats for '/work/c/hackbench 50' (20 runs):

9,056.18 msec task-clock # 6.770 CPUs utilized ( +- 0.26% )
148,461 context-switches # 0.016 M/sec ( +- 2.58% )
18,397 cpu-migrations # 0.002 M/sec ( +- 3.24% )
54,029 page-faults # 0.006 M/sec ( +- 0.63% )
33,227,808,738 cycles # 3.669 GHz ( +- 0.25% )
23,314,273,335 stalled-cycles-frontend # 70.16% frontend cycles idle ( +- 0.25% )
23,568,794,330 instructions # 0.71 insn per cycle
# 0.99 stalled cycles per insn ( +- 0.26% )
3,756,521,438 branches # 414.802 M/sec ( +- 0.29% )
36,949,690 branch-misses # 0.98% of all branches ( +- 0.46% )

1.3377 +- 0.0213 seconds time elapsed ( +- 1.59% )

Using IPMODIFY without regs:

# perf stat -r 20 /work/c/hackbench 50
Time: 1.193
Time: 1.115
Time: 1.145
Time: 1.129
Time: 1.121
Time: 1.132
Time: 1.136
Time: 1.156
Time: 1.133
Time: 1.131
Time: 1.138
Time: 1.150
Time: 1.147
Time: 1.137
Time: 1.178
Time: 1.121
Time: 1.142
Time: 1.158
Time: 1.143
Time: 1.168

Performance counter stats for '/work/c/hackbench 50' (20 runs):

9,231.39 msec task-clock # 6.917 CPUs utilized ( +- 0.39% )
136,822 context-switches # 0.015 M/sec ( +- 3.06% )
17,308 cpu-migrations # 0.002 M/sec ( +- 2.61% )
54,521 page-faults # 0.006 M/sec ( +- 0.57% )
33,875,937,399 cycles # 3.670 GHz ( +- 0.39% )
23,575,136,580 stalled-cycles-frontend # 69.59% frontend cycles idle ( +- 0.41% )
24,246,404,007 instructions # 0.72 insn per cycle
# 0.97 stalled cycles per insn ( +- 0.33% )
3,878,453,510 branches # 420.138 M/sec ( +- 0.36% )
47,653,911 branch-misses # 1.23% of all branches ( +- 0.43% )

1.33462 +- 0.00440 seconds time elapsed ( +- 0.33% )


The current ftrace_regs_caller: (old way of doing it)

# perf stat -r 20 /work/c/hackbench 50
Time: 1.114
Time: 1.153
Time: 1.236
Time: 1.208
Time: 1.179
Time: 1.183
Time: 1.217
Time: 1.190
Time: 1.157
Time: 1.172
Time: 1.215
Time: 1.165
Time: 1.171
Time: 1.188
Time: 1.176
Time: 1.217
Time: 1.341
Time: 1.238
Time: 1.363
Time: 1.287

Performance counter stats for '/work/c/hackbench 50' (20 runs):

9,522.76 msec task-clock # 6.653 CPUs utilized ( +- 0.36% )
131,347 context-switches # 0.014 M/sec ( +- 3.37% )
17,090 cpu-migrations # 0.002 M/sec ( +- 3.56% )
54,126 page-faults # 0.006 M/sec ( +- 0.71% )
34,942,273,861 cycles # 3.669 GHz ( +- 0.35% )
24,517,757,272 stalled-cycles-frontend # 70.17% frontend cycles idle ( +- 0.35% )
24,684,124,265 instructions # 0.71 insn per cycle
# 0.99 stalled cycles per insn ( +- 0.34% )
3,859,734,617 branches # 405.317 M/sec ( +- 0.38% )
47,286,857 branch-misses # 1.23% of all branches ( +- 0.70% )

1.4313 +- 0.0244 seconds time elapsed ( +- 1.70% )


In summary we have:

Baseline: (no tracing)

33,227,808,738 cycles # 3.669 GHz ( +- 0.25% )
1.3377 +- 0.0213 seconds time elapsed ( +- 1.59% )

Just ip modification:

33,875,937,399 cycles # 3.670 GHz ( +- 0.39% )
1.33462 +- 0.00440 seconds time elapsed ( +- 0.33% )

Full regs as done today:

34,942,273,861 cycles # 3.669 GHz ( +- 0.35% )
1.4313 +- 0.0244 seconds time elapsed ( +- 1.70% )
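
(In relative terms, from the numbers above: the ip-only caller costs
roughly 2% more cycles than the baseline, while the full-regs caller
costs roughly 5% more cycles and about 7% more wall time; the ip-only
elapsed time is within the baseline's noise.)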


Note, the hijack function is done very naively; the function trampoline
is called twice.

do_raw_spin_lock()
call ftrace_ip_caller
call my_highjack_func
test pip (true)
call my_do_spin_lock()
call do_raw_spin_lock()
call ftrace_ip_caller
call my_highjack_func
test pip (false)

We could work on ways to not do this double call, but I just wanted to
get some kind of test case out.

This method could at least help with live kernel patching.

Attached is the patch I used. To switch back to full regs, just
remove the comma and uncomment the SAVE_REGS flag in the
trace-events-samples.c file.

-- Steve


Attachments:
(No filename) (6.61 kB)
test.patch (15.87 kB)
Download all attachments

2019-10-23 04:22:39

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 4:49 PM Alexei Starovoitov
<[email protected]> wrote:
>
> On Tue, Oct 22, 2019 at 03:45:26PM -0700, Andy Lutomirski wrote:
> >
> >
> > >> On Oct 22, 2019, at 2:58 PM, Alexei Starovoitov <[email protected]> wrote:
> > >>
> > >> On Tue, Oct 22, 2019 at 05:04:30PM -0400, Steven Rostedt wrote:
> > >> I gave a solution for this. And that is to add another flag to allow
> > >> for just the minimum to change the ip. And we can even add another flag
> > >> to allow for changing the stack if needed (to emulate a call with the
> > >> same parameters).
> > >
> > > your solution is to reduce the overhead.
> > > my solution is to remove it competely. See the difference?
> > >
> > >> By doing this work, live kernel patching will also benefit. Because it
> > >> is also dealing with the unnecessary overhead of saving regs.
> > >> And we could possibly even have kprobes benefit from this if a kprobe
> > >> doesn't need full regs.
> > >
> > > Neither of two statements are true. The per-function generated trampoline
> > > I'm talking about is bpf specific. For a function with two arguments it's just:
> > > push rbp
> > > mov rbp, rsp
> > > push rdi
> > > push rsi
> > > lea rdi,[rbp-0x10]
> > > call jited_bpf_prog
> > > pop rsi
> > > pop rdi
> > > leave
> > > ret
> >
> > Why are you saving rsi? You said upthread that you’re saving the args, but rsi is already available in rsi.
>
> because rsi is caller saved. The above example is for probing something
> like tcp_set_state(struct sock *sk, int state) that everyone used to
> kprobe until we got a tracepoint there.
> The main bpf prog has only one argument R1 == rdi on x86,
> but it's allowed to clobber all caller saved regs.
> Just like x86 function that accepts one argument in rdi can clobber rsi and others.
> So it's essential to save 'sk' and 'state' for tcp_set_state()
> to continue as nothing happened.

Oh, right, you're hijacking the very first instruction, so you know
that the rest of the arg regs as well as rax are unused.

But I find it hard to believe that this is a particularly meaningful
optimization compared to the version that saves all the C-clobbered
registers. Steven,

Also, Alexei, are you testing on a CONFIG_FRAME_POINTER=y kernel? The
ftrace code has a somewhat nasty special case to make
CONFIG_FRAME_POINTER=y work right, and your example trampoline does
not, but arguably should, have exactly the same fixup. For good
performance, you should be using CONFIG_FRAME_POINTER=n.

Steven, with your benchmark, could you easily make your actual ftrace
hook do nothing at all and get a perf report on the result (i.e. call
the traced function in a loop a bunch of times under perf record -e
cycles or similar)? It would be interesting to see exactly what
trampoline code you're generating and just how bad it is. ISTM it
should be possible to squeeze very good performance out of ftrace. I
suppose you could also have a fancier mode than just "IP" that
specifies that the caller knows exactly which registers are live and
what they are. Then you could generate code that's exactly as good as
Alexei's.

2019-10-23 09:06:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, Oct 22, 2019 at 09:20:27PM -0700, Andy Lutomirski wrote:
> Also, Alexei, are you testing on a CONFIG_FRAME_POINTER=y kernel? The
> ftrace code has a somewhat nasty special case to make
> CONFIG_FRAME_POINTER=y work right, and your example trampoline does
> not, but arguably should, have exactly the same fixup. For good
> performance, you should be using CONFIG_FRAME_POINTER=n.

Trampolines and JITs (which both lack ORC data) should always have frame
pointers. That way, when ORC fails, it can fall back to FP and we can
still get sensible unwinds.

See:

ae6a45a08689 ("x86/unwind/orc: Fall back to using frame pointers for generated code")

2019-10-24 09:36:34

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, Oct 23, 2019 at 12:23:06PM -0400, Steven Rostedt wrote:
> On Tue, 22 Oct 2019 14:58:43 -0700
> Alexei Starovoitov <[email protected]> wrote:
>
> > Neither of two statements are true. The per-function generated trampoline
> > I'm talking about is bpf specific. For a function with two arguments it's just:
> > push rbp
> > mov rbp, rsp
> > push rdi
> > push rsi
> > lea rdi,[rbp-0x10]
> > call jited_bpf_prog
> > pop rsi
> > pop rdi
> > leave
> > ret
> >
> > fentry's nop is replaced with call to the above.
> > That's it.
> > kprobe and live patching has no use out of it.
> >
>
> Below is a patch that allows you to do this, and you don't need to
> worry about patching the nops. And it also allows you to hook directly
> to any function and still allow kprobes and tracing on those same
> functions (as long as they don't modify the ip, but in the future, we
> may be able to allow that too!). And this code does not depend on
> Peter's code either.
>
> All you need to do is:
>
> register_ftrace_direct((unsigned long)func_you_want_to_trace,
> (unsigned long)your_trampoline);
>
>
> I added to trace-event-samples.c:
>
> void my_direct_func(raw_spinlock_t *lock)
> {
> trace_printk("taking %p\n", lock);
> }
>
> extern void my_tramp(void *);
>
> asm (
> " .pushsection .text, \"ax\", @progbits\n"
> " my_tramp:"
> #if 1
> " pushq %rbp\n"
> " movq %rsp, %rbp\n"
> " pushq %rdi\n"
> " call my_direct_func\n"
> " popq %rdi\n"
> " leave\n"
> #endif
> " ret\n"
> " .popsection\n"
> );
>
>
> (the #if was for testing purposes)
>
> And then in the module load and unload:
>
> ret = register_ftrace_direct((unsigned long)do_raw_spin_lock,
> (unsigned long)my_tramp);
>
> unregister_ftrace_direct((unsigned long)do_raw_spin_lock,
> (unsigned long)my_tramp);
>
> respectively.
>
> And what this does is if there's only a single callback to the
> registered function, it changes the nop in the function to call your
> trampoline directly (just like you want this to do!). But if we add
> another callback, a direct_ops ftrace_ops gets added to the list of the
> functions to go through, and this will set up the code to call your
> trampoline after it calls all the other callbacks.
>
> The called trampoline will be called as if it was called directly from
> the nop.
>
> OK, I wrote this up quickly, and it has some bugs, but nothing that
> can't be straightened out (specifically, the checks fail if you add a
> function trace to one of the direct callbacks, but this can be dealt
> with).
>
> Note, the work needed to port this to other archs is rather minimal
> (just need to tweak the ftrace_regs_caller and have a way to pass back
> the call address via pt_regs that is not saved).
>
> Alexei,
>
> Would this work for you?

Yes!
Looks great. More comments below.

>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 6adaf18b3365..de3372bd08ae 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -159,6 +159,7 @@ config X86
> select HAVE_DYNAMIC_FTRACE
> select HAVE_DYNAMIC_FTRACE_WITH_REGS
> select HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
> + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> select HAVE_EBPF_JIT
> select HAVE_EFFICIENT_UNALIGNED_ACCESS
> select HAVE_EISA
> diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> index c38a66661576..34da1e424391 100644
> --- a/arch/x86/include/asm/ftrace.h
> +++ b/arch/x86/include/asm/ftrace.h
> @@ -28,6 +28,12 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
> return addr;
> }
>
> +static inline void ftrace_set_call_func(struct pt_regs *regs, unsigned long addr)
> +{
> + /* Emulate a call */
> + regs->orig_ax = addr;

This probably needs a longer comment :)

> + .if \make_call
> + movq ORIG_RAX(%rsp), %rax
> + movq %rax, MCOUNT_REG_SIZE-8(%rsp)

reading asm helps.

> +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> + bool
> +
> config HAVE_FTRACE_MCOUNT_RECORD
> bool
> help
> @@ -565,6 +568,11 @@ config DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
> depends on DYNAMIC_FTRACE
> depends on HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
>
> +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> + def_bool y
> + depends on DYNAMIC_FTRACE
> + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS

It seems to me that it's a bit of overkill to add a new config knob
for every ftrace feature.
HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS (which the arch defines) would
be enough to check and return an error in register_ftrace_direct(),
right?

> -static struct ftrace_hash *
> -__ftrace_hash_move(struct ftrace_hash *src)
> +static void transfer_hash(struct ftrace_hash *dst, struct ftrace_hash *src)
> {
> struct ftrace_func_entry *entry;
> - struct hlist_node *tn;
> struct hlist_head *hhd;
> + struct hlist_node *tn;
> + int size;
> + int i;
> +
> + dst->flags = src->flags;
> +
> + size = 1 << src->size_bits;
> + for (i = 0; i < size; i++) {
> + hhd = &src->buckets[i];
> + hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
> + remove_hash_entry(src, entry);
> + __add_hash_entry(dst, entry);

I don't quite follow why this is needed.
I thought alloc_and_copy_ftrace_hash() can already handle it.
If that is just unrelated cleanup then sure. Looks good.

> +struct ftrace_ops direct_ops = {
> + .func = call_direct_funcs,
> + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
> +#if 1
> + | FTRACE_OPS_FL_DIRECT
> +#endif
> +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
> + | FTRACE_OPS_FL_SAVE_REGS
> +#endif

With FL_DIRECT the CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY won't be needed, right?
At least not for bpf use case.
Do you see livepatching using it or switching to FL_DIRECT too?

> + ret = -ENOMEM;
> + if (ftrace_hash_empty(direct_functions) ||
> + direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
> + struct ftrace_hash *new_hash;
> + int size = ftrace_hash_empty(direct_functions) ? 0 :
> + direct_functions->count + 1;
> + int bits;
> +
> + if (size < 32)
> + size = 32;
> +
> + for (size /= 2; size; size >>= 1)
> + bits++;
> +
> + /* Don't allocate too much */
> + if (bits > FTRACE_HASH_MAX_BITS)
> + bits = FTRACE_HASH_MAX_BITS;
> +
> + new_hash = alloc_ftrace_hash(bits);
> + if (!new_hash)
> + goto out_unlock;
> +
> + transfer_hash(new_hash, direct_functions);
> + free_ftrace_hash(direct_functions);
> + direct_functions = new_hash;

That's probably racy, no?
ftrace_get_addr_new() is not holding direct_mutex that
protects direct_functions.

> + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED))
> + ret = register_ftrace_function(&direct_ops);

Having a single direct_ops is nice.

> @@ -2370,6 +2542,10 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
> {
> struct ftrace_ops *ops;
>
> + if ((rec->flags & FTRACE_FL_DIRECT) &&
> + (ftrace_rec_count(rec) == 1))
> + return find_rec_direct(rec->ip);
> +

I've started playing with this function as well to
implement the 2nd nop approach I mentioned earlier.
I'm going to abandon it, since your approach is better.
It allows not only bpf, but anyone else to register direct.

I have one more question/request.
Looks like ftrace can be turned off with sysctl.
Which means that a person or a script can accidentally turn it off
and all existing kprobe+bpf stuff that is ftrace based will
be silently switched off.
User services will be impacted and will make people unhappy.
I'm not sure how to solve the existing situation,
but for FL_DIRECT can you add a check that if a particular
nop is modified to do a direct call then the only interface to
turn it off is to call unregister_ftrace_direct().
There should be no side channel to kill it.
ftrace_disable and ftrace_kill make me nervous too.
Fast forward a year and imagine a few host-critical bpf progs
are running in production and relying on FL_DIRECT.
Now somebody decides to test a new ftrace feature and
it hits one of the FTRACE_WARN_ON()s.
That will shut down the whole of ftrace, and bpf progs
will stop executing. I'd like to avoid that.
May be I misread the code?

2019-10-24 09:41:47

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, 23 Oct 2019 12:34:43 -0700
Alexei Starovoitov <[email protected]> wrote:

> > Would this work for you?
>
> Yes!
> Looks great. More comments below.

Awesome!

>
> >
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 6adaf18b3365..de3372bd08ae 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -159,6 +159,7 @@ config X86
> > select HAVE_DYNAMIC_FTRACE
> > select HAVE_DYNAMIC_FTRACE_WITH_REGS
> > select HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
> > + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> > select HAVE_EBPF_JIT
> > select HAVE_EFFICIENT_UNALIGNED_ACCESS
> > select HAVE_EISA
> > diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> > index c38a66661576..34da1e424391 100644
> > --- a/arch/x86/include/asm/ftrace.h
> > +++ b/arch/x86/include/asm/ftrace.h
> > @@ -28,6 +28,12 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
> > return addr;
> > }
> >
> > +static inline void ftrace_set_call_func(struct pt_regs *regs, unsigned long addr)
> > +{
> > + /* Emulate a call */
> > + regs->orig_ax = addr;
>
> This probably needs a longer comment :)

Yes, when I get this to a submission point, I plan on having *a lot*
more comments all over the place.
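
[ For illustration only -- a sketch of the sort of expanded comment that
  could go there, based on how the ORIG_RAX slot is used in ftrace_64.S
  in this patch; not Steven's actual wording: ]

static inline void ftrace_set_call_func(struct pt_regs *regs, unsigned long addr)
{
	/*
	 * The ftrace caller trampolines in ftrace_64.S zero the ORIG_RAX
	 * slot when saving registers. A callback that wants the traced
	 * location to jump into its own trampoline stores that
	 * trampoline's address here. On the way out, the assembly checks
	 * ORIG_RAX and, if it is non-zero, rewrites the stack so that
	 * restoring the registers ends in a call to this address instead
	 * of a plain return.
	 */
	regs->orig_ax = addr;
}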


>
> > + .if \make_call
> > + movq ORIG_RAX(%rsp), %rax
> > + movq %rax, MCOUNT_REG_SIZE-8(%rsp)
>
> reading asm helps.
>
> > +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> > + bool
> > +
> > config HAVE_FTRACE_MCOUNT_RECORD
> > bool
> > help
> > @@ -565,6 +568,11 @@ config DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
> > depends on DYNAMIC_FTRACE
> > depends on HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
> >
> > +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
> > + def_bool y
> > + depends on DYNAMIC_FTRACE
> > + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
>
> It seems to me that it's a bit of overkill to add new config knob
> for every ftrace feature.
> HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS (that arch defined) would
> be enough to check and return error in register_ftrace_direct()
> right?

IIRC, we started doing this because it allows the dependencies to be
defined in the kernel/trace directory. That is, if
CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is set, then we know that
direct calls *and* DYNAMIC_FTRACE are enabled. It cuts down on some of
the more complex #if or the arch needing to do

select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if DYNAMIC_FTRACE

It may be overkill, but it does keep the pain in one place.

>
> > -static struct ftrace_hash *
> > -__ftrace_hash_move(struct ftrace_hash *src)
> > +static void transfer_hash(struct ftrace_hash *dst, struct ftrace_hash *src)
> > {
> > struct ftrace_func_entry *entry;
> > - struct hlist_node *tn;
> > struct hlist_head *hhd;
> > + struct hlist_node *tn;
> > + int size;
> > + int i;
> > +
> > + dst->flags = src->flags;
> > +
> > + size = 1 << src->size_bits;
> > + for (i = 0; i < size; i++) {
> > + hhd = &src->buckets[i];
> > + hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
> > + remove_hash_entry(src, entry);
> > + __add_hash_entry(dst, entry);
>
> I don't quite follow why this is needed.
> I thought alloc_and_copy_ftrace_hash() can already handle it.
> If that is just unrelated cleanup then sure. Looks good.

The alloc and copy is made to always create a new hash (because of the
way we do enabling of new filters in the set_ftrace_filter).

I pulled this part out so that the direct_functions hash only needs to
grow and allocate when needed, instead of reallocating at every update.

>
> > +struct ftrace_ops direct_ops = {
> > + .func = call_direct_funcs,
> > + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
> > +#if 1
> > + | FTRACE_OPS_FL_DIRECT
> > +#endif
> > +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
> > + | FTRACE_OPS_FL_SAVE_REGS
> > +#endif
>
> With FL_DIRECT the CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY won't be needed, right ?
> At least not for bpf use case.
> Do you see livepatching using it or switching to FL_DIRECT too?

Correct. I talked with Josh on IRC and we are looking into removing the
pushf/popf from the ftrace_regs_caller to see if that helps the
performance of live patching. I'm also currently working on making
this patch not depend on the IP modify one, so the IP modify patch
doesn't need to be applied.

This also cleans up the asm code a bit more (getting rid of the macro).


>
> > + ret = -ENOMEM;
> > + if (ftrace_hash_empty(direct_functions) ||
> > + direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
> > + struct ftrace_hash *new_hash;
> > + int size = ftrace_hash_empty(direct_functions) ? 0 :
> > + direct_functions->count + 1;
> > + int bits;
> > +
> > + if (size < 32)
> > + size = 32;
> > +
> > + for (size /= 2; size; size >>= 1)
> > + bits++;
> > +
> > + /* Don't allocate too much */
> > + if (bits > FTRACE_HASH_MAX_BITS)
> > + bits = FTRACE_HASH_MAX_BITS;
> > +
> > + new_hash = alloc_ftrace_hash(bits);
> > + if (!new_hash)
> > + goto out_unlock;
> > +
> > + transfer_hash(new_hash, direct_functions);
> > + free_ftrace_hash(direct_functions);
> > + direct_functions = new_hash;
>
> That's probably racy, no?
> ftrace_get_addr_new() is not holding direct_mutex that
> protects direct_functions.

Yes, there are actually a few places that I noticed needed some more care
with locking. And I also found some that are missing now (without these
changes).

I did say this patch is buggy ;-)
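
[ For reference, a hedged sketch of the growth path with the issues
  flagged so far called out -- the lockdep assertion and the bits
  initialization are illustrative additions of mine, not code from the
  posted patch or from Steven's branch: ]

	/* register_ftrace_direct() takes direct_mutex before getting here */
	lockdep_assert_held(&direct_mutex);

	ret = -ENOMEM;
	if (ftrace_hash_empty(direct_functions) ||
	    direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
		struct ftrace_hash *new_hash;
		int size = ftrace_hash_empty(direct_functions) ? 0 :
			direct_functions->count + 1;
		int bits = 0;	/* was left uninitialized in the posted patch */

		if (size < 32)
			size = 32;

		for (size /= 2; size; size >>= 1)
			bits++;

		/* Don't allocate too much */
		if (bits > FTRACE_HASH_MAX_BITS)
			bits = FTRACE_HASH_MAX_BITS;

		new_hash = alloc_ftrace_hash(bits);
		if (!new_hash)
			goto out_unlock;

		transfer_hash(new_hash, direct_functions);
		free_ftrace_hash(direct_functions);
		/*
		 * Publishing the new hash still needs care: readers such as
		 * ftrace_get_addr_new() do not take direct_mutex, which is
		 * the race pointed out above.
		 */
		direct_functions = new_hash;
	}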


>
> > + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED))
> > + ret = register_ftrace_function(&direct_ops);
>
> Having single direct_ops is nice.
>
> > @@ -2370,6 +2542,10 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
> > {
> > struct ftrace_ops *ops;
> >
> > + if ((rec->flags & FTRACE_FL_DIRECT) &&
> > + (ftrace_rec_count(rec) == 1))
> > + return find_rec_direct(rec->ip);
> > +
>
> I've started playing with this function as well to
> implement 2nd nop approach I mentioned earlier.
> I'm going to abandon it, since your approach is better.
> It allows not only bpf, but anyone else to register direct.
>
> I have one more question/request.
> Looks like ftrace can be turned off with sysctl.
> Which means that a person or a script can accidentally turn it off
> and all existing kprobe+bpf stuff that is ftrace based will
> be silently switched off.

See http://lkml.kernel.org/r/[email protected]

I can (and should) add the PERMANENT flag to the direct_ops.

Note, the PERMANENT patches will be added before this one.
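
[ Roughly, assuming the PERMANENT series goes in first, direct_ops would
  presumably end up looking something like this -- a sketch, not the
  final code: ]

struct ftrace_ops direct_ops = {
	.func	= call_direct_funcs,
	.flags	= FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
		  | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
		  /* not affected by the ftrace_enabled sysctl */
		  | FTRACE_OPS_FL_PERMANENT,
};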

> User services will be impacted and will make people unhappy.
> I'm not sure how to solve the existing situation,
> but for FL_DIRECT can you add a check that if particular
> nop is modified to do direct call then the only interface to
> turn it off is to call unregister_ftrace_direct().
> There should be no side channel to kill it.
> ftrace_disable and ftrace_kill make me nervous too.

ftrace_disable and ftrace_kill are for when a bug happens that is so bad
that ftrace can probably kill the system. It's time for a reboot.


> Fast forward a year and imagine few host critical bpf progs
> are running in production and relying on FL_DIRECT.
> Now somebody decided to test new ftrace feature and
> it hit one of FTRACE_WARN_ON().
> That will shutdown the whole ftrace and bpf progs
> will stop executing. I'd like to avoid that.
> May be I misread the code?

It basically freezes it. Currently registered ftrace_ops will not be
touched. But nothing can be removed or added.

It's basically saying, "Something totally wrong has happened, and if I
touch the code here, I may panic the box. So don't do anything more!"

And yes, without adding some of these, I have in fact panicked the box.

This is something that Ingo drilled hard into me: modifying text is
such a dangerous operation that there should be constant checks that
things are happening the way we expect them to, and if an anomaly
happens, stop touching everything.

OK, I'll work to get this patch in for the next merge window.

Thanks for the review.

-- Steve

2019-10-24 11:19:29

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Tue, 22 Oct 2019 14:58:43 -0700
Alexei Starovoitov <[email protected]> wrote:

> Neither of two statements are true. The per-function generated trampoline
> I'm talking about is bpf specific. For a function with two arguments it's just:
> push rbp
> mov rbp, rsp
> push rdi
> push rsi
> lea rdi,[rbp-0x10]
> call jited_bpf_prog
> pop rsi
> pop rdi
> leave
> ret
>
> fentry's nop is replaced with call to the above.
> That's it.
> kprobe and live patching has no use out of it.
>

Below is a patch that allows you to do this, and you don't need to
worry about patching the nops. And it also allows you to hook directly
to any function and still allow kprobes and tracing on those same
functions (as long as they don't modify the ip, but in the future, we
may be able to allow that too!). And this code does not depend on
Peter's code either.

All you need to do is:

register_ftrace_direct((unsigned long)func_you_want_to_trace,
(unsigned long)your_trampoline);


I added to trace-event-samples.c:

void my_direct_func(raw_spinlock_t *lock)
{
trace_printk("taking %p\n", lock);
}

extern void my_tramp(void *);

asm (
" .pushsection .text, \"ax\", @progbits\n"
" my_tramp:"
#if 1
" pushq %rbp\n"
" movq %rsp, %rbp\n"
" pushq %rdi\n"
" call my_direct_func\n"
" popq %rdi\n"
" leave\n"
#endif
" ret\n"
" .popsection\n"
);


(the #if was for testing purposes)

And then in the module load and unload:

ret = register_ftrace_direct((unsigned long)do_raw_spin_lock,
(unsigned long)my_tramp);

unregister_ftrace_direct((unsigned long)do_raw_spin_lock,
(unsigned long)my_tramp);

respectively.

And what this does is if there's only a single callback to the
registered function, it changes the nop in the function to call your
trampoline directly (just like you want this to do!). But if we add
another callback, a direct_ops ftrace_ops gets added to the list of the
functions to go through, and this will set up the code to call your
trampoline after it calls all the other callbacks.

The called trampoline will be called as if it was called directly from
the nop.

OK, I wrote this up quickly, and it has some bugs, but nothing that
can't be straightened out (specifically, the checks fail if you add a
function trace to one of the direct callbacks, but this can be dealt
with).

Note, the work needed to port this to other archs is rather minimal
(just need to tweak the ftrace_regs_caller and have a way to pass back
the call address via pt_regs that is not saved).

Alexei,

Would this work for you?

-- Steve

[ This is on top of the last patch I sent ]

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6adaf18b3365..de3372bd08ae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -159,6 +159,7 @@ config X86
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
+ select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index c38a66661576..34da1e424391 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,6 +28,12 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
return addr;
}

+static inline void ftrace_set_call_func(struct pt_regs *regs, unsigned long addr)
+{
+ /* Emulate a call */
+ regs->orig_ax = addr;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE

struct dyn_arch_ftrace {
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 5930810bf71d..3fcaf03566b0 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -88,6 +88,7 @@ EXPORT_SYMBOL(__fentry__)
movq %rdi, RDI(%rsp)
movq %r8, R8(%rsp)
movq %r9, R9(%rsp)
+ movq $0, ORIG_RAX(%rsp)
/*
* Save the original RBP. Even though the mcount ABI does not
* require this, it helps out callers.
@@ -114,7 +115,23 @@ EXPORT_SYMBOL(__fentry__)
subq $MCOUNT_INSN_SIZE, %rdi
.endm

-.macro restore_mcount_regs
+.macro restore_mcount_regs make_call=0 save_flags=0
+
+ /* ftrace_regs_caller can modify %rbp */
+ movq RBP(%rsp), %rbp
+
+ .if \make_call
+ movq ORIG_RAX(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+ /* Swap the flags with orig_rax */
+ .if \save_flags
+ movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+ .endif
+ .endif
+
movq R9(%rsp), %r9
movq R8(%rsp), %r8
movq RDI(%rsp), %rdi
@@ -123,10 +140,11 @@ EXPORT_SYMBOL(__fentry__)
movq RCX(%rsp), %rcx
movq RAX(%rsp), %rax

- /* ftrace_regs_caller can modify %rbp */
- movq RBP(%rsp), %rbp
-
+ .if \make_call
+ addq $MCOUNT_REG_SIZE-8, %rsp
+ .else
addq $MCOUNT_REG_SIZE, %rsp
+ .endif

.endm

@@ -189,10 +207,17 @@ GLOBAL(ftrace_ip_call)

/* Handlers can change the RIP */
movq RIP(%rsp), %rax
- movq %rax, MCOUNT_REG_SIZE(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)

- restore_mcount_regs
+ /* If ORIG_RAX is anything but zero, make this a call to that */
+ movq ORIG_RAX(%rsp), %rax
+ cmpq $0, %rax
+ je 1f
+ restore_mcount_regs make_call=1
+ jmp 2f

+1: restore_mcount_regs
+2:
/*
* As this jmp to ftrace_epilogue can be a short jump
* it must not be copied into the trampoline.
@@ -261,7 +286,15 @@ GLOBAL(ftrace_regs_call)
movq R10(%rsp), %r10
movq RBX(%rsp), %rbx

- restore_mcount_regs
+ /* If ORIG_RAX is anything but zero, make this a call to that */
+ movq ORIG_RAX(%rsp), %rax
+ cmpq $0, %rax
+ je 1f
+ restore_mcount_regs make_call=1 save_flags=1
+ jmp 2f
+
+1: restore_mcount_regs
+2:

/* Restore flags */
popfq
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ba1cf5640639..9b7e3ec647c7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -144,6 +144,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
* TRACE_ARRAY - The ops->private points to a trace_array descriptor.
* PERMANENT - Set when the ops is permanent and should not be affected by
* ftrace_enabled.
+ * DIRECT - Used by the direct ftrace_ops helper for direct functions.
*/
enum {
FTRACE_OPS_FL_ENABLED = 1 << 0,
@@ -163,6 +164,7 @@ enum {
FTRACE_OPS_FL_RCU = 1 << 14,
FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15,
FTRACE_OPS_FL_PERMANENT = 1 << 16,
+ FTRACE_OPS_FL_DIRECT = 1 << 17,
};

#ifdef CONFIG_DYNAMIC_FTRACE
@@ -226,6 +228,8 @@ extern enum ftrace_tracing_type_t ftrace_tracing_type;
*/
int register_ftrace_function(struct ftrace_ops *ops);
int unregister_ftrace_function(struct ftrace_ops *ops);
+int register_ftrace_direct(unsigned long ip, unsigned long addr);
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr);

extern void ftrace_stub(unsigned long a0, unsigned long a1,
struct ftrace_ops *op, struct pt_regs *regs);
@@ -335,6 +339,7 @@ bool is_ftrace_trampoline(unsigned long addr);
* DISABLED - the record is not ready to be touched yet
* IP - the record wants ip modification only (no regs)
* IP_EN - the record has ip modification only
+ * DIRECT - there is a direct function to call
*
* When a new ftrace_ops is registered and wants a function to save
* pt_regs, the rec->flag REGS is set. When the function has been
@@ -352,10 +357,11 @@ enum {
FTRACE_FL_DISABLED = (1UL << 25),
FTRACE_FL_IP = (1UL << 24),
FTRACE_FL_IP_EN = (1UL << 23),
+ FTRACE_FL_DIRECT = (1UL << 22),
};

-#define FTRACE_REF_MAX_SHIFT 23
-#define FTRACE_FL_BITS 9
+#define FTRACE_REF_MAX_SHIFT 22
+#define FTRACE_FL_BITS 10
#define FTRACE_FL_MASKED_BITS ((1UL << FTRACE_FL_BITS) - 1)
#define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT)
#define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1b88b810b8a..85403d321ee6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -36,6 +36,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS
config HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY
bool

+config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ bool
+
config HAVE_FTRACE_MCOUNT_RECORD
bool
help
@@ -565,6 +568,11 @@ config DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_IPMODIFY

+config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fec64bf679e8..c0fa24b75c35 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1030,6 +1030,7 @@ static bool update_all_ops;
struct ftrace_func_entry {
struct hlist_node hlist;
unsigned long ip;
+ unsigned long direct; /* for direct lookup only */
};

struct ftrace_func_probe {
@@ -1379,16 +1380,32 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);

-static struct ftrace_hash *
-__ftrace_hash_move(struct ftrace_hash *src)
+static void transfer_hash(struct ftrace_hash *dst, struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
- struct hlist_node *tn;
struct hlist_head *hhd;
+ struct hlist_node *tn;
+ int size;
+ int i;
+
+ dst->flags = src->flags;
+
+ size = 1 << src->size_bits;
+ for (i = 0; i < size; i++) {
+ hhd = &src->buckets[i];
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
+ remove_hash_entry(src, entry);
+ __add_hash_entry(dst, entry);
+ }
+ }
+}
+
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
+{
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int i;

/*
* If the new source is empty, just return the empty_hash.
@@ -1410,16 +1427,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (!new_hash)
return NULL;

- new_hash->flags = src->flags;
-
- size = 1 << src->size_bits;
- for (i = 0; i < size; i++) {
- hhd = &src->buckets[i];
- hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
- remove_hash_entry(src, entry);
- __add_hash_entry(new_hash, entry);
- }
- }
+ transfer_hash(new_hash, src);

return new_hash;
}
@@ -1543,6 +1551,26 @@ static int ftrace_cmp_recs(const void *a, const void *b)
return 0;
}

+struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec = NULL;
+ struct dyn_ftrace key;
+
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ }
+ return rec;
+}
+
/**
* ftrace_location_range - return the first address of a traced location
* if it touches the given ip range
@@ -1557,23 +1585,11 @@ static int ftrace_cmp_recs(const void *a, const void *b)
*/
unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
- struct ftrace_page *pg;
struct dyn_ftrace *rec;
- struct dyn_ftrace key;

- key.ip = start;
- key.flags = end; /* overload flags, as it is unsigned long */
-
- for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
- start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
- continue;
- rec = bsearch(&key, pg->records, pg->index,
- sizeof(struct dyn_ftrace),
- ftrace_cmp_recs);
- if (rec)
- return rec->ip;
- }
+ rec = lookup_rec(start, end);
+ if (rec)
+ return rec->ip;

return 0;
}
@@ -1744,6 +1760,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return false;

+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags |= FTRACE_FL_DIRECT;
+
/*
* If there's only a single callback registered to a
* function, and the ops has a trampoline registered
@@ -1783,6 +1802,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return false;
rec->flags--;

+ if (ops->flags & FTRACE_OPS_FL_DIRECT)
+ rec->flags &= ~FTRACE_FL_DIRECT;
+
/*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
@@ -2356,6 +2378,156 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
return NULL;
}

+static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static DEFINE_MUTEX(direct_mutex);
+
+static unsigned long find_rec_direct(unsigned long ip)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = __ftrace_lookup_ip(direct_functions, ip);
+ if (!entry)
+ return FTRACE_REGS_ADDR;
+
+ return entry->direct;
+}
+
+static void call_direct_funcs(unsigned long ip, unsigned long pip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ unsigned long addr;
+
+ addr = find_rec_direct(ip);
+ if (addr == FTRACE_REGS_ADDR)
+ return;
+
+ ftrace_set_call_func(regs, addr);
+}
+
+struct ftrace_ops direct_ops = {
+ .func = call_direct_funcs,
+ .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
+#if 1
+ | FTRACE_OPS_FL_DIRECT
+#endif
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
+ | FTRACE_OPS_FL_SAVE_REGS
+#endif
+ ,
+};
+
+int register_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int ret = -EBUSY;
+
+ mutex_lock(&direct_mutex);
+
+ if (find_rec_direct(ip) != FTRACE_REGS_ADDR)
+ goto out_unlock;
+
+ rec = lookup_rec(ip, ip);
+ ret = -ENODEV;
+ if (!rec)
+ goto out_unlock;
+
+ if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
+ goto out_unlock;
+
+ /* Make sure the ip points to the exact record */
+ ip = rec->ip;
+
+ ret = -ENOMEM;
+ if (ftrace_hash_empty(direct_functions) ||
+ direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+ struct ftrace_hash *new_hash;
+ int size = ftrace_hash_empty(direct_functions) ? 0 :
+ direct_functions->count + 1;
+ int bits;
+
+ if (size < 32)
+ size = 32;
+
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ /* Don't allocate too much */
+ if (bits > FTRACE_HASH_MAX_BITS)
+ bits = FTRACE_HASH_MAX_BITS;
+
+ new_hash = alloc_ftrace_hash(bits);
+ if (!new_hash)
+ goto out_unlock;
+
+ transfer_hash(new_hash, direct_functions);
+ free_ftrace_hash(direct_functions);
+ direct_functions = new_hash;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out_unlock;
+
+ entry->ip = ip;
+ entry->direct = addr;
+ __add_hash_entry(direct_functions, entry);
+
+ ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
+ if (ret)
+ remove_hash_entry(direct_functions, entry);
+
+ if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED))
+ ret = register_ftrace_function(&direct_ops);
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
+
+int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
+{
+ struct ftrace_func_entry *entry;
+ struct dyn_ftrace *rec;
+ int ret = -ENODEV;
+
+ mutex_lock(&direct_mutex);
+
+
+ entry = __ftrace_lookup_ip(direct_functions, ip);
+ if (!entry) {
+ /* OK if it is off by a little */
+ rec = lookup_rec(ip, ip);
+ if (!rec || rec->ip == ip)
+ goto out_unlock;
+
+ entry = __ftrace_lookup_ip(direct_functions, rec->ip);
+ if (!entry) {
+ WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+ goto out_unlock;
+ }
+
+ WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
+ }
+
+ remove_hash_entry(direct_functions, entry);
+
+ if (!direct_functions->count)
+ unregister_ftrace_function(&direct_ops);
+
+ ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
+ WARN_ON(ret);
+
+ out_unlock:
+ mutex_unlock(&direct_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
+
+
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -2370,6 +2542,10 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;

+ if ((rec->flags & FTRACE_FL_DIRECT) &&
+ (ftrace_rec_count(rec) == 1))
+ return find_rec_direct(rec->ip);
+
/* Trampolines take precedence over regs */
if (rec->flags & FTRACE_FL_TRAMP) {
ops = ftrace_find_tramp_ops_new(rec);
@@ -3523,10 +3699,17 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_ENABLED) {
struct ftrace_ops *ops;

- seq_printf(m, " (%ld)%s%s",
+ seq_printf(m, " (%ld)%s%s%s",
ftrace_rec_count(rec),
rec->flags & FTRACE_FL_REGS ? " R" : " ",
- rec->flags & FTRACE_FL_IP ? " I" : " ");
+ rec->flags & FTRACE_FL_IP ? " I" : " ",
+ rec->flags & FTRACE_FL_DIRECT ? " D": " ");
+ if (rec->flags & FTRACE_FL_DIRECT) {
+ unsigned long direct;
+ direct = find_rec_direct(rec->ip);
+ if (direct != FTRACE_REGS_ADDR)
+ seq_printf(m, " -->%pS\n", (void *)direct);
+ }
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
if (ops) {
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index e87d9decb994..f01f54f45fdc 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -19,8 +19,6 @@ int x;

static void my_do_spin_lock(raw_spinlock_t *lock)
{
- int ret;
-
// trace_printk("takeing %p\n", lock);
do_raw_spin_lock(lock);
/* Force not having a tail call */
@@ -29,6 +27,28 @@ static void my_do_spin_lock(raw_spinlock_t *lock)
x++;
}

+void my_direct_func(raw_spinlock_t *lock)
+{
+ trace_printk("taking %p\n", lock);
+}
+
+extern void my_tramp(void *);
+
+asm (
+" .pushsection .text, \"ax\", @progbits\n"
+" my_tramp:"
+#if 1
+" pushq %rbp\n"
+" movq %rsp, %rbp\n"
+" pushq %rdi\n"
+" call my_direct_func\n"
+" popq %rdi\n"
+" leave\n"
+#endif
+" ret\n"
+" .popsection\n"
+);
+
static void my_hijack_func(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct pt_regs *regs)
{
@@ -152,6 +172,9 @@ static int __init trace_event_init(void)
{
int ret;

+ ret = register_ftrace_direct((unsigned long)do_raw_spin_lock,
+ (unsigned long)my_tramp);
+ return 0;
ret = ftrace_set_filter_ip(&my_ops, (unsigned long)do_raw_spin_lock, 0, 0);
if (!ret)
register_ftrace_function(&my_ops);
@@ -166,6 +189,9 @@ static int __init trace_event_init(void)

static void __exit trace_event_exit(void)
{
+ unregister_ftrace_direct((unsigned long)do_raw_spin_lock,
+ (unsigned long)my_tramp);
+ return;
unregister_ftrace_function(&my_ops);
return;
kthread_stop(simple_tsk);

2019-10-24 11:50:14

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, 23 Oct 2019 12:23:06 -0400
Steven Rostedt <[email protected]> wrote:

> All you need to do is:
>
> register_ftrace_direct((unsigned long)func_you_want_to_trace,
> (unsigned long)your_trampoline);
>


>
> Alexei,
>
> Would this work for you?

I just pushed a test branch up to:

git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git

branch: ftrace/direct

Again, it's mostly development code, and may be buggy. (don't try
tracing when a direct function is added yet).

If this is something that you can use, then I'll work to clean it up
and sort out all the bugs.

Thanks!

-- Steve

2019-10-24 16:13:36

by Alexei Starovoitov

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86/ftrace: Use text_poke()

On Wed, Oct 23, 2019 at 04:08:52PM -0400, Steven Rostedt wrote:
> >
> > It seems to me that it's a bit of overkill to add new config knob
> > for every ftrace feature.
> > HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS (that arch defined) would
> > be enough to check and return error in register_ftrace_direct()
> > right?
>
> IIRC, we started doing this because it allows the dependencies to be
> defined in the kernel/trace directory. That is, if
> CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS is set, then we know that
> direct calls *and* DYNAMIC_FTRACE are enabled. It cuts down on some of
> the more complex #if or the arch needing to do
>
> select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if DYNAMIC_FTRACE
>
> It may be overkill, but it does keep the pain in one place.

Ok.
could you please add
static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
{
return -ENOTSUPP;
}
when appropriate configs are not enabled?
The current approach of:
#define register_ftrace_function(ops) ({ 0; })
doesn't look right, since users are being misled into thinking it's a success.
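
[ In other words, something along these lines in include/linux/ftrace.h --
  a sketch of what is being asked for here, not an actual patch: ]

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
int register_ftrace_direct(unsigned long ip, unsigned long addr);
int unregister_ftrace_direct(unsigned long ip, unsigned long addr);
#else
static inline int register_ftrace_direct(unsigned long ip, unsigned long addr)
{
	return -ENOTSUPP;
}
static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
{
	return -ENOTSUPP;
}
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */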

> > > +struct ftrace_ops direct_ops = {
> > > + .func = call_direct_funcs,
> > > + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
> > > +#if 1
> > > + | FTRACE_OPS_FL_DIRECT
> > > +#endif
> > > +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY
> > > + | FTRACE_OPS_FL_SAVE_REGS
> > > +#endif
> >
> > With FL_DIRECT the CONFIG_DYNAMIC_FTRACE_WITH_IPMODIFY_ONLY won't be needed, right ?
> > At least not for bpf use case.
> > Do you see livepatching using it or switching to FL_DIRECT too?
>
> Correct. I talked with Josh on IRC and we are looking into removing the
> pushf/popf from the ftrace_regs_caller to see if that helps in the
> performance for live patching. I'm also currently working on making
> this patch not on top of the IP modify one, so the IP modify doesn't
> need to be applied.
>
> This also cleans up the asm code a bit more (getting rid of the macro).

awesome.

> >
> > I have one more question/request.
> > Looks like ftrace can be turned off with sysctl.
> > Which means that a person or a script can accidentally turn it off
> > and all existing kprobe+bpf stuff that is ftrace based will
> > be silently switched off.
>
> See http://lkml.kernel.org/r/[email protected]
>
> I can (and should) add the PERMANENT flag to the direct_ops.
>
> Note, the PERMANENT patches will be added before this one.

Ahh. Perfect. That works.
I was wondering whether livepatching should care...
clearly they do :)

> > Fast forward a year and imagine few host critical bpf progs
> > are running in production and relying on FL_DIRECT.
> > Now somebody decided to test new ftrace feature and
> > it hit one of FTRACE_WARN_ON().
> > That will shutdown the whole ftrace and bpf progs
> > will stop executing. I'd like to avoid that.
> > May be I misread the code?
>
> It basically freezes it. Current registered ftrace_ops will not be
> touched. But nothing can be removed or added.

got it. that makes sense.

>
> OK, I'll work to get this patch in for the next merge window.

As soon as you do the first round of cleanups please ping me with the link
to your latest branch. I'll start building on top of it in the meantime.
Thanks!