Move ftrace over to using the generic x86 text_poke functions; this
avoids having a second/different copy of that code around.
This also avoids ftrace violating the (new) W^X rule and avoids
fragmenting the kernel text page-tables, due to no longer having to
toggle them RW.
Cc: Steven Rostedt <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/ftrace.h | 2
arch/x86/kernel/alternative.c | 4
arch/x86/kernel/ftrace.c | 630 ++++++------------------------------------
arch/x86/kernel/traps.c | 9
4 files changed, 93 insertions(+), 552 deletions(-)
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -35,8 +35,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};
-int ftrace_int3_handler(struct pt_regs *regs);
-
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif /* CONFIG_DYNAMIC_FTRACE */
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -941,7 +941,7 @@ static struct bp_patching_desc {
int nr_entries;
} bp_patching;
-static int patch_cmp(const void *key, const void *elt)
+static int notrace patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
@@ -953,7 +953,7 @@ static int patch_cmp(const void *key, co
}
NOKPROBE_SYMBOL(patch_cmp);
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
void *ip;
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -43,16 +43,12 @@ int ftrace_arch_code_modify_prepare(void
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
- set_kernel_text_rw();
- set_all_modules_text_rw();
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
- set_all_modules_text_ro();
- set_kernel_text_ro();
mutex_unlock(&text_mutex);
return 0;
}
@@ -60,67 +56,34 @@ int ftrace_arch_code_modify_post_process
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- unsigned char op;
+ char op;
int offset;
} __attribute__((packed));
};
-static int ftrace_calc_offset(long ip, long addr)
-{
- return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_text_replace(char op, unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
- calc.op = op;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+ calc.op = op;
+ calc.offset = (int)(addr - (ip + MCOUNT_INSN_SIZE));
return calc.code;
}
-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
- return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
-}
-
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_nop_replace(void)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only, so we use
- * the kernel identity mapping instead of the kernel text mapping
- * to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return ip;
+ return ideal_nops[NOP_ATOMIC5];
}
-static const unsigned char *ftrace_nop_replace(void)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- return ideal_nops[NOP_ATOMIC5];
+ return ftrace_text_replace(CALL_INSN_OPCODE, ip, addr);
}
-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- ftrace_expected = old_code;
+ char cur_code[MCOUNT_INSN_SIZE];
/*
* Note:
@@ -129,31 +92,38 @@ ftrace_modify_code_direct(unsigned long
* Carefully read and modify the code with probe_kernel_*(), and make
* sure what we read is what we expected it to be before modifying it.
*/
-
/* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
return -EFAULT;
+ }
/* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ON(1);
return -EINVAL;
+ }
- ip = text_ip_addr(ip);
-
- /* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
- return -EPERM;
+ return 0;
+}
- sync_core();
+static int
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
+{
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
+ /* replace the text with the new text */
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
@@ -167,19 +137,20 @@ int ftrace_make_nop(struct module *mod,
* just modify the code directly.
*/
if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
-
- ftrace_expected = NULL;
+ return ftrace_modify_code_direct(ip, old, new);
- /* Normal cases use add_brk_on_nop */
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
@@ -189,43 +160,6 @@ int ftrace_make_call(struct dyn_ftrace *
}
/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -237,452 +171,84 @@ int ftrace_modify_call(struct dyn_ftrace
unsigned long addr)
{
WARN_ON(1);
- ftrace_expected = NULL;
return -EINVAL;
}
-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
- unsigned char old[MCOUNT_INSN_SIZE];
- int ret;
-
- memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
- ftrace_update_func = ip;
- /* Make sure the breakpoints see the ftrace_update_func update */
- smp_wmb();
-
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
- ret = ftrace_modify_code(ip, old, new);
-
- atomic_dec(&modifying_ftrace_code);
-
- return ret;
-}
-
int ftrace_update_ftrace_func(ftrace_func_t func)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char *new;
- int ret;
-
- ftrace_update_func_call = (unsigned long)func;
-
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
-
- /* Also update the regs callback function */
- if (!ret) {
- ip = (unsigned long)(&ftrace_regs_call);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- }
-
- return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
- if (ip == ftrace_update_func)
- return 1;
-
- return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
unsigned long ip;
+ const char *new;
- if (WARN_ON_ONCE(!regs))
- return 0;
-
- ip = regs->ip - INT3_INSN_SIZE;
-
- if (ftrace_location(ip)) {
- int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
- return 1;
- } else if (is_ftrace_caller(ip)) {
- if (!ftrace_update_func_call) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
- int3_emulate_call(regs, ftrace_update_func_call);
- return 1;
- }
-
- return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- ip = text_ip_addr(ip);
-
- if (probe_kernel_write((void *)ip, val, size))
- return -EPERM;
-
- return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- ftrace_expected = old;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ftrace_addr = ftrace_get_addr_curr(rec);
-
- ret = ftrace_test_record(rec, enable);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return 0;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = ftrace_get_addr_new(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = ftrace_get_addr_curr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- ftrace_expected = nop;
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
- /* skip breakpoint */
- ip++;
- new++;
- return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
- return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
- }
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
-static void do_sync_core(void *data)
-{
- sync_core();
-}
-
-static void run_sync(void)
-{
- int enable_irqs;
-
- /* No need to sync if there's only one CPU */
- if (num_online_cpus() == 1)
- return;
-
- enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disabled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
-}
-
void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ const char *new, *old;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
-
- report = "updating code";
- count = 0;
-
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
-
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
+
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return;
+ }
}
- run_sync();
-
- report = "removing breakpoints";
- count = 0;
-
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
-
- return;
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- remove_breakpoints:
- pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- /*
- * Breakpoints are handled only when this function is in
- * progress. The system could not work with them.
- */
- if (remove_breakpoint(rec))
- BUG();
+ text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
}
- run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- /*
- * The breakpoint is handled only when this function is in progress.
- * The system could not work if we could not remove it.
- */
- BUG_ON(ret);
- out:
- run_sync();
- return ret;
-
- fail_update:
- /* Also here the system could not work with the breakpoint */
- if (ftrace_write(ip, old_code, 1))
- BUG();
- goto out;
+ text_poke_finish();
}
void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void)
@@ -828,11 +394,7 @@ create_trampoline(struct ftrace_ops *ops
set_vm_flush_reset_perms(trampoline);
- /*
- * Module allocation needs to be completed by making the page
- * executable. The page is still writable, which is a security hazard,
- * but anyhow ftrace breaks W^X completely.
- */
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -859,11 +421,10 @@ static unsigned long calc_trampoline_cal
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
- unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret, npages;
+ const char *new;
if (ops->trampoline) {
/*
@@ -872,14 +433,11 @@ void arch_ftrace_update_trampoline(struc
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
- set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
- npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -887,15 +445,11 @@ void arch_ftrace_update_trampoline(struc
func = ftrace_ops_get_func(ops);
- ftrace_update_func_call = (unsigned long)func;
-
+ mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- set_memory_ro(ops->trampoline, npages);
-
- /* The update should never fail */
- WARN_ON(ret);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
/* Return the address of the function the trampoline calls */
@@ -981,19 +535,18 @@ void arch_ftrace_trampoline_free(struct
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- return ftrace_text_replace(0xe9, ip, addr);
+ return ftrace_text_replace(JMP32_INSN_OPCODE, ip, addr);
}
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
- unsigned char *new;
+ const char *new;
- ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
-
- return update_ftrace_func(ip, new);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
@@ -1019,10 +572,9 @@ int ftrace_disable_ftrace_graph_caller(v
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
int faulted;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
/*
* When resuming from suspend-to-ram, this function can be indirectly
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -568,15 +568,6 @@ NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;
On Mon, 07 Oct 2019 10:17:21 +0200
Peter Zijlstra <[email protected]> wrote:
> Move ftrace over to using the generic x86 text_poke functions; this
> avoids having a second/different copy of that code around.
>
> This also avoids ftrace violating the (new) W^X rule and avoids
> fragmenting the kernel text page-tables, due to no longer having to
> toggle them RW.
>
> Cc: Steven Rostedt <[email protected]>
> Cc: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
BTW, I'd really like to take this patch series through my tree. That
way I can really hammer it, as well as I have code that will be built
on top of it.
I'll review the other series in this thread, but I'm assuming they
don't rely on this series? Or do they?
-- Steve
On Tue, Oct 08, 2019 at 10:43:35AM -0400, Steven Rostedt wrote:
> BTW, I'd really like to take this patch series through my tree. That
> way I can really hammer it, as well as I have code that will be built
> on top of it.
Works for me; or we can have a topic branch in tip we both can use.
Ingo?
> I'll review the other series in this thread, but I'm assuming they
> don't rely on this series? Or do they?
Indeed, this series stands on its own. The rest depends on this.
On Tue, 8 Oct 2019 19:11:37 +0200
Peter Zijlstra <[email protected]> wrote:
> On Tue, Oct 08, 2019 at 10:43:35AM -0400, Steven Rostedt wrote:
>
> > BTW, I'd really like to take this patch series through my tree. That
> > way I can really hammer it, as well as I have code that will be built
> > on top of it.
>
> Works for me; or we can have a topic branch in tip we both can use.
> Ingo?
In case you want to use this branch, I just pushed:
git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
ftrace/text-poke
That is these 6 patches applied to v5.4-rc2
I'm going to start running them through my tests. I can work on these
directly. And if Ingo wants to pull this into tip, then we can do that,
and apply the other patches on top of them.
Ingo, would that work for you?
-- Steve
>
> > I'll review the other series in this thread, but I'm assuming they
> > don't rely on this series? Or do they?
>
> Indeed, this series stands on its own. The rest depends on this.
On Tue, 8 Oct 2019 10:43:35 -0400
Steven Rostedt <[email protected]> wrote:
> BTW, I'd really like to take this patch series through my tree. That
> way I can really hammer it, as well as I have code that will be built
> on top of it.
I did a bit of hammering and found two bugs. One I sent a patch to fix
(adding a module when tracing is enabled), but the other bug I
triggered, I'm too tired to debug right now. But figured I'd mention it
anyway.
If you add on the kernel command line:
ftrace=function ftrace_filter=schedule
You will get this (note, I had KASAN enabled, so it showed more info):
[ 1.274356] ftrace: allocating 34274 entries in 134 pages
[ 1.320059] Starting tracer 'function'
[ 1.323724] ==================================================================
[ 1.330798] BUG: KASAN: null-ptr-deref in __get_locked_pte+0x21/0x210
[ 1.337186] Read of size 8 at addr 0000000000000050 by task swapper/0
[ 1.343579]
[ 1.345049] CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.0-rc2-test+ #50
[ 1.351613] Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
[ 1.360510] Call Trace:
[ 1.362932] dump_stack+0x7c/0xc0
[ 1.366213] ? __get_locked_pte+0x21/0x210
[ 1.370272] ? __get_locked_pte+0x21/0x210
[ 1.374331] __kasan_report.cold.10+0x5/0x3e
[ 1.378566] ? __get_locked_pte+0x21/0x210
[ 1.382625] kasan_report+0xe/0x20
[ 1.385993] __get_locked_pte+0x21/0x210
[ 1.389880] ? 0xffffffffa000005c
[ 1.393162] __text_poke+0x1ca/0x5b0
[ 1.396705] ? optimize_nops.isra.8+0xd0/0xd0
[ 1.401023] ? insn_get_length+0x4f/0x70
[ 1.404910] ? text_poke_loc_init+0x186/0x220
[ 1.409229] ? text_poke_kgdb+0x10/0x10
[ 1.413031] ? 0xffffffffa0000000
[ 1.416312] text_poke_bp_batch+0xb4/0x1e0
[ 1.420372] ? __text_poke+0x5b0/0x5b0
[ 1.424088] ? do_raw_spin_lock+0x113/0x1d0
[ 1.428233] ? 0xffffffffa000005c
[ 1.431515] ? 0xffffffffa0000000
[ 1.434797] text_poke_bp+0x7a/0xa0
[ 1.438253] ? text_poke_queue+0xb0/0xb0
[ 1.442139] ? 0xffffffffa0000000
[ 1.445421] ? 0xffffffffa000005c
[ 1.448706] ? find_vmap_area+0x56/0x80
[ 1.452505] arch_ftrace_update_trampoline+0x114/0x380
[ 1.457603] ? ftrace_caller+0x4e/0x4e
[ 1.461316] ? 0xffffffffa0000091
[ 1.464599] ? arch_ftrace_update_code+0x10/0x10
[ 1.469179] ? mutex_lock+0x86/0xd0
[ 1.472634] ? __mutex_lock_slowpath+0x10/0x10
[ 1.477039] ? mutex_unlock+0x1d/0x40
[ 1.480668] ? arch_jump_label_transform_queue+0x7a/0x90
[ 1.485937] ? __jump_label_update+0x91/0x140
[ 1.490256] ? mutex_unlock+0x1d/0x40
[ 1.493885] ? tracing_start_sched_switch.cold.0+0x60/0x60
[ 1.499327] __register_ftrace_function+0xaf/0xf0
[ 1.503991] ftrace_startup+0x24/0x130
[ 1.507706] register_ftrace_function+0x2d/0x80
[ 1.512198] function_trace_init+0xc1/0x100
[ 1.516346] tracing_set_tracer+0x1fb/0x3c0
[ 1.520492] ? free_snapshot+0x50/0x50
[ 1.524205] ? __kasan_kmalloc.constprop.6+0xc1/0xd0
[ 1.529131] ? __list_add_valid+0x29/0xa0
[ 1.533106] register_tracer+0x235/0x26c
[ 1.536993] early_trace_init+0x29b/0x3a0
[ 1.540967] start_kernel+0x2a3/0x5f7
[ 1.544595] ? mem_encrypt_init+0x6/0x6
[ 1.548396] ? load_ucode_intel_bsp+0x5e/0xa3
[ 1.552715] ? init_intel_microcode+0xc3/0xc3
[ 1.557036] ? load_ucode_bsp+0xcc/0x154
[ 1.560923] secondary_startup_64+0xa4/0xb0
[ 1.565070] ==================================================================
-- Steve
On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> On Tue, 8 Oct 2019 10:43:35 -0400
> Steven Rostedt <[email protected]> wrote:
>
>
> > BTW, I'd really like to take this patch series through my tree. That
> > way I can really hammer it, as well as I have code that will be built
> > on top of it.
>
> I did a bit of hammering and found two bugs. One I sent a patch to fix
> (adding a module when tracing is enabled), but the other bug I
> triggered, I'm too tired to debug right now. But figured I'd mention it
> anyway.
I'm thinking this should fix it... Just not sure this is the right place,
then again, we're doing the same thing in jump_label and static_call, so
perhaps we should do it like this.
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
On Thu, 10 Oct 2019 11:20:54 +0200
Peter Zijlstra <[email protected]> wrote:
> On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> > On Tue, 8 Oct 2019 10:43:35 -0400
> > Steven Rostedt <[email protected]> wrote:
> >
> >
> > > BTW, I'd really like to take this patch series through my tree. That
> > > way I can really hammer it, as well as I have code that will be built
> > > on top of it.
> >
> > I did a bit of hammering and found two bugs. One I sent a patch to fix
> > (adding a module when tracing is enabled), but the other bug I
> > triggered, I'm too tired to debug right now. But figured I'd mention it
> > anyway.
>
> I'm thinking this should fix it... Just not sure this is the right place,
> then again, we're doing the same thing in jump_label and static_call, so
> perhaps we should do it like this.
>
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
> * dynamically allocated memory. This function should be used when it is
> * not possible to allocate memory.
> */
> -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> {
> struct text_poke_loc tp;
>
> + if (unlikely(system_state == SYSTEM_BOOTING)) {
> + text_poke_early(addr, opcode, len);
> + return;
> + }
We need a new system state. SYSTEM_UP? (Argh, that name is confusing,
SYSTEM_BOOTING_SMP?) Or perhaps just test num_online_cpus()?
if (unlikely(system_state == SYSTEM_BOOTING &&
num_online_cpus() == 1))
?
Because we can't do the above once we have more than one CPU running.
-- Steve
> +
> text_poke_loc_init(&tp, addr, opcode, len, emulate);
> text_poke_bp_batch(&tp, 1);
> }
On Thu, Oct 10, 2019 at 09:19:56AM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 11:20:54 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Wed, Oct 09, 2019 at 10:41:35PM -0400, Steven Rostedt wrote:
> > > On Tue, 8 Oct 2019 10:43:35 -0400
> > > Steven Rostedt <[email protected]> wrote:
> > >
> > >
> > > > BTW, I'd really like to take this patch series through my tree. That
> > > > way I can really hammer it, as well as I have code that will be built
> > > > on top of it.
> > >
> > > I did a bit of hammering and found two bugs. One I sent a patch to fix
> > > (adding a module when tracing is enabled), but the other bug I
> > > triggered, I'm too tired to debug right now. But figured I'd mention it
> > > anyway.
> >
> > I'm thinking this should fix it... Just not sure this is the right place,
> > then again, we're doing the same thing in jump_label and static_call, so
> > perhaps we should do it like this.
> >
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> > @@ -1230,10 +1230,15 @@ void text_poke_queue(void *addr, const v
> > * dynamically allocated memory. This function should be used when it is
> > * not possible to allocate memory.
> > */
> > -void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> > +void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
> > {
> > struct text_poke_loc tp;
> >
> > + if (unlikely(system_state == SYSTEM_BOOTING)) {
> > + text_poke_early(addr, opcode, len);
> > + return;
> > + }
>
> We need a new system state. SYSTEM_UP? (Argh, that name is confusing,
> SYSTEM_BOOTING_SMP?) Or perhaps just test num_online_cpus()?
>
> if (unlikely(system_state == SYSTEM_BOOTING &&
> num_online_cpus() == 1))
>
> ?
>
> Because we can't do the above once we have more than one CPU running.
We lose BOOTING _long_ before we gain SMP.
On Thu, 10 Oct 2019 16:05:13 +0200
Peter Zijlstra <[email protected]> wrote:
> > Because we can't do the above once we have more than one CPU running.
>
> We lose BOOTING _long_ before we gain SMP.
Ah, yep. But I finally got it working with the following patch:
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 95beb85aef65..d7037d038005 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,7 +25,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
*/
#define POKE_MAX_OPCODE_SIZE 5
-extern void text_poke_early(void *addr, const void *opcode, size_t len);
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fa5dfde9b09a..2ee644a20f46 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -267,7 +267,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-void text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
/*
* Are we looking at a near JMP with a 1 or 4-byte displacement.
@@ -756,8 +756,8 @@ void __init alternative_instructions(void)
* instructions. And on the local CPU you need to be protected against NMI or
* MCE handlers seeing an inconsistent instruction while you patch.
*/
-void __init_or_module text_poke_early(void *addr, const void *opcode,
- size_t len)
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
{
unsigned long flags;
@@ -780,6 +780,7 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
* that causes hangs on some VIA CPUs.
*/
}
+ return NULL;
}
__ro_after_init struct mm_struct *poking_mm;
@@ -1058,10 +1059,14 @@ static int tp_vec_nr;
*/
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
+ void *(*poke)(void *addr, const void *opcode, size_t len) = text_poke;
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
+ if (system_state == SYSTEM_BOOTING)
+ poke = text_poke_early;
+
lockdep_assert_held(&text_mutex);
bp_patching.vec = tp;
@@ -1077,7 +1082,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, &int3, sizeof(int3));
+ poke(tp[i].addr, &int3, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
@@ -1086,8 +1091,8 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
if (tp[i].len - sizeof(int3) > 0) {
- text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].text + sizeof(int3),
+ poke((char *)tp[i].addr + sizeof(int3),
+ (const char *)tp[i].text + sizeof(int3),
tp[i].len - sizeof(int3));
do_sync++;
}
@@ -1110,7 +1115,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
if (tp[i].text[0] == INT3_INSN_OPCODE)
continue;
- text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+ poke(tp[i].addr, tp[i].text, sizeof(int3));
do_sync++;
}
@@ -1234,6 +1239,10 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulat
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index f2e59d858ca9..2dd462f86d1f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -308,7 +308,8 @@ union ftrace_op_code_union {
#define RET_SIZE 1
static unsigned long
-create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size,
+ unsigned int *pages)
{
unsigned long start_offset;
unsigned long end_offset;
@@ -394,8 +395,11 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
set_vm_flush_reset_perms(trampoline);
- set_memory_ro((unsigned long)trampoline, npages);
+ /* We can't use text_poke_bp() at start up */
+ if (system_state != SYSTEM_BOOTING)
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
+ *pages = npages;
return (unsigned long)trampoline;
fail:
tramp_free(trampoline);
@@ -423,7 +427,9 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
ftrace_func_t func;
unsigned long offset;
unsigned long ip;
+ unsigned int pages;
unsigned int size;
+ bool set_ro = false;
const char *new;
if (ops->trampoline) {
@@ -434,7 +440,9 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
} else {
- ops->trampoline = create_trampoline(ops, &size);
+ if (system_state == SYSTEM_BOOTING)
+ set_ro = true;
+ ops->trampoline = create_trampoline(ops, &size, &pages);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
@@ -450,6 +458,8 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
new = ftrace_call_replace(ip, (unsigned long)func);
text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
mutex_unlock(&text_mutex);
+ if (set_ro)
+ set_memory_ro((unsigned long)ops->trampoline, pages);
}
/* Return the address of the function the trampoline calls */
Is it really important to use text_poke() on text that is coming live?
That is, I really hate the above "set_ro" hack. This is because you
moved the ro setting to create_trampoline() and then force the
text_poke() on text that has just been created. I prefer to just modify
it and then set it to ro before it gets executed. Otherwise we need
to do all these dances.
The same goes for the module code. My patch was making text read-write
that is not to be executed yet. Really, what's wrong with
that?
-- Steve
On Thu, Oct 10, 2019 at 11:54:49AM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 16:05:13 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > > Because we can't do the above once we have more than one CPU running.
> >
> > We lose BOOTING _long_ before we gain SMP.
>
> Ah, yep. But I finally got it working with the following patch:
That looks like it can be done simpler; but my head is exploding, I'll
have to look at this in the AM.
> Is it really important to use text_poke() on text that is coming live?
What is really important is that we never have memory that is both
writable and executable (W^X).
Moving as much poking as possible to before marking it RO (or moving
the RO marking later, if that amounts to the same thing) is a sane approach.
But once it gains X, we must never again mark it W, without first
removing X.
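Concretely, the only lifecycle that never opens a W+X window looks
something like this (a minimal sketch using the existing set_memory_*()
helpers; buf, insns, size and npages are illustrative):

	/* fill the buffer while it is still writable and not executable */
	void *buf = module_alloc(size);		/* RW + NX */
	memcpy(buf, insns, size);		/* all poking happens here */

	/* seal it: drop W before (or while) gaining X */
	set_memory_ro((unsigned long)buf, npages);
	set_memory_x((unsigned long)buf, npages);

	/* to rewrite it later: drop X first, only then restore W */
	set_memory_nx((unsigned long)buf, npages);
	set_memory_rw((unsigned long)buf, npages);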
> That is, I really hate the above "set_ro" hack. This is because you
> moved the ro setting to create_trampoline() and then forcing the
> text_poke() on text that has just been created. I prefer to just modify
> it and then setting it to ro before it gets executed. Otherwise we need
> to do all these dances.
I thought create_trampoline() finished the whole thing; if it does not,
either make create_trampoline() do everything, or add a
finish_trampoline() callback to mark it complete.
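Such a finish_trampoline() could be as small as this (hypothetical
sketch, the helper does not exist; it only reuses the set_memory_*()
calls create_trampoline() already makes):

	/* hypothetical: seal a trampoline once all poking is done */
	static void finish_trampoline(struct ftrace_ops *ops)
	{
		unsigned int npages;

		npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
		set_memory_ro(ops->trampoline, npages);
		set_memory_x(ops->trampoline, npages);
	}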
> The same is with the module code. My patch was turning text to
> read-write that is not to be executed yet. Really, what's wrong with
> that?
The fact that it is executable; also the fact that you do it right after
we mark it ro. Flipping the memory protections back and forth is just
really poor everything.
Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
executable (kernel) memory writable.
Really the best solution is to move all the poking into
ftrace_module_init(), before we mark it RO+X. That's what I'm going to
do for jump_label and static_call as well, I just need to add that extra
notifier callback.
On Thu, 10 Oct 2019 19:28:19 +0200
Peter Zijlstra <[email protected]> wrote:
> > That is, I really hate the above "set_ro" hack. This is because you
> > moved the ro setting to create_trampoline() and then forcing the
> > text_poke() on text that has just been created. I prefer to just modify
> > it and then setting it to ro before it gets executed. Otherwise we need
> > to do all these dances.
>
> I thought create_trampoline() finished the whole thing; if it does not,
> either make create_trampoline() do everything, or add a
> finish_trampoline() callback to mark it complete.
I'm good with a finish_trampoline(). I can make a patch that does that.
>
> > The same is with the module code. My patch was turning text to
> > read-write that is not to be executed yet. Really, what's wrong with
> > that?
>
> The fact that it is executable; also the fact that you do it right after
> we mark it ro. Flipping the memory protections back and forth is just
> really poor everything.
Hmm, I wonder if we can find a way to make this work in the module
code as well, by moving around the places it sets 'x' and 'ro' :-/
>
> Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> executable (kernel) memory writable.
OK.
>
> Really the best solution is to move all the poking into
> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> do for jump_label and static_call as well, I just need to add that extra
> notifier callback.
I'll have to think about other ways to handle the other archs with the
all-or-nothing approach. Perhaps we can use my patch as an
arch-dependent solution; I'll try another approach.
-- Steve
On Thu, Oct 10, 2019 at 01:48:30PM -0400, Steven Rostedt wrote:
> On Thu, 10 Oct 2019 19:28:19 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > > That is, I really hate the above "set_ro" hack. This is because you
> > > moved the ro setting to create_trampoline() and then forcing the
> > > text_poke() on text that has just been created. I prefer to just modify
> > > it and then setting it to ro before it gets executed. Otherwise we need
> > > to do all these dances.
> >
> > I thought create_trampoline() finished the whole thing; if it does not,
> > either make create_trampoline() do everything, or add a
> > finish_trampoline() callback to mark it complete.
>
> I'm good with a finish_trampoline(). I can make a patch that does that.
I found it easier to just make create_trampoline do it all. The below
patch seems to cure both issues for me.
---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1213,6 +1213,11 @@ void text_poke_queue(void *addr, const v
{
struct text_poke_loc *tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
@@ -1230,10 +1235,15 @@ void text_poke_queue(void *addr, const v
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -34,6 +34,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int ftrace_poke_late = 0;
+
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
@@ -43,12 +45,15 @@ int ftrace_arch_code_modify_prepare(void
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
+ ftrace_poke_late = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
+ text_poke_finish();
+ ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
return 0;
}
@@ -116,7 +121,10 @@ ftrace_modify_code_direct(unsigned long
return ret;
/* replace the text with the new text */
- text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ if (ftrace_poke_late)
+ text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
@@ -308,11 +316,12 @@ union ftrace_op_code_union {
#define RET_SIZE 1
static unsigned long
-create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size, ftrace_func_t func)
{
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
+ unsigned long call_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -329,10 +338,12 @@ create_trampoline(struct ftrace_ops *ops
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
}
size = end_offset - start_offset;
@@ -389,6 +400,14 @@ create_trampoline(struct ftrace_ops *ops
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, func),
+ CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
@@ -426,23 +445,23 @@ void arch_ftrace_update_trampoline(struc
unsigned int size;
const char *new;
- if (ops->trampoline) {
- /*
- * The ftrace_ops caller may set up its own trampoline.
- * In such a case, this code must not modify it.
- */
- if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
- return;
- } else {
- ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline) {
+ ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ return;
}
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
-
func = ftrace_ops_get_func(ops);
mutex_lock(&text_mutex);
On Fri, Oct 11, 2019 at 12:45:52PM +0200, Peter Zijlstra wrote:
> + if (!ops->trampoline) {
> + ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));
And now that I look at what I sent, I see we already pass ops, so no
need to pass ftrace_ops_get_func().
Let me respin that.
On Fri, Oct 11, 2019 at 12:47:12PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2019 at 12:45:52PM +0200, Peter Zijlstra wrote:
> > + if (!ops->trampoline) {
> > + ops->trampoline = create_trampoline(ops, &size, ftrace_ops_get_func(ops));
>
> And now that I look at what I sent, I see we already pass ops, so no
> need to pass ftrace_ops_get_func().
>
> Let me respin that.
---
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1209,10 +1209,15 @@ void text_poke_finish(void)
text_poke_flush(NULL);
}
-void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc *tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
@@ -1230,10 +1235,15 @@ void text_poke_queue(void *addr, const v
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -34,6 +34,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int ftrace_poke_late = 0;
+
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
@@ -43,12 +45,15 @@ int ftrace_arch_code_modify_prepare(void
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
+ ftrace_poke_late = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
+ text_poke_finish();
+ ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
return 0;
}
@@ -116,7 +121,10 @@ ftrace_modify_code_direct(unsigned long
return ret;
/* replace the text with the new text */
- text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ if (ftrace_poke_late)
+ text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
@@ -313,6 +321,7 @@ create_trampoline(struct ftrace_ops *ops
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
+ unsigned long call_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -329,10 +338,12 @@ create_trampoline(struct ftrace_ops *ops
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
}
size = end_offset - start_offset;
@@ -389,6 +400,15 @@ create_trampoline(struct ftrace_ops *ops
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE,
+ trampoline + call_offset,
+ ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
@@ -426,23 +446,23 @@ void arch_ftrace_update_trampoline(struc
unsigned int size;
const char *new;
- if (ops->trampoline) {
- /*
- * The ftrace_ops caller may set up its own trampoline.
- * In such a case, this code must not modify it.
- */
- if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
- return;
- } else {
+ if (!ops->trampoline) {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ return;
}
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
-
func = ftrace_ops_get_func(ops);
mutex_lock(&text_mutex);
On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> Really the best solution is to move all the poking into
> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> do for jump_label and static_call as well, I just need to add that extra
> notifier callback.
OK, so I started writing that patch... or rather, I wrote the patch and
started on the Changelog when I ran into trouble describing why we need
it.
That is, I'm struggling to explain why we cannot flip
prepare_coming_module() and complete_formation().
Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
see if we can cure that.
On Fri, 11 Oct 2019 14:59:03 +0200
Peter Zijlstra <[email protected]> wrote:
> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>
> > Really the best solution is to move all the poking into
> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > do for jump_label and static_call as well, I just need to add that extra
> > notifier callback.
>
> OK, so I started writing that patch... or rather, I wrote the patch and
> started on the Changelog when I ran into trouble describing why we need
> it.
>
> That is, I'm struggling to explain why we cannot flip
> prepare_coming_module() and complete_formation().
>
> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> see if we can cure that.
For someone that doesn't use modules, you are making me very nervous
with all the changes you are making to the module code! ;-)
-- Steve
On Fri, Oct 11, 2019 at 09:33:19AM -0400, Steven Rostedt wrote:
> On Fri, 11 Oct 2019 14:59:03 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> >
> > > Really the best solution is to move all the poking into
> > > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > > do for jump_label and static_call as well, I just need to add that extra
> > > notifier callback.
> >
> > OK, so I started writing that patch... or rather, I wrote the patch and
> > started on the Changelog when I ran into trouble describing why we need
> > it.
> >
> > That is, I'm struggling to explain why we cannot flip
> > prepare_coming_module() and complete_formation().
> >
> > Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> > see if we can cure that.
>
> For someone that doesn't use modules, you are making me very nervous
> with all the changes you are making to the module code! ;-)
Hey, today I built a kernel with modules just for you :-)
And whatever it takes, right? I just want to clean this crap up.
+++ Peter Zijlstra [11/10/19 14:59 +0200]:
>On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>
>> Really the best solution is to move all the poking into
>> ftrace_module_init(), before we mark it RO+X. That's what I'm going to
>> do for jump_label and static_call as well, I just need to add that extra
>> notifier callback.
>
>OK, so I started writing that patch... or rather, I wrote the patch and
>started on the Changelog when I ran into trouble describing why we need
>it.
>
>That is, I'm struggling to explain why we cannot flip
>prepare_coming_module() and complete_formation().
>
>Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
>see if we can cure that.
I'm having trouble visualizing what changes you're planning on making.
I get that you want to squash ftrace_module_enable() into
ftrace_module_init(), before module_enable_ro(). I'm fine with that as
long as the races Steven described are addressed for affected arches.
And livepatch should be OK as long as klp_module_coming() is always
called *after* ftrace_module_enable(). But are you moving
klp_module_coming() before the module_enable_* calls as well? And if
so, why?
>The fact that it is executable; also the fact that you do it right after
>we mark it ro. Flipping the memory protections back and forth is just
>really poor everything.
>
>Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
>executable (kernel) memory writable.
Not sure if relevant, but just thought I'd clarify: IIRC,
klp_module_coming() is not poking the coming module, but it calls
module_enable_ro() on itself (the livepatch module) so it can apply
relocations and such on the new code, which lives inside the livepatch
module, and it needs to possibly do this numerous times over the
lifetime of the patch module for any coming module it is responsible
for patching (i.e., call module_enable_ro() on the patch module, not
necessarily the coming module). So I am not be sure why
klp_module_coming() should be moved before complete_formation(). I
hope I'm remembering the details correctly, livepatch folks feel free
to chime in if I'm incorrect here.
On Fri, 11 Oct 2019 14:59:03 +0200
Peter Zijlstra <[email protected]> wrote:
> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>
> > Really the best solution is to move all the poking into
> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > do for jump_label and static_call as well, I just need to add that extra
> > notifier callback.
>
> OK, so I started writing that patch... or rather, I wrote the patch and
> started on the Changelog when I ran into trouble describing why we need
> it.
>
> That is, I'm struggling to explain why we cannot flip
> prepare_coming_module() and complete_formation().
>
> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> see if we can cure that.
You are mainly worried about making text that is executable into
read-write again. What if we kept my one patch that just changed the
module in ftrace_module_enable() from read-only to read-write, but
before we ever set it executable?
Jessica, would this patch break anything?
It moves the setting of the module to executable to after calling both
ftrace_module_enable() and klp_module_coming().
This would make it possible to modify the module code and still keep to
the rule that no executable code becomes writable.
-- Steve
diff --git a/kernel/module.c b/kernel/module.c
index ff2d7359a418..6e2fd40a6ed9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_ro(mod, false);
module_enable_nx(mod);
- module_enable_x(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
@@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod)
if (err)
return err;
+ /* Make module executable after ftrace is enabled */
+ mutex_lock(&module_mutex);
+ module_enable_x(mod);
+ mutex_unlock(&module_mutex);
+
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
On Tue, Oct 15, 2019 at 09:28:02AM -0400, Steven Rostedt wrote:
> On Fri, 11 Oct 2019 14:59:03 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
> >
> > > Really the best solution is to move all the poking into
> > > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
> > > do for jump_label and static_call as well, I just need to add that extra
> > > notifier callback.
> >
> > OK, so I started writing that patch... or rather, I wrote the patch and
> > started on the Changelog when I ran into trouble describing why we need
> > it.
> >
> > That is, I'm struggling to explain why we cannot flip
> > prepare_coming_module() and complete_formation().
> >
> > Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
> > see if we can cure that.
>
> You are mainly worried about making text that is executable into
> read-write again. What if we kept my one patch that just changed the
> module in ftrace_module_enable() from read-only to read-write, but
> before we ever set it executable.
This still flips the protections back and forth, which is still really
ugly. And afaict the only reason this is required is that
set_all_modules_text_*() stuff.
So please, instead of tinkering around, let's just kill that horrible
interface and be done with it. There's only 2 users left, fixing those
can't be too hard.
On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
> I'm having trouble visualizing what changes you're planning on making.
I want all the text poking (jump_label, ftrace, klp whatever) to happen
_before_ we do the protection changes.
I also want to avoid flipping the protection state around unnecessarily,
because that clearly is just wasted work.
> I get that you want to squash ftrace_module_enable() into
> ftrace_module_init(), before module_enable_ro(). I'm fine with that as
> long as the races Steven described are addressed for affected arches.
Right, the problem is set_all_modules_text_*(), that relies on COMING
having made the protection changes.
After the x86 changes, there's only 2 more architectures that use that
function. I'll work on getting those converted and then we can delete
that function and worry no more about it.
> And livepatch should be OK as long as klp_module_coming() is always
> called *after* ftrace_module_enable(). But are you moving
> klp_module_coming() before the module_enable_* calls as well? And if
> so, why?
I wanted to move the COMING notifier callback before the protection
changes, because that is the easiest way to convert jump_label and
static_call and AFAICT nothing really cared aside from ftrace.
The alternative is introducing additional module states, which just adds
complexity we don't really need if we can just flip things around a
little.
> > The fact that it is executable; also the fact that you do it right after
> > we mark it ro. Flipping the memory protections back and forth is just
> > really poor everything.
> >
> > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > executable (kernel) memory writable.
>
> Not sure if relevant, but just thought I'd clarify: IIRC,
> klp_module_coming() is not poking the coming module, but it calls
> module_enable_ro() on itself (the livepatch module) so it can apply
> relocations and such on the new code, which lives inside the livepatch
> module, and it needs to possibly do this numerous times over the
> lifetime of the patch module for any coming module it is responsible
> for patching (i.e., call module_enable_ro() on the patch module, not
> necessarily the coming module). So I am not be sure why
> klp_module_coming() should be moved before complete_formation(). I
> hope I'm remembering the details correctly, livepatch folks feel free
> to chime in if I'm incorrect here.
You mean it does module_disable_ro() ? That would be broken and it needs
to be fixed. Can some livepatch person explain what it does and why?
On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
> > > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > > executable (kernel) memory writable.
> >
> > Not sure if relevant, but just thought I'd clarify: IIRC,
> > klp_module_coming() is not poking the coming module, but it calls
> > module_enable_ro() on itself (the livepatch module) so it can apply
> > relocations and such on the new code, which lives inside the livepatch
> > module, and it needs to possibly do this numerous times over the
> > lifetime of the patch module for any coming module it is responsible
> > for patching (i.e., call module_enable_ro() on the patch module, not
> > necessarily the coming module). So I am not be sure why
> > klp_module_coming() should be moved before complete_formation(). I
> > hope I'm remembering the details correctly, livepatch folks feel free
> > to chime in if I'm incorrect here.
>
> You mean it does module_disable_ro() ? That would be broken and it needs
> to be fixed. Can some livepatch person explain what it does and why?
mbenes confirmed; what would be needed is for the live-patch module to
have all module-dependent parts in their own sections, and to have those
sections page-aligned. Then we can do the protection on sections
instead of on the whole module.
Damn, and I thought I was so close to getting W^X sorted :/
[ live-patching ML CCed ]
On Tue, 15 Oct 2019, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
>
> > > The fact that it is executable; also the fact that you do it right after
> > > we mark it ro. Flipping the memory protections back and forth is just
> > > really poor everything.
> > >
> > > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
> > > executable (kernel) memory writable.
> >
> > Not sure if relevant, but just thought I'd clarify: IIRC,
> > klp_module_coming() is not poking the coming module, but it calls
> > module_enable_ro() on itself (the livepatch module) so it can apply
> > relocations and such on the new code, which lives inside the livepatch
> > module, and it needs to possibly do this numerous times over the
> > lifetime of the patch module for any coming module it is responsible
> > for patching (i.e., call module_enable_ro() on the patch module, not
> > necessarily the coming module). So I am not be sure why
> > klp_module_coming() should be moved before complete_formation(). I
> > hope I'm remembering the details correctly, livepatch folks feel free
> > to chime in if I'm incorrect here.
>
> You mean it does module_disable_ro() ? That would be broken and it needs
> to be fixed. Can some livepatch person explain what it does and why?
Yes, it does. klp_module_coming() calls module_disable_ro() on all
patching modules which patch the coming module in order to call
apply_relocate_add(). New (patching) code for a module can be relocated
only when the relevant module is loaded.
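For illustration, the path in question looks roughly like this (a
simplified sketch of klp_init_object_loaded(), called from
klp_module_coming(); error handling elided):

	/* kernel/livepatch/core.c, heavily simplified: */
	module_disable_ro(patch->mod);		/* patch module text becomes RW */
	ret = klp_write_object_relocations(patch->mod, obj);
	if (!ret)
		arch_klp_init_object_loaded(patch, obj);
	module_enable_ro(patch->mod, true);	/* ... and RO again */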
Miroslav
On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> Right, the problem is set_all_modules_text_*(), that relies on COMING
> having made the protection changes.
>
> After the x86 changes, there's only 2 more architectures that use that
> function. I'll work on getting those converted and then we can delete
> that function and worry no more about it.
Here's a possible patch for arch/arm, which only leaves arch/nds32/. Instead
of flipping kernel/module text to RW, __patch_text_real() can write the
instruction through a temporary mapping when remapping is requested.
---
arch/arm/kernel/ftrace.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
index bda949fd84e8..c87e68e4ccf7 100644
--- a/arch/arm/kernel/ftrace.c
+++ b/arch/arm/kernel/ftrace.c
@@ -22,6 +22,7 @@
#include <asm/ftrace.h>
#include <asm/insn.h>
#include <asm/set_memory.h>
+#include <asm/patch.h>
#ifdef CONFIG_THUMB2_KERNEL
#define NOP 0xf85deb04 /* pop.w {lr} */
@@ -31,13 +32,15 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int patch_text_remap = 0;
+
static int __ftrace_modify_code(void *data)
{
int *command = data;
- set_kernel_text_rw();
+ patch_text_remap = 1;
ftrace_modify_all_code(*command);
- set_kernel_text_ro();
+ patch_text_remap = 0;
return 0;
}
@@ -59,13 +62,13 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr)
int ftrace_arch_code_modify_prepare(void)
{
- set_all_modules_text_rw();
+ patch_text_remap = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- set_all_modules_text_ro();
+ patch_text_remap = 0;
/* Make sure any TLB misses during machine stop are cleared. */
flush_tlb_all();
return 0;
@@ -97,10 +100,7 @@ static int ftrace_modify_code(unsigned long pc, unsigned long old,
return -EINVAL;
}
- if (probe_kernel_write((void *)pc, &new, MCOUNT_INSN_SIZE))
- return -EPERM;
-
- flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+ __patch_text_real((void *)pc, new, patch_text_remap);
return 0;
}
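(For reference, the helper used above is the generic arm patching primitive;
from memory its signature is:

	/* arch/arm/include/asm/patch.h */
	void __patch_text_real(void *addr, unsigned int insn, bool remap);

where remap selects writing through a temporary mapping instead of the
kernel text mapping.)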
On 10/15/19 10:13 AM, Miroslav Benes wrote:
> Yes, it does. klp_module_coming() calls module_disable_ro() on all
> patching modules which patch the coming module in order to call
> apply_relocate_add(). New (patching) code for a module can be relocated
> only when the relevant module is loaded.
FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's
where livepatches only patch a single object and updates are kept on
disk to handle coming module updates as they are loaded) eliminate those
outstanding relocations and the need to perform this late permission
flipping?
-- Joe
+++ Joe Lawrence [15/10/19 11:06 -0400]:
>On 10/15/19 10:13 AM, Miroslav Benes wrote:
>>Yes, it does. klp_module_coming() calls module_disable_ro() on all
>>patching modules which patch the coming module in order to call
>>apply_relocate_add(). New (patching) code for a module can be relocated
>>only when the relevant module is loaded.
>
>FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @
>plumber's where livepatches only patch a single object and updates are
>kept on disk to handle coming module updates as they are loaded)
>eliminate those outstanding relocations and the need to perform this
>late permission flipping?
I wasn't at Plumbers sadly; was this idea documented/talked about in
detail somewhere? (recording, slides, etherpad, etc?). What do you
mean by updates are kept on disk? Maybe someone can explain it more
in detail? :)
+++ Peter Zijlstra [15/10/19 15:56 +0200]:
>On Tue, Oct 15, 2019 at 03:07:40PM +0200, Jessica Yu wrote:
>> I'm having trouble visualizing what changes you're planning on making.
>
>I want all the text poking (jump_label, ftrace, klp whatever) to happen
>_before_ we do the protection changes.
>
>I also want to avoid flipping the protection state around unnecessarily,
>because that clearly is just wasted work.
OK, that makes sense, thanks for clarifying.
>> I get that you want to squash ftrace_module_enable() into
>> ftrace_module_init(), before module_enable_ro(). I'm fine with that as
>> long as the races Steven described are addressed for affected arches.
>
>Right, the problem is set_all_modules_text_*(), that relies on COMING
>having made the protection changes.
>
>After the x86 changes, there's only 2 more architectures that use that
>function. I'll work on getting those converted and then we can delete
>that function and worry no more about it.
>
>> And livepatch should be OK as long as klp_module_coming() is always
>> called *after* ftrace_module_enable(). But are you moving
>> klp_module_coming() before the module_enable_* calls as well? And if
>> so, why?
>
>I wanted to move the COMING notifier callback before the protection
>changes, because that is the easiest way to convert jump_label and
>static_call and AFAICT nothing really cared aside from ftrace.
I think I would be fine with this as long as no notifiers/users rely
on the assumption that COMING == module enabled protections already.
I've yet to audit all the module notifiers (but I trust you've done
this already), and I think ftrace was the only user that relied on
this. For livepatch it's probably not immediately fixable without some
serious overhaul.
>The alternative is introducing additional module states, which just adds
>complexity we don't really need if we can just flip things around a
>little.
Yeah, I would prefer not adding more states if possible :-)
>> > The fact that it is executable; also the fact that you do it right after
>> > we mark it ro. Flipping the memory protections back and forth is just
>> > really poor everything.
>> >
>> > Once this ftrace thing is sorted, we'll change x86 to _refuse_ to make
>> > executable (kernel) memory writable.
>>
>> Not sure if relevant, but just thought I'd clarify: IIRC,
>> klp_module_coming() is not poking the coming module, but it calls
>> module_enable_ro() on itself (the livepatch module) so it can apply
>> relocations and such on the new code, which lives inside the livepatch
>> module, and it needs to possibly do this numerous times over the
>> lifetime of the patch module for any coming module it is responsible
>> for patching (i.e., call module_enable_ro() on the patch module, not
>> necessarily the coming module). So I am not be sure why
>> klp_module_coming() should be moved before complete_formation(). I
>> hope I'm remembering the details correctly, livepatch folks feel free
>> to chime in if I'm incorrect here.
>
>You mean it does module_disable_ro() ? That would be broken and it needs
>to be fixed. Can some livepatch person explain what it does and why?
Gah, sorry, yes I meant module_disable_ro().
+++ Steven Rostedt [15/10/19 09:28 -0400]:
>On Fri, 11 Oct 2019 14:59:03 +0200
>Peter Zijlstra <[email protected]> wrote:
>
>> On Thu, Oct 10, 2019 at 07:28:19PM +0200, Peter Zijlstra wrote:
>>
>> > Really the best solution is to move all the poking into
>> > ftrace_module_init(), before we mark it RO+X. That's what I'm going to
>> > do for jump_label and static_call as well, I just need to add that extra
>> > notifier callback.
>>
>> OK, so I started writing that patch... or rather, I wrote the patch and
>> started on the Changelog when I ran into trouble describing why we need
>> it.
>>
>> That is, I'm struggling to explain why we cannot flip
>> prepare_coming_module() and complete_formation().
>>
>> Yes, it breaks ftrace, but I'm thinking that is all it breaks. So let me
>> see if we can cure that.
>
>You are mainly worried about making text that is executable into
>read-write again. What if we kept my one patch that just changed the
>module in ftrace_module_enable() from read-only to read-write, but
>before we ever set it executable.
>
>Jessica, would this patch break anything?
>
>It moves the setting of the module execution to after calling both
>ftrace_module_enable() and klp_module_coming().
>
>This would make it possible to do the module code and still keep with
>the no executable code becoming writable.
>
>-- Steve
>
>diff --git a/kernel/module.c b/kernel/module.c
>index ff2d7359a418..6e2fd40a6ed9 100644
>--- a/kernel/module.c
>+++ b/kernel/module.c
>@@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
>
> module_enable_ro(mod, false);
> module_enable_nx(mod);
>- module_enable_x(mod);
>
> /* Mark state as coming so strong_try_module_get() ignores us,
> * but kallsyms etc. can see us. */
>@@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod)
> if (err)
> return err;
>
>+ /* Make module executable after ftrace is enabled */
>+ mutex_lock(&module_mutex);
>+ module_enable_x(mod);
>+ mutex_unlock(&module_mutex);
>+
> blocking_notifier_call_chain(&module_notify_list,
> MODULE_STATE_COMING, mod);
> return 0;
As long as we enable x before parse_args(), which this patch does, then
I don't think this change would break anything.
On Tue, Oct 15, 2019 at 04:42:58PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 03:56:34PM +0200, Peter Zijlstra wrote:
> > Right, the problem is set_all_modules_text_*(), that relies on COMING
> > having made the protection changes.
> >
> > After the x86 changes, there's only 2 more architectures that use that
> > function. I'll work on getting those converted and then we can delete
> > that function and worry no more about it.
>
> Here's a possible patch for arch/arm, which only leaves arch/nds32/.
*sigh*, so I'd written the patching code for nds32, but then discovered
it doesn't have STRICT_MODULE_RWX and therefore we can simply delete the
thing.
On 10/15/19 11:31 AM, Jessica Yu wrote:
> +++ Joe Lawrence [15/10/19 11:06 -0400]:
>> On 10/15/19 10:13 AM, Miroslav Benes wrote:
>>> Yes, it does. klp_module_coming() calls module_disable_ro() on all
>>> patching modules which patch the coming module in order to call
>>> apply_relocate_add(). New (patching) code for a module can be relocated
>>> only when the relevant module is loaded.
>>
>> FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @
>> plumber's where livepatches only patch a single object and updates are
>> kept on disk to handle coming module updates as they are loaded)
>> eliminate those outstanding relocations and the need to perform this
>> late permission flipping?
>
> I wasn't at Plumbers sadly, was this idea documented/talked about in
> detail somewhere? (recording, slides, etherpad, etc?). What do you
> mean by updates are kept on disk? Maybe someone can explain it more
> in detail? :)
>
Livepatching folks -- I don't have the LPC summary link (etherpad?) that
Jiri put together. Does someone have that handy for Jessica?
Thanks,
-- Joe
On Tue, 15 Oct 2019 18:17:43 -0400
Joe Lawrence <[email protected]> wrote:
>
> Livepatching folks -- I don't have the LPC summary link (etherpad?) that
> Jiri put together. Does someone have that handy for Jessica?
Yes, and I'll be posting this on the LPC web site soon. But here's the
write up that Jiri sent me (with the etherpad link at the end):
The live patching miniconference covered 9 topics overall.
(1) First session, titled "What happened in kernel live patching over the last
year" was led by Miroslav Benes. It was quite a natural followup to where we
ended at the LPC 2018 miniconf, summarizing which of the points that have been
agreed on back then have already been fully implemented, where obstacles have
been enounctered, etc.
The most prominent feature that has been merged during past year was "atomic
replace", which allows for easier stacking of patches. This is especially
useful for distros, as it naturally aligns with the way patches are being
distributed by them.
Another big step forward since the LPC 2018 miniconf was the addition of
livepatching selftests, which have already helped tremendously in various
cases; for example, they helped to track down quite a few issues during the
development of reliable stacktraces on s390. A proposal was made that all
major KLP features should in the future be accompanied by selftests, which
the audience agreed on.
One of the last year's discussion topics / pain points were GCC optimizations
which are not compatible with livepatching. GCC upstream now has
-flive-patching option, which disables all those interfering optimizations.
(2) Second session, titled "Rethinking late module patching", was led by Miroslav
Benes again.
The problem statement is: when there is a patch loaded for a module that
is yet to be loaded, the module has to be patched before it starts executing.
The current solution relies on hooks in the module loader, and the module is
patched while it is being linked. It gets a bit nasty with the arch-specifics
of the module loader handling all the relocations, patching of alternatives,
etc. One of the issues is that all the paravirt / jump label patching has to
be done after relocations are resolved; this is getting fragile and hard to
maintain.
Miroslav sketched out the possible solutions:
- livepatch would immediately load all the modules for which it has
patch via dependency; half-loading modules (not promoting to final
LIVE state)
- splitting the currently one big monolithic livepatch to a per-object
structure; might cause issues with consistency model
- "blue sky" idea from Joe Lawrence: livepatch loaded modules,
binary-patch .ko on disk, blacklist vulnerable version
Miroslav proposed to actually stick to the current solution, and improve
selftests coverage for all the considered-fragile arch-specific module linking
code hooks. The discussion then mostly focused, based on proposals from several
attendees (most prominently Steven Rostedt and Amit Shah), on expanding on the
"blue sky" idea.
The final proposal converged to having a separate .ko for livepatches that's
installed on the disk along with the module. This addresses the module
signature issue (as signature does not actually change), as well as module
removal case (the case where a module was previously loaded while a livepatch
is applied, and then later unloaded and reloaded). The slight downside is that
this will require changes to the module loader to also look for livepatches
when loading a module. When unloading the module, the livepatch module will
also need to be unloaded. Steven approved of this approach over his previous
suggestion.
(3) Third session, titled "Source-based livepatch creation tooling", was led by
Nicolai Stange.
The primary objective of the session was the source-based creation of
livepatches, while avoiding the tedious (and error-prone) task of copying
a lot of kernel code around (from the source tree to the livepatch). Nicolai
spent part of last year writing the klp-ccp (KLP Copy and Paste) utility,
which automates a big chunk of the process.
Nicolai then presented the still open issues with the tool and with the process
around it, the most prominent ones being:
- obtaining original GCC commandline that was used to build the
original kernel
- externalizability of static functions; we need to know whether GCC
emitted a static function into the patched object
Miroslav proposed to extend the existing IPA dumping capability of GCC to also
emit the information about dead code elimination; DWARF information is guaranteed
not to be reliable when it comes to IPA optimizations.
(4) Fourth session, titled "Objtool on power -- update", was led by Kamalesh
Babulal.
Kamalesh reported that as a followup to last year's miniconference, the objtool
support for powerpc actually came to life. It hasn't yet been posted upstream,
but is currently available on github [1].
Kamalesh further reported that the decoder has basic functionality (stack
operations + validation, branches, unreachable code, switch table (through gcc
plugin), conditional branches, prologue sequences). It turns out that stack
validation on powerpc is easier than on x86, as the ABI is much more strict
there; which leaves the validation phase to mostly focus on hand-written
assembly.
The next steps are to build on the arm64 objtool code, which has already
abstracted out the arch-specific bits; further optimizations can be stacked
on top of that (switch table detection, more testing, different gcc versions).
(5) Fifth session, titled "Do we need a Livepatch Developers Guide?", was led
by Joe Lawrence.
Joe postulated that the current in-kernel documentation describes the
individual features the infrastructure provides to the livepatch author very
well, but he further suggested to also include something along the lines of
what they currently have for kpatch, which takes a more general look from the
point of view of a livepatch developer.
Proposals that have been brought up for discussion:
- FAQ
- collecting already existing CVE fixes and amending them with a lot of
commentary
- creating a livepatch blog on people.kernel.org
Mark Brown asked for documenting what architectures need to implement in order
to support livepatching.
Amit Shah asked if the 'kpatch' and 'kpatch-build' script/program could be
renamed to 'livepatch'-friendly names so that the kernel sources can also
reference them for the user docs part of it.
Both Mark's and Amit's remarks have been considered very valid and useful, and
agreement was reached that they will be taken care of.
(6) Sixth session, titled "API for state changes made by callbacks", was led
by Petr Mladek.
Petr described his proposal for an API for changing, updating and disabling
patches (by callbacks). An example where this was needed: the L1TF fix, which needed
to change PTE semantics (particular bits). This can't be done before all the
code understands this new PTE format/semantics. Therefore pre-patch and
post-patch callbacks had to do the actual modifications to all the existing
PTEs. What is also currently missing is tracking compatibilities / dependencies
between individual livepatches.
Petr's proposal (v2) is already on ML.
struct klp_state is being introduced which tracks the actual states of the
patch. klp_is_patch_compatible() checks the compatibility of the current states
to the states that the new livepatch is going to bring.
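Roughly, from the v2 posting (the exact field names may still change before
merging):

	struct klp_state {
		unsigned long id;	/* identifier of the modified state */
		unsigned int version;	/* version of the implementation */
		void *data;		/* state-specific data */
	};

	/* Check that the new patch handles all states the old ones modify. */
	bool klp_is_patch_compatible(struct klp_patch *patch);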
No principal issues / objections have been raised, and it's appreciated by the
patch author(s), so v3 will be submitted and pre-merge bikeshedding will start.
(7) Seventh session, titled "klp-convert and livepatch relocations", was led
by Joe Lawrence.
Joe started the session with problem statement: accessing non exported / static
symbols from inside the patch module. One possible workaround is manually via
kallsyms. Second workaround is klp-convert, which actually creates proper
relocations inside the livepatch module from the symbol database during the
final .ko link.
Currently module loader looks for special livepatch relocations and resolves
those at runtime; kernel support for these relocations has so far been
added for x86 only. Special livepatch relocations are supported and processed
also on other architectures. Special quirks/sections are not yet supported.
Plus klp-convert would still be needed even with late module patching update.
vmlinux or modules could have ambiguous static symbols.
It turns out that the features / bugs below have to be resolved before we
can claim the klp-convert support for relocation complete:
- handle all the corner cases (jump labels, static keys, ...) properly and
have a good regression tests in place
- one day we might (or might not) add support for out-of-tree modules which
need klp-convert
- BFD bug 24456 (multiple relocations to the same .text section)
(8) Eighth session, titled "Making livepatching infrastructure better", was led
by Kamalesh Babulal.
The primary goal of the discussion as presented by Kamalesh was simple: how to
improve our testing coverage. Currently we have sample modules + kselftests.
We seem to be missing specific unit tests and tests for corner cases.
What Kamalesh would also like to see would be more stress testing oriented
tests for the infrastructure. We should make sure that projects like kernelCI
are running with CONFIG_LIVEPATCH=y.
Another thing Kamalesh currently sees as missing is failure test cases.
It should be checked with sosreport and supportconfig guys whether those
diagnostic tools do provide necessary coverage of (at least) livepatching sysfs
state. This is especially a task for distro people to figure out.
Nicolai proposed as one of the testcases identity patching, as that should
reveal issues directly in the infrastructure.
(9) Last, ninth session, titled "Open sourcing live patching services", was led
by Alice Ferrazzi.
This session followed up on the previous suggestion of having a public
repository for livepatches against LTS kernels.
Alice reported on the improvements to elivepatch since last year, having
moved everything to Docker.
Alice proposed to share livepatch sources more widely; SUSE does publish
those [2][3], but it's important to mention that livepatches are very closely
tied to a particular kernel version.
[1] https://github.com/kamalesh-babulal/linux/tree/objtool-v1
[2] On https://kernel.suse.com/
[3] Example source-based SUSE's livepatch is at https://kernel.suse.com/cgit/kernel-livepatch/tree/uname_patch/kgr_patch_uname.c?id=d4e00de0b0a3f858fec4e83640f12e1f17298667
Etherpad: https://etherpad.net/p/LPC2019_Live_Patching
On Tue, 15 Oct 2019, Joe Lawrence wrote:
> On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > patching modules which patch the coming module in order to call
> > apply_relocate_add(). New (patching) code for a module can be relocated
> > only when the relevant module is loaded.
>
> FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> livepatches only patch a single object and updates are kept on disk to handle
> coming module updates as they are loaded) eliminate those outstanding
> relocations and the need to perform this late permission flipping?
Yes, it should, but we don't have to wait for it. PeterZ proposed a
different solution to this specific issue in
https://lore.kernel.org/lkml/[email protected]/
It should not be a problem to create a live patch module like that and the
code in kernel/livepatch/ is almost ready. Something like
module_section_disable_ro(mod, section) (and similar for X protection)
should be enough. Module reloads would still require juggling with the
protections, but I think it is all feasible.
Miroslav
> which are not compatible with livepatching. GCC upstream now has
> -flive-patching option, which disables all those interfering optimizations.
Which, IIRC, has a significant performance impact and should thus really
not be used...
If distros ship that crap, I'm going to laugh at them the next time they
want a single digit performance improvement because *important*.
On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
> (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> by Joe Lawrence.
>
> Joe started the session with problem statement: accessing non exported / static
> symbols from inside the patch module. One possible workardound is manually via
> kallsyms. Second workaround is klp-convert, which actually creates proper
> relocations inside the livepatch module from the symbol database during the
> final .ko link.
> Currently module loader looks for special livepatch relocations and resolves
> those during runtime; kernel support for these relocations have so far been
> added for x86 only. Special livepatch relocations are supported and processed
> also on other architectures. Special quirks/sections are not yet supported.
> Plus klp-convert would still be needed even with late module patching update.
> vmlinux or modules could have ambiguous static symbols.
>
> It turns out that the features / bugs below have to be resolved before we
> can claim the klp-convert support for relocation complete:
> - handle all the corner cases (jump labels, static keys, ...) properly and
> have a good regression tests in place
I suppose all the patches in this series-of-series here will make life
harder for KLP, static_call(), 2-byte jumps, etc.
> - one day we might (or might not) add support for out-of-tree modules which
> need klp-convert
> - BFD bug 24456 (multiple relocations to the same .text section)
On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> On Tue, 15 Oct 2019, Joe Lawrence wrote:
>
> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > patching modules which patch the coming module in order to call
> > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > only when the relevant module is loaded.
> >
> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > livepatches only patch a single object and updates are kept on disk to handle
> > coming module updates as they are loaded) eliminate those outstanding
> > relocations and the need to perform this late permission flipping?
>
> Yes, it should, but we don't have to wait for it. PeterZ proposed a
> different solution to this specific issue in
> https://lore.kernel.org/lkml/[email protected]/
>
> It should not be a problem to create a live patch module like that and the
> code in kernel/livepatch/ is almost ready. Something like
> module_section_disable_ro(mod, section) (and similar for X protection)
> should be enough. Module reloads would still require juggling with the
> protections, but I think it is all feasible.
Just had a browse around the module code, and while the section info is
in load_info, it doesn't get retained for active modules.
So I suppose I'll go add that for KLP-enabled things.
+++ Peter Zijlstra [16/10/19 11:23 +0200]:
>On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
>> On Tue, 15 Oct 2019, Joe Lawrence wrote:
>>
>> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
>> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
>> > > patching modules which patch the coming module in order to call
>> > > apply_relocate_add(). New (patching) code for a module can be relocated
>> > > only when the relevant module is loaded.
>> >
>> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
>> > livepatches only patch a single object and updates are kept on disk to handle
>> > coming module updates as they are loaded) eliminate those outstanding
>> > relocations and the need to perform this late permission flipping?
>>
>> Yes, it should, but we don't have to wait for it. PeterZ proposed a
>> different solution to this specific issue in
>> https://lore.kernel.org/lkml/[email protected]/
>>
>> It should not be a problem to create a live patch module like that and the
>> code in kernel/livepatch/ is almost ready. Something like
>> module_section_disable_ro(mod, section) (and similar for X protection)
>> should be enough. Module reloads would still require juggling with the
>> protections, but I think it is all feasible.
>
>Just had a browse around the module code, and while the section info is
>in load_info, it doesn't get retained for active modules.
>
>So I suppose I'll go add that for KLP enabled things.
Elf section info does get saved for livepatch modules though, see
mod->klp_info. And wouldn't this only be needed for livepatch modules?
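(That is, roughly, from include/linux/module.h under CONFIG_LIVEPATCH;
quoting from memory:

	struct klp_modinfo {
		Elf_Ehdr hdr;		/* copy of the ELF header */
		Elf_Shdr *sechdrs;	/* copy of the section headers */
		char *secstrings;	/* copy of the section name strings */
		unsigned int symndx;	/* symbol table section index */
	};
)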
On Wed, Oct 16, 2019 at 11:36:10AM +0200, Jessica Yu wrote:
> +++ Peter Zijlstra [16/10/19 11:23 +0200]:
> > On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> > > On Tue, 15 Oct 2019, Joe Lawrence wrote:
> > >
> > > > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > > > patching modules which patch the coming module in order to call
> > > > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > > > only when the relevant module is loaded.
> > > >
> > > > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > > > livepatches only patch a single object and updates are kept on disk to handle
> > > > coming module updates as they are loaded) eliminate those outstanding
> > > > relocations and the need to perform this late permission flipping?
> > >
> > > Yes, it should, but we don't have to wait for it. PeterZ proposed a
> > > different solution to this specific issue in
> > > https://lore.kernel.org/lkml/[email protected]/
> > >
> > > It should not be a problem to create a live patch module like that and the
> > > code in kernel/livepatch/ is almost ready. Something like
> > > module_section_disable_ro(mod, section) (and similar for X protection)
> > > should be enough. Module reloads would still require juggling with the
> > > protections, but I think it is all feasible.
> >
> > Just had a browse around the module code, and while the section info is
> > in load_info, it doesn't get retained for active modules.
> >
> > So I suppose I'll go add that for KLP enabled things.
>
> Elf section info does get saved for livepatch modules though, see
> mod->klp_info. And wouldn't this only be needed for livepatch modules?
Right, I just found that, but it is x86 only for some mysterious reason.
And yes, it's KLP only.
I was thinking of adding a KLP-only list of {name,addr,size} sections
that start with ".klp" in layout_sections(). Would that not work across
architectures?
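Something of this entirely hypothetical shape, say:

	/* Hypothetical, nothing like this exists yet: */
	struct klp_text_section {
		const char	*name;	/* ".klp.text.$objname" */
		void		*addr;
		unsigned long	size;
	};

populated in layout_sections() while the section headers are still around.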
On Wed, 16 Oct 2019, Peter Zijlstra wrote:
> > which are not compatible with livepatching. GCC upstream now has
> > -flive-patching option, which disables all those interfering optimizations.
>
> Which, IIRC, has a significant performance impact and should thus really
> not be used...
Fortunately, the impact is negligible.
Miroslav
On Wed, 16 Oct 2019, Peter Zijlstra wrote:
> On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
>
> > (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> > by Joe Lawrence.
> >
> > Joe started the session with problem statement: accessing non exported / static
> > symbols from inside the patch module. One possible workardound is manually via
> > kallsyms. Second workaround is klp-convert, which actually creates proper
> > relocations inside the livepatch module from the symbol database during the
> > final .ko link.
> > Currently module loader looks for special livepatch relocations and resolves
> > those during runtime; kernel support for these relocations have so far been
> > added for x86 only. Special livepatch relocations are supported and processed
> > also on other architectures. Special quirks/sections are not yet supported.
> > Plus klp-convert would still be needed even with late module patching update.
> > vmlinux or modules could have ambiguous static symbols.
> >
> > It turns out that the features / bugs below have to be resolved before we
> > can claim the klp-convert support for relocation complete:
> > - handle all the corner cases (jump labels, static keys, ...) properly and
> > have a good regression tests in place
>
> I suppose all the patches in this series-of-series here will make life
> harder for KLP, static_call() and 2 byte jumps etc..
Yes, I think so. We'll have to deal with that once it lands. That is why
we want to get rid of all this arch-specific code in livepatch and
reinvent the late module patching. So it is perhaps better to start
working on it sooner rather than later. Adding Petr, who hesitantly signed up for
the task...
Miroslav
On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> On Tue, 15 Oct 2019, Joe Lawrence wrote:
>
> > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > patching modules which patch the coming module in order to call
> > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > only when the relevant module is loaded.
> >
> > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > livepatches only patch a single object and updates are kept on disk to handle
> > coming module updates as they are loaded) eliminate those outstanding
> > relocations and the need to perform this late permission flipping?
>
> Yes, it should, but we don't have to wait for it. PeterZ proposed a
> different solution to this specific issue in
> https://lore.kernel.org/lkml/[email protected]/
>
> It should not be a problem to create a live patch module like that and the
> code in kernel/livepatch/ is almost ready. Something like
> module_section_disable_ro(mod, section) (and similar for X protection)
> should be enough. Module reloads would still require juggling with the
> protections, but I think it is all feasible.
Something a little like so... completely fresh off the keyboard.
---
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -853,6 +853,18 @@ static inline void module_enable_ro(cons
static inline void module_disable_ro(const struct module *mod) { }
#endif
+#if defined(CONFIG_STRICT_MODULE_RWX) && defined(CONFIG_LIVEPATCH)
+extern void module_section_disable_ro(struct module *mod, const char *sec);
+extern void module_section_enable_ro(struct module *mod, const char *sec);
+extern void module_section_disable_x(struct module *mod, const char *sec);
+extern void module_section_enable_x(struct module *mod, const char *sec);
+#else
+static inline void module_section_disable_ro(struct module *mod, const char *sec) { }
+static inline void module_section_enable_ro(struct module *mod, const char *sec) { }
+static inline void module_section_disable_x(struct module *mod, const char *sec) { }
+static inline void module_section_enable_x(struct module *mod, const char *sec) { }
+#endif
+
#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
struct module *);
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2107,6 +2107,54 @@ static void free_module_elf(struct modul
kfree(mod->klp_info->secstrings);
kfree(mod->klp_info);
}
+
+#ifdef CONFIG_STRICT_MODULE_RWX
+
+static void __frob_section(Elf_Shdr *sec, int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)sec->sh_addr & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)sec->sh_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)sec->sh_addr, sec->sh_size >> PAGE_SHIFT);
+}
+
+static void frob_section(struct module *mod, const char *section,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ struct klp_modinfo *info = mod->klp_info;
+ const char *secname;
+ Elf_Shdr *s;
+
+ for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+ secname = mod->klp_info->secstrings + s->sh_name;
+ if (strcmp(secname, section))
+ continue;
+
+ __frob_section(s, set_memory);
+ }
+}
+
+void module_section_disable_ro(struct module *mod, const char *section)
+{
+ frob_section(mod, section, set_memory_rw);
+}
+
+void module_section_enable_ro(struct module *mod, const char *section)
+{
+ frob_section(mod, section, set_memory_ro);
+}
+
+void module_section_disable_x(struct module *mod, const char *section)
+{
+ frob_section(mod, section, set_memory_nx);
+}
+
+void module_section_enable_x(struct module *mod, const char *section)
+{
+ frob_section(mod, section, set_memory_x);
+}
+
+#endif /* CONFIG_STRICT_MODULE_RWX */
+
#else /* !CONFIG_LIVEPATCH */
static int copy_module_elf(struct module *mod, struct load_info *info)
{
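With something like the above, the livepatch side could then do
(hypothetically; klp_write_object_relocations() is the existing function,
the section-granular helpers are the new ones from this patch):

	module_section_disable_ro(patch->mod, secname);
	ret = klp_write_object_relocations(patch->mod, obj);
	module_section_enable_ro(patch->mod, secname);

instead of flipping the protections of the whole patch module.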
On Wed, 16 Oct 2019, Miroslav Benes wrote:
> On Wed, 16 Oct 2019, Peter Zijlstra wrote:
>
> > On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
> >
> > > (7) Seventh session, titled "klp-convert and livepatch relocations", was led
> > > by Joe Lawrence.
> > >
> > > Joe started the session with problem statement: accessing non exported / static
> > > symbols from inside the patch module. One possible workardound is manually via
> > > kallsyms. Second workaround is klp-convert, which actually creates proper
> > > relocations inside the livepatch module from the symbol database during the
> > > final .ko link.
> > > Currently module loader looks for special livepatch relocations and resolves
> > > those during runtime; kernel support for these relocations have so far been
> > > added for x86 only. Special livepatch relocations are supported and processed
> > > also on other architectures. Special quirks/sections are not yet supported.
> > > Plus klp-convert would still be needed even with late module patching update.
> > > vmlinux or modules could have ambiguous static symbols.
> > >
> > > It turns out that the features / bugs below have to be resolved before we
> > > can claim the klp-convert support for relocation complete:
> > > - handle all the corner cases (jump labels, static keys, ...) properly and
> > > have a good regression tests in place
> >
> > I suppose all the patches in this series-of-series here will make life
> > harder for KLP, static_call() and 2 byte jumps etc..
>
> Yes, I think so. We'll have to deal with that once it lands. That is why
> we want to get rid of all this arch-specific code in livepatch and
> reinvent the late module patching. So it is perhaps better to start
> working on it sooner than later. Adding Petr, who hesitantly signed up for
> the task...
Thinking about it more... crazy idea. I think we could leverage these new
ELF .text per vmlinux/module sections for the reinvention I was talking
about. If we teach the module loader to relocate (and apply alternatives and
so on, everything in arch-specific module_finalize()) not the whole module
in the case of live patch modules, but separate ELF .text sections, it could
solve the issue with late module patching we have. It is a variation on
Steven's idea. When a live patch module is loaded, only its sections for
present modules would be processed. Then whenever a to-be-patched module
is loaded, its .text section in all present patch modules would be
processed.
The upside is that almost no work would be required on the patch module
creation side. The downside is that klp_modinfo must stay. The module loader
needs to be hacked a lot in both cases. So it remains to be seen which
idea is easier to implement.
Jessica, do you think it would be feasible?
Petr, Joe, Josh, am I missing something or would it work?
Miroslav
+++ Miroslav Benes [16/10/19 15:29 +0200]:
>On Wed, 16 Oct 2019, Miroslav Benes wrote:
>
>> On Wed, 16 Oct 2019, Peter Zijlstra wrote:
>>
>> > On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
>> >
>> > > (7) Seventh session, titled "klp-convert and livepatch relocations", was led
>> > > by Joe Lawrence.
>> > >
>> > > Joe started the session with problem statement: accessing non exported / static
>> > > symbols from inside the patch module. One possible workardound is manually via
>> > > kallsyms. Second workaround is klp-convert, which actually creates proper
>> > > relocations inside the livepatch module from the symbol database during the
>> > > final .ko link.
>> > > Currently module loader looks for special livepatch relocations and resolves
>> > > those during runtime; kernel support for these relocations have so far been
>> > > added for x86 only. Special livepatch relocations are supported and processed
>> > > also on other architectures. Special quirks/sections are not yet supported.
>> > > Plus klp-convert would still be needed even with late module patching update.
>> > > vmlinux or modules could have ambiguous static symbols.
>> > >
>> > > It turns out that the features / bugs below have to be resolved before we
>> > > can claim the klp-convert support for relocation complete:
>> > > - handle all the corner cases (jump labels, static keys, ...) properly and
>> > > have a good regression tests in place
>> >
>> > I suppose all the patches in this series-of-series here will make life
>> > harder for KLP, static_call() and 2 byte jumps etc..
>>
>> Yes, I think so. We'll have to deal with that once it lands. That is why
>> we want to get rid of all this arch-specific code in livepatch and
>> reinvent the late module patching. So it is perhaps better to start
>> working on it sooner than later. Adding Petr, who hesitantly signed up for
>> the task...
>
>Thinking about it more... crazy idea. I think we could leverage these new
>ELF .text per vmlinux/module sections for the reinvention I was talking
>about. If we teach module loader to relocate (and apply alternatives and
>so on, everything in arch-specific module_finalize()) not the whole module
>in case of live patch modules, but separate ELF .text sections, it could
>solve the issue with late module patching we have. It is a variation on
>Steven's idea. When live patch module is loaded, only its section for
>present modules would be processed. Then whenever a to-be-patched module
>is loaded, its .text section in all present patch module would be
>processed.
>
>The upside is that almost no work would be required on patch modules
>creation side. The downside is that klp_modinfo must stay. Module loader
>needs to be hacked a lot in both cases. So it remains to be seen which
>idea is easier to implement.
>
>Jessica, do you think it would be feasible?
I think that does sound feasible. I'm trying to visualize how that
would look. I guess there would need to be various livepatching hooks
called during the different stages (apply_relocate_add(),
module_finalize(), module_enable_ro/x()).
So maybe something like the following?
When a livepatch module loads:
	apply_relocate_add()
		klp hook: apply .klp.rela.$objname relocations *only* for
		already loaded modules
	module_finalize()
		klp hook: apply .klp.arch.$objname changes for already loaded modules
	module_enable_ro()
		klp hook: only enable ro/x for .klp.text.$objname for already
		loaded modules

When a to-be-patched module loads:
	apply_relocate_add()
		klp hook: for each patch module that patches the coming
		module, apply .klp.rela.$objname relocations for this object
	module_finalize()
		klp hook: for each patch module that patches the coming
		module, apply .klp.arch.$objname changes for this object
	module_enable_ro()
		klp hook: for each patch module, apply ro/x permissions for
		.klp.text.$objname for this object
Then, in klp_module_coming, we only need to do the callbacks and
enable the patch, and get rid of the module_disable_ro->apply
relocs->module_enable_ro block.
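In very rough pseudo-C (all the klp_* hook names below are invented, none
of this exists yet), I'm imagining something like:

	/* in load_module(), sketch only: */
	err = apply_relocate_add(...);
	if (is_livepatch_module(mod))
		err = klp_apply_section_relocs(mod);	/* loaded objects only */

	err = module_finalize(info->hdr, info->sechdrs, mod);
	if (is_livepatch_module(mod))
		err = klp_arch_apply_sections(mod);	/* .klp.arch.$objname */

	module_enable_ro(mod, false);
	if (is_livepatch_module(mod))
		klp_protect_sections(mod);		/* ro/x for .klp.text.$objname */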
Does that sound like what you had in mind or am I totally off?
Thanks!
Jessica
On Fri 2019-10-18 15:03:42, Jessica Yu wrote:
> +++ Miroslav Benes [16/10/19 15:29 +0200]:
> > On Wed, 16 Oct 2019, Miroslav Benes wrote:
> > Thinking about it more... crazy idea. I think we could leverage these new
> > ELF .text per vmlinux/module sections for the reinvention I was talking
> > about. If we teach module loader to relocate (and apply alternatives and
> > so on, everything in arch-specific module_finalize()) not the whole module
> > in case of live patch modules, but separate ELF .text sections, it could
> > solve the issue with late module patching we have. It is a variation on
> > Steven's idea. When live patch module is loaded, only its section for
> > present modules would be processed. Then whenever a to-be-patched module
> > is loaded, its .text section in all present patch module would be
> > processed.
> >
> > The upside is that almost no work would be required on patch modules
> > creation side. The downside is that klp_modinfo must stay. Module loader
> > needs to be hacked a lot in both cases. So it remains to be seen which
> > idea is easier to implement.
> >
> > Jessica, do you think it would be feasible?
>
> I think that does sound feasible. I'm trying to visualize how that
> would look. I guess there would need to be various livepatching hooks
> called during the different stages (apply_relocate_add(),
> module_finalize(), module_enable_ro/x()).
>
> So maybe something like the following?
>
> When a livepatch module loads:
> apply_relocate_add()
> klp hook: apply .klp.rela.$objname relocations *only* for
> already loaded modules
> module_finalize()
> klp hook: apply .klp.arch.$objname changes for already loaded modules
> module_enable_ro()
> klp hook: only enable ro/x for .klp.text.$objname for already
> loaded modules
Just for the record: we should also set ro for the not-yet-used
.klp.text.$objname at this stage so that it can't easily be modified
"by accident".
> When a to-be-patched module loads:
> apply_relocate_add()
> klp hook: for each patch module that patches the coming
> module, apply .klp.rela.$objname relocations for this object
> module_finalize()
> klp hook: for each patch module that patches the coming
> module, apply .klp.arch.$objname changes for this object
> module_enable_ro()
> klp hook: for each patch module, apply ro/x permissions for
> .klp.text.$objname for this object
>
> Then, in klp_module_coming, we only need to do the callbacks and
> enable the patch, and get rid of the module_disable_ro->apply
> relocs->module_enable_ro block.
>
> Does that sound like what you had in mind or am I totally off?
Makes sense to me.
Well, I wonder if it is really any better than what we have now.
We would still need special delayed handling for the module-specific
ELF sections. Also, we would still need to clear the modifications
in these sections when the livepatched object gets unloaded.
I am afraid that the real difference might come when we split
the livepatch into per-livepatched-object modules. This would
move the complexity to other parts of the code ;-) I am
unable to say which approach is easier and safer to maintain
at the moment.
Best Regards,
Petr
+++ Petr Mladek [18/10/19 15:40 +0200]:
>On Fri 2019-10-18 15:03:42, Jessica Yu wrote:
>> +++ Miroslav Benes [16/10/19 15:29 +0200]:
>> > On Wed, 16 Oct 2019, Miroslav Benes wrote:
>> > Thinking about it more... crazy idea. I think we could leverage these new
>> > ELF .text per vmlinux/module sections for the reinvention I was talking
>> > about. If we teach module loader to relocate (and apply alternatives and
>> > so on, everything in arch-specific module_finalize()) not the whole module
>> > in case of live patch modules, but separate ELF .text sections, it could
>> > solve the issue with late module patching we have. It is a variation on
>> > Steven's idea. When live patch module is loaded, only its section for
>> > present modules would be processed. Then whenever a to-be-patched module
>> > is loaded, its .text section in all present patch module would be
>> > processed.
>> >
>> > The upside is that almost no work would be required on patch modules
>> > creation side. The downside is that klp_modinfo must stay. Module loader
>> > needs to be hacked a lot in both cases. So it remains to be seen which
>> > idea is easier to implement.
>> >
>> > Jessica, do you think it would be feasible?
>>
>> I think that does sound feasible. I'm trying to visualize how that
>> would look. I guess there would need to be various livepatching hooks
>> called during the different stages (apply_relocate_add(),
>> module_finalize(), module_enable_ro/x()).
>>
>> So maybe something like the following?
>>
>> When a livepatch module loads:
>> apply_relocate_add()
>> klp hook: apply .klp.rela.$objname relocations *only* for
>> already loaded modules
>> module_finalize()
>> klp hook: apply .klp.arch.$objname changes for already loaded modules
>> module_enable_ro()
>> klp hook: only enable ro/x for .klp.text.$objname for already
>> loaded modules
>
>Just for record. We should also set ro for the not-yet used
>.klp.text.$objname at this stage so that it can't be modified
>easily "by accident".
If we also set ro protection already for .klp.text.$objname for
not-yet loaded modules, I think this would unfortunately mean we would
still have to do the protection flipping for late module patching that
Peter was trying to avoid, right?
That is, we *still* end up having to do the whole module_disable_ro()
-> apply_relocate_add() -> module_finalize() -> module_enable_ro()
thing for late module patching, except now we've moved that work to
the module loader instead of in klp_module_coming. It sounds just as
complicated as the current way :/
However, I think this complaint would not apply if livepatch switches
to the one-patch-module-per-module model.
On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > which are not compatible with livepatching. GCC upstream now has
> > -flive-patching option, which disables all those interfering optimizations.
>
> Which, IIRC, has a significant performance impact and should thus really
> not be used...
>
> If distros ship that crap, I'm going to laugh at them the next time they
> want a single digit performance improvement because *important*.
I have a crazy plan to try to use objtool to detect function changes at
a binary level, which would hopefully allow us to drop this flag.
But regardless, I wonder if we enabled this flag prematurely. We still
don't have a reasonable way to use it for creating source-based live
patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
since kpatch-build doesn't need it.
--
Josh
On Fri, Oct 18, 2019 at 03:40:58PM +0200, Petr Mladek wrote:
> On Fri 2019-10-18 15:03:42, Jessica Yu wrote:
> > +++ Miroslav Benes [16/10/19 15:29 +0200]:
> > > On Wed, 16 Oct 2019, Miroslav Benes wrote:
> > > Thinking about it more... crazy idea. I think we could leverage these new
> > > ELF .text per vmlinux/module sections for the reinvention I was talking
> > > about. If we teach module loader to relocate (and apply alternatives and
> > > so on, everything in arch-specific module_finalize()) not the whole module
> > > in case of live patch modules, but separate ELF .text sections, it could
> > > solve the issue with late module patching we have. It is a variation on
> > > Steven's idea. When live patch module is loaded, only its section for
> > > present modules would be processed. Then whenever a to-be-patched module
> > > is loaded, its .text section in all present patch module would be
> > > processed.
> > >
> > > The upside is that almost no work would be required on patch modules
> > > creation side. The downside is that klp_modinfo must stay. Module loader
> > > needs to be hacked a lot in both cases. So it remains to be seen which
> > > idea is easier to implement.
> > >
> > > Jessica, do you think it would be feasible?
> >
> > I think that does sound feasible. I'm trying to visualize how that
> > would look. I guess there would need to be various livepatching hooks
> > called during the different stages (apply_relocate_add(),
> > module_finalize(), module_enable_ro/x()).
> >
> > So maybe something like the following?
> >
> > When a livepatch module loads:
> > apply_relocate_add()
> > klp hook: apply .klp.rela.$objname relocations *only* for
> > already loaded modules
> > module_finalize()
> > klp hook: apply .klp.arch.$objname changes for already loaded modules
> > module_enable_ro()
> > klp hook: only enable ro/x for .klp.text.$objname for already
> > loaded modules
>
> Just for the record. We should also set ro for the not-yet used
> .klp.text.$objname at this stage so that it can't be modified
> easily "by accident".
>
>
> > When a to-be-patched module loads:
> > apply_relocate_add()
> > klp hook: for each patch module that patches the coming
> > module, apply .klp.rela.$objname relocations for this object
> > module_finalize()
> > klp hook: for each patch module that patches the coming
> > module, apply .klp.arch.$objname changes for this object
> > module_enable_ro()
> > klp hook: for each patch module, apply ro/x permissions for
> > .klp.text.$objname for this object
> >
> > Then, in klp_module_coming, we only need to do the callbacks and
> > enable the patch, and get rid of the module_disable_ro->apply
> > relocs->module_enable_ro block.
> >
> > Does that sound like what you had in mind or am I totally off?
>
> Makes sense to me.
>
> Well, I wonder if it is really any better than what we have now.
AFAICT, this would still have a lot of the same problems we have today.
It has a lot of complexity. It needs arch-specific livepatch code and
sections, and introduces special cases in the module code.
I'd much prefer the proposal from LPC to have per-module live patches.
It's simpler and has fewer things that can go wrong IMO.
--
Josh
On Fri, 18 Oct 2019, Jessica Yu wrote:
> +++ Miroslav Benes [16/10/19 15:29 +0200]:
> >On Wed, 16 Oct 2019, Miroslav Benes wrote:
> >
> >> On Wed, 16 Oct 2019, Peter Zijlstra wrote:
> >>
> >> > On Tue, Oct 15, 2019 at 06:27:05PM -0400, Steven Rostedt wrote:
> >> >
> >> > > (7) Seventh session, titled "klp-convert and livepatch relocations",
> >> > > was led
> >> > > by Joe Lawrence.
> >> > >
> >> > > Joe started the session with the problem statement: accessing
> >> > > non-exported / static symbols from inside the patch module. One
> >> > > possible workaround is doing it manually via kallsyms. A second
> >> > > workaround is klp-convert, which actually creates proper
> >> > > relocations inside the livepatch module from the symbol database
> >> > > during the final .ko link.
> >> > > Currently the module loader looks for special livepatch
> >> > > relocations and resolves those at runtime; kernel support for
> >> > > these relocations has so far been added for x86 only. Special
> >> > > livepatch relocations are supported and processed also on other
> >> > > architectures. Special quirks/sections are not yet supported.
> >> > > Plus klp-convert would still be needed even with the late module
> >> > > patching update. vmlinux or modules could have ambiguous static
> >> > > symbols.
> >> > >
> >> > > It turns out that the features / bugs below have to be resolved
> >> > > before we can claim the klp-convert support for relocations
> >> > > complete:
> >> > > - handle all the corner cases (jump labels, static keys, ...)
> >> > >   properly and have good regression tests in place
> >> >
> >> > I suppose all the patches in this series-of-series here will make life
> >> > harder for KLP, static_call() and 2 byte jumps etc..
> >>
> >> Yes, I think so. We'll have to deal with that once it lands. That is why
> >> we want to get rid of all this arch-specific code in livepatch and
> >> reinvent the late module patching. So it is perhaps better to start
> >> working on it sooner than later. Adding Petr, who hesitantly signed up for
> >> the task...
> >
> >Thinking about it more... crazy idea. I think we could leverage these new
> >ELF .text per vmlinux/module sections for the reinvention I was talking
> >about. If we teach module loader to relocate (and apply alternatives and
> >so on, everything in arch-specific module_finalize()) not the whole module
> >in case of live patch modules, but separate ELF .text sections, it could
> >solve the issue with late module patching we have. It is a variation on
> >Steven's idea. When live patch module is loaded, only its section for
> >present modules would be processed. Then whenever a to-be-patched module
> >is loaded, its .text section in all present patch module would be
> >processed.
> >
> >The upside is that almost no work would be required on patch modules
> >creation side. The downside is that klp_modinfo must stay. Module loader
> >needs to be hacked a lot in both cases. So it remains to be seen which
> >idea is easier to implement.
> >
> >Jessica, do you think it would be feasible?
>
> I think that does sound feasible. I'm trying to visualize how that
> would look. I guess there would need to be various livepatching hooks
> called during the different stages (apply_relocate_add(),
> module_finalize(), module_enable_ro/x()).
>
> So maybe something like the following?
>
> When a livepatch module loads:
> apply_relocate_add()
> klp hook: apply .klp.rela.$objname relocations *only* for
> already loaded modules
> module_finalize()
> klp hook: apply .klp.arch.$objname changes for already loaded modules
> module_enable_ro()
> klp hook: only enable ro/x for .klp.text.$objname for already
> loaded modules
>
> When a to-be-patched module loads:
> apply_relocate_add()
> klp hook: for each patch module that patches the coming
> module, apply .klp.rela.$objname relocations for this object
> module_finalize()
> klp hook: for each patch module that patches the coming
> module, apply .klp.arch.$objname changes for this object
> module_enable_ro()
> klp hook: for each patch module, apply ro/x permissions for
> .klp.text.$objname for this object
>
> Then, in klp_module_coming, we only need to do the callbacks and
> enable the patch, and get rid of the module_disable_ro->apply
> relocs->module_enable_ro block.
>
> Does that sound like what you had in mind or am I totally off?
Sort of. What I had in mind was that we could get rid of all the special .klp
ELF sections if the module loader guarantees that only sections for loaded
modules are processed. Then .klp.rela.$objname is not needed and a proper
.rela.text.$objname (or whatever its text section is named) should be
sufficient. The same for the rest (.klp.arch).
Only then would it be useful.
Miroslav
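For illustration, a rough sketch of how the module loader could pick out
only the sections belonging to one object under this scheme. It assumes
mod->klp_info, which the loader already preserves for livepatch modules;
klp_apply_section_relocs() is a hypothetical helper:

#include <linux/module.h>
#include <linux/string.h>

/* Hypothetical; would wrap apply_relocate_add() for one section. */
int klp_apply_section_relocs(struct module *patch, Elf_Shdr *s);

/* Walk the patch module's preserved section headers and process only
 * those whose name ends in ".$objname", e.g. ".text.kvm" or
 * ".rela.text.kvm" when objname is "kvm". */
static int klp_process_object_sections(struct module *patch,
				       const char *objname)
{
	struct klp_modinfo *info = patch->klp_info;
	Elf_Shdr *s;
	int err;

	for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
		const char *secname = info->secstrings + s->sh_name;
		const char *suffix = strrchr(secname, '.');

		if (!suffix || strcmp(suffix + 1, objname))
			continue;

		err = klp_apply_section_relocs(patch, s);
		if (err)
			return err;
	}
	return 0;
}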
On Wed, 16 Oct 2019, Peter Zijlstra wrote:
> On Wed, Oct 16, 2019 at 08:51:27AM +0200, Miroslav Benes wrote:
> > On Tue, 15 Oct 2019, Joe Lawrence wrote:
> >
> > > On 10/15/19 10:13 AM, Miroslav Benes wrote:
> > > > Yes, it does. klp_module_coming() calls module_disable_ro() on all
> > > > patching modules which patch the coming module in order to call
> > > > apply_relocate_add(). New (patching) code for a module can be relocated
> > > > only when the relevant module is loaded.
> > >
> > > FWIW, would the LPC blue-sky2 model (ie, Steve's suggestion @ plumber's where
> > > livepatches only patch a single object and updates are kept on disk to handle
> > > coming module updates as they are loaded) eliminate those outstanding
> > > relocations and the need to perform this late permission flipping?
> >
> > Yes, it should, but we don't have to wait for it. PeterZ proposed a
> > different solution to this specific issue in
> > https://lore.kernel.org/lkml/[email protected]/
> >
> > It should not be a problem to create a live patch module like that and the
> > code in kernel/livepatch/ is almost ready. Something like
> > module_section_disable_ro(mod, section) (and similar for X protection)
> > should be enough. Module reloads would still require juggling with the
> > protections, but I think it is all feasible.
>
> Something a little like so... completely fresh off the keyboard.
Yes, but I noticed you found a different and better way through text_poke()
(I was not aware that text_poke() works around the protections).
Miroslav
> ---
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -853,6 +853,18 @@ static inline void module_enable_ro(cons
> static inline void module_disable_ro(const struct module *mod) { }
> #endif
>
> +#if defined(CONFIG_STRICT_MODULE_RWX) && defined(CONFIG_LIVEPATCH)
> +extern void module_section_disable_ro(struct module *mod, const char *sec);
> +extern void module_section_enable_ro(struct module *mod, const char *sec);
> +extern void module_section_disable_x(struct module *mod, const char *sec);
> +extern void module_section_enable_x(struct module *mod, const char *sec);
> +#else
> +static inline void module_section_disable_ro(struct module *mod, const char *sec) { }
> +static inline void module_section_enable_ro(struct module *mod, const char *sec) { }
> +static inline void module_section_disable_x(struct module *mod, const char *sec) { }
> +static inline void module_section_enable_x(struct module *mod, const char *sec) { }
> +#endif
> +
> #ifdef CONFIG_GENERIC_BUG
> void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
> struct module *);
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -2107,6 +2107,54 @@ static void free_module_elf(struct modul
> kfree(mod->klp_info->secstrings);
> kfree(mod->klp_info);
> }
> +
> +#ifdef CONFIG_STRICT_MODULE_RWX
> +
> +static void __frob_section(Elf_Shdr *sec, int (*set_memory)(unsigned long start, int num_pages))
> +{
> + BUG_ON((unsigned long)sec->sh_addr & (PAGE_SIZE-1));
> + BUG_ON((unsigned long)sec->sh_size & (PAGE_SIZE-1));
> + set_memory((unsigned long)sec->sh_addr, sec->sh_size >> PAGE_SHIFT);
> +}
> +
> +static void frob_section(struct module *mod, const char *section,
> + int (*set_memory)(unsigned long start, int num_pages))
> +{
> + struct klp_modinfo *info = mod->klp_info;
> + const char *secname;
> + Elf_Shdr *s;
> +
> + for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
> + secname = mod->klp_info->secstrings + s->sh_name;
> + if (strcmp(secname, section))
> + continue;
> +
> + __frob_section(s, set_memory);
> + }
> +}
> +
> +void module_section_disable_ro(struct module *mod, const char *section)
> +{
> + frob_section(mod, section, set_memory_rw);
> +}
> +
> +void module_section_enable_ro(struct module *mod, const char *section)
> +{
> + frob_section(mod, section, set_memory_ro);
> +}
> +
> +void module_section_disable_x(struct module *mod, const char *section)
> +{
> + frob_section(mod, section, set_memory_nx);
> +}
> +
> +void module_section_enable_x(struct module *mod, const char *section)
> +{
> + frob_section(mod, section, set_memory_x);
> +}
> +
> +#endif /* CONFIG_STRICT_MODULE_RWX */
> +
> #else /* !CONFIG_LIVEPATCH */
> static int copy_module_elf(struct module *mod, struct load_info *info)
> {
>
On Tue, Oct 22, 2019 at 10:27:49AM +0200, Miroslav Benes wrote:
> > Does that sound like what you had in mind or am I totally off?
>
> Sort of. What I had in mind was that we could get rid of all the special .klp
> ELF sections if the module loader guarantees that only sections for loaded
> modules are processed. Then .klp.rela.$objname is not needed and a proper
> .rela.text.$objname (or whatever its text section is named) should be
> sufficient. The same for the rest (.klp.arch).
If I understand correctly, using kvm as an example to-be-patched module,
we'd have:
.text.kvm
.rela.text.kvm
.altinstructions.kvm
.rela.altinstructions.kvm
__jump_table.kvm
.rela__jump_table.kvm
etc. i.e. any "special" sections would need to be renamed.
Is that right?
But also I think *any* sections which need relocations would need to be
renamed, for example:
.rodata.kvm
.rela.rodata.kvm
.orc_unwind_ip.kvm
.rela.orc_unwind_ip.kvm
It's an interesting idea.
We'd have to be careful about ordering issues. For example, there are
module-specific jump labels stored in mod->jump_entries. Right now
that's just a pointer to the module's __jump_table section. With late
module patching, when kvm is loaded we'd have to insert the klp module's
__jump_table.kvm entries into kvm's mod->jump_entries list somehow.
Presumably we'd also have that issue for other sections. Handling that
_might_ be as simple as just hacking up find_module_sections() to
re-allocate sections and append "patched sections" to them.
But then you still have to worry about when to apply the relocations.
If you apply them before patching the sections, then relative
relocations would have the wrong values. If you apply them after, then
you have to figure out where the appended relocations are.
And if we allow unpatching then we'd presumably have to be able to
remove entries from the module specific section lists.
So I get the feeling a lot of complexity would creep in. Even just
thinking about it requires more mental gymnastics than the
one-patch-per-module idea, so I view that as a bad sign.
--
Josh
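As a concrete illustration of where that complexity starts, a minimal
sketch of the jump-entry splicing that appending __jump_table.kvm to the
target module would imply; the function is hypothetical and deliberately
ignores locking, re-sorting, and the unpatch path:

#include <linux/jump_label.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>

static int klp_append_jump_entries(struct module *target,
				   struct jump_entry *extra, unsigned int nr)
{
	unsigned int old = target->num_jump_entries;
	struct jump_entry *tab;

	tab = kcalloc(old + nr, sizeof(*tab), GFP_KERNEL);
	if (!tab)
		return -ENOMEM;

	memcpy(tab, target->jump_entries, old * sizeof(*tab));
	memcpy(tab + old, extra, nr * sizeof(*tab));

	/* The old array points into the module image; don't kfree() it. */
	target->jump_entries = tab;
	target->num_jump_entries = old + nr;
	return 0;
}

Note that on architectures with relative jump entries (x86 among them),
the entries are self-relative, so they cannot even be copied to a new
location without fixups, which underlines how quickly the complexity
grows.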
On Tue, 22 Oct 2019, Josh Poimboeuf wrote:
> On Tue, Oct 22, 2019 at 10:27:49AM +0200, Miroslav Benes wrote:
> > > Does that sound like what you had in mind or am I totally off?
> >
> > Sort of. What I had in mind was that we could get rid of all the special .klp
> > ELF sections if the module loader guarantees that only sections for loaded
> > modules are processed. Then .klp.rela.$objname is not needed and a proper
> > .rela.text.$objname (or whatever its text section is named) should be
> > sufficient. The same for the rest (.klp.arch).
>
> If I understand correctly, using kvm as an example to-be-patched module,
> we'd have:
>
> .text.kvm
> .rela.text.kvm
> .altinstructions.kvm
> .rela.altinstructions.kvm
> __jump_table.kvm
> .rela__jump_table.kvm
>
> etc. i.e. any "special" sections would need to be renamed.
>
> Is that right?
Yes.
> But also I think *any* sections which need relocations would need to be
> renamed, for example:
>
> .rodata.kvm
> .rela.rodata.kvm
> .orc_unwind_ip.kvm
> .rela.orc_unwind_ip.kvm
Correct.
> It's an interesting idea.
>
> We'd have to be careful about ordering issues. For example, there are
> module-specific jump labels stored in mod->jump_entries. Right now
> that's just a pointer to the module's __jump_table section. With late
> module patching, when kvm is loaded we'd have to insert the klp module's
> __jump_table.kvm entries into kvm's mod->jump_entries list somehow.
Yes.
> Presumably we'd also have that issue for other sections. Handling that
> _might_ be as simple as just hacking up find_module_sections() to
> re-allocate sections and append "patched sections" to them.
>
> But then you still have to worry about when to apply the relocations.
> If you apply them before patching the sections, then relative
> relocations would have the wrong values. If you apply them after, then
> you have to figure out where the appended relocations are.
Ah, right. That is a valid remark.
> And if we allow unpatching then we'd presumably have to be able to
> remove entries from the module specific section lists.
Correct.
> So I get the feeling a lot of complexity would creep in. Even just
> thinking about it requires more mental gymnastics than the
> one-patch-per-module idea, so I view that as a bad sign.
Yes, the devil is in the details. It would be better if the approach
also helped someone/something else in the kernel. Without that, it is
probably better to stick to Steven's proposal and handle the complexity
elsewhere.
Thanks
Miroslav
On Mon, Oct 21, 2019 at 10:05:49AM -0500, Josh Poimboeuf wrote:
> On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > > which are not compatible with livepatching. GCC upstream now has
> > > -flive-patching option, which disables all those interfering optimizations.
> >
> > Which, IIRC, has a significant performance impact and should thus really
> > not be used...
> >
> > If distros ship that crap, I'm going to laugh at them the next time they
> > want a single digit performance improvement because *important*.
>
> I have a crazy plan to try to use objtool to detect function changes at
> a binary level, which would hopefully allow us to drop this flag.
>
> But regardless, I wonder if we enabled this flag prematurely. We still
> don't have a reasonable way to use it for creating source-based live
> patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
> since kpatch-build doesn't need it.
I also just discovered that -flive-patching is responsible for all those
"unreachable instruction" objtool warnings which Randy has been
dutifully bugging me about over the last several months. For some
reason it subtly breaks GCC implicit noreturn detection for local
functions.
At this point, I only see downsides of -flive-patching, at least until
we actually have real upstream code which needs it.
If there aren't any objections I'll be posting a patch soon to revert.
--
Josh
On Mon, 20 Jan 2020, Josh Poimboeuf wrote:
> On Mon, Oct 21, 2019 at 10:05:49AM -0500, Josh Poimboeuf wrote:
> > On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > > > which are not compatible with livepatching. GCC upstream now has
> > > > -flive-patching option, which disables all those interfering optimizations.
> > >
> > > Which, IIRC, has a significant performance impact and should thus really
> > > not be used...
> > >
> > > If distros ship that crap, I'm going to laugh at them the next time they
> > > want a single digit performance improvement because *important*.
> >
> > I have a crazy plan to try to use objtool to detect function changes at
> > a binary level, which would hopefully allow us to drop this flag.
> >
> > But regardless, I wonder if we enabled this flag prematurely. We still
> > don't have a reasonable way to use it for creating source-based live
> > patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
> > since kpatch-build doesn't need it.
>
> I also just discovered that -flive-patching is responsible for all those
> "unreachable instruction" objtool warnings which Randy has been
> dutifully bugging me about over the last several months. For some
> reason it subtly breaks GCC implicit noreturn detection for local
> functions.
Ugh, that is unfortunate. Have you reported it?
> At this point, I only see downsides of -flive-patching, at least until
> we actually have real upstream code which needs it.
Can you explain this? The option makes GCC avoid optimizations which
are difficult to detect and would make live patching unsafe. I consider it
useful as it is, so if you shared the other downsides and what you meant
by real upstream code, we could discuss it.
> If there aren't any objections I'll be posting a patch soon to revert.
I think it would be a setback.
Regards
Miroslav
On Tue, Jan 21, 2020 at 09:35:28AM +0100, Miroslav Benes wrote:
> On Mon, 20 Jan 2020, Josh Poimboeuf wrote:
>
> > On Mon, Oct 21, 2019 at 10:05:49AM -0500, Josh Poimboeuf wrote:
> > > On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > > > > which are not compatible with livepatching. GCC upstream now has
> > > > > -flive-patching option, which disables all those interfering optimizations.
> > > >
> > > > Which, IIRC, has a significant performance impact and should thus really
> > > > not be used...
> > > >
> > > > If distros ship that crap, I'm going to laugh at them the next time they
> > > > want a single digit performance improvement because *important*.
> > >
> > > I have a crazy plan to try to use objtool to detect function changes at
> > > a binary level, which would hopefully allow us to drop this flag.
> > >
> > > But regardless, I wonder if we enabled this flag prematurely. We still
> > > don't have a reasonable way to use it for creating source-based live
> > > patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
> > > since kpatch-build doesn't need it.
> >
> > I also just discovered that -flive-patching is responsible for all those
> > "unreachable instruction" objtool warnings which Randy has been
> > dutifully bugging me about over the last several months. For some
> > reason it subtly breaks GCC implicit noreturn detection for local
> > functions.
>
> Ugh, that is unfortunate. Have you reported it?
Not yet (but I plan to).
> > At this point, I only see downsides of -flive-patching, at least until
> > we actually have real upstream code which needs it.
>
> Can you explain this? The option makes GCC avoid optimizations which
> are difficult to detect and would make live patching unsafe. I consider it
> useful as it is, so if you shared the other downsides and what you meant
> by real upstream code, we could discuss it.
Only SLES needs it right? Why inflict it on other livepatch users? By
"real upstream code" I mean there's no (documented) way to create live
patches using the method which relies on this flag. So I don't see any
upstream benefits for having it enabled.
--
Josh
> > > At this point, I only see downsides of -flive-patching, at least until
> > > we actually have real upstream code which needs it.
> >
> > Can you explain this? The option makes GCC avoid optimizations which
> > are difficult to detect and would make live patching unsafe. I consider it
> > useful as it is, so if you shared the other downsides and what you meant
> > by real upstream code, we could discuss it.
>
> Only SLES needs it right? Why inflict it on other livepatch users? By
> "real upstream code" I mean there's no (documented) way to create live
> patches using the method which relies on this flag. So I don't see any
> upstream benefits for having it enabled.
I'd put it differently. SLES and upstream need it; RHEL, or anyone using
kpatch-build, does not. It is perfectly fine to prepare live patches
just from the source code using the upstream live patching infrastructure.
After all, SLES is nothing other than upstream here. We were creating live
patches manually for quite a long time and only recently have we been
using Nicolai's klp-ccp automation (https://github.com/SUSE/klp-ccp).
So, everyone using upstream directly relies on the flag, which seems to be
a clear benefit to me. Reverting the patch would be a step back.
Also I think we're moving in the right direction to make the life of
upstream users easier with the proposal of klp-ccp and Petr's patch set to
split live patch modules. It is a path from inconvenient to comfortable,
not from impossible to possible.
Miroslav
On Tue, 21 Jan 2020, Josh Poimboeuf wrote:
> On Tue, Jan 21, 2020 at 09:35:28AM +0100, Miroslav Benes wrote:
> > On Mon, 20 Jan 2020, Josh Poimboeuf wrote:
> >
> > > On Mon, Oct 21, 2019 at 10:05:49AM -0500, Josh Poimboeuf wrote:
> > > > On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > > > > > which are not compatible with livepatching. GCC upstream now has
> > > > > > -flive-patching option, which disables all those interfering optimizations.
> > > > >
> > > > > Which, IIRC, has a significant performance impact and should thus really
> > > > > not be used...
> > > > >
> > > > > If distros ship that crap, I'm going to laugh at them the next time they
> > > > > want a single digit performance improvement because *important*.
> > > >
> > > > I have a crazy plan to try to use objtool to detect function changes at
> > > > a binary level, which would hopefully allow us to drop this flag.
> > > >
> > > > But regardless, I wonder if we enabled this flag prematurely. We still
> > > > don't have a reasonable way to use it for creating source-based live
> > > > patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
> > > > since kpatch-build doesn't need it.
> > >
> > > I also just discovered that -flive-patching is responsible for all those
> > > "unreachable instruction" objtool warnings which Randy has been
> > > dutifully bugging me about over the last several months. For some
> > > reason it subtly breaks GCC implicit noreturn detection for local
> > > functions.
> >
> > Ugh, that is unfortunate. Have you reported it?
>
> Not yet (but I plan to).
My findings so far...
I bisected through GCC options which -flive-patching disables and
-fno-ipa-pure-const is the culprit. I got no warnings without the option
with my config.
Then I found out allmodconfig was ok even with -flive-patching.
CONFIG_GCOV is the difference. CONFIG_GCOV=y seems to make the warnings go
away here.
/me goes staring
On Wed, 22 Jan 2020, Miroslav Benes wrote:
> On Tue, 21 Jan 2020, Josh Poimboeuf wrote:
>
> > On Tue, Jan 21, 2020 at 09:35:28AM +0100, Miroslav Benes wrote:
> > > On Mon, 20 Jan 2020, Josh Poimboeuf wrote:
> > >
> > > > On Mon, Oct 21, 2019 at 10:05:49AM -0500, Josh Poimboeuf wrote:
> > > > > On Wed, Oct 16, 2019 at 09:42:17AM +0200, Peter Zijlstra wrote:
> > > > > > > which are not compatible with livepatching. GCC upstream now has
> > > > > > > -flive-patching option, which disables all those interfering optimizations.
> > > > > >
> > > > > > Which, IIRC, has a significant performance impact and should thus really
> > > > > > not be used...
> > > > > >
> > > > > > If distros ship that crap, I'm going to laugh at them the next time they
> > > > > > want a single digit performance improvement because *important*.
> > > > >
> > > > > I have a crazy plan to try to use objtool to detect function changes at
> > > > > a binary level, which would hopefully allow us to drop this flag.
> > > > >
> > > > > But regardless, I wonder if we enabled this flag prematurely. We still
> > > > > don't have a reasonable way to use it for creating source-based live
> > > > > patches upstream, and it should really be optional for CONFIG_LIVEPATCH,
> > > > > since kpatch-build doesn't need it.
> > > >
> > > > I also just discovered that -flive-patching is responsible for all those
> > > > "unreachable instruction" objtool warnings which Randy has been
> > > > dutifully bugging me about over the last several months. For some
> > > > reason it subtly breaks GCC implicit noreturn detection for local
> > > > functions.
> > >
> > > Ugh, that is unfortunate. Have you reported it?
> >
> > Not yet (but I plan to).
>
> My findings so far...
>
> I bisected through GCC options which -flive-patching disables and
> -fno-ipa-pure-const is the culprit. I got no warnings without the option
> with my config.
>
> Then I found out allmodconfig was ok even with -flive-patching.
> CONFIG_GCOV is the difference. CONFIG_GCOV=y seems to make the warnings go
> away here.
Sorry, that was a red herring. See 867ac9d73709 ("objtool: Fix gcov check
for older versions of GCC").
I started looking at some btrfs reports and then found out those were
already fixed.
https://lore.kernel.org/linux-btrfs/[email protected]/
arch/x86/kernel/cpu/mce/core.o: warning: objtool: mce_panic()+0x11b: unreachable instruction
was next...
Broken code (-fno-ipa-pure-const):
...
1186: e8 a5 fe ff ff callq 1030 <wait_for_panic>
118b: e9 23 ff ff ff jmpq 10b3 <mce_panic+0x43>
</end of function>
Working code (-fipa-pure-const):
753: e8 88 fe ff ff callq 5e0 <wait_for_panic>
758: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
75f: 00
mce_panic() has:
if (atomic_inc_return(&mce_panicked) > 1)
wait_for_panic();
barrier();
bust_spinlocks(1);
jmpq in the broken code goes to bust_spinlocks(1), because GCC does not
know that wait_for_panic() is noreturn... because it is not.
wait_for_panic() calls panic() unconditionally in the end, which is
noreturn.
So the question is how the ipa-pure-const optimization knows about panic()'s
noreturn. The answer is that this is exactly one of the things the
optimization does: it propagates inner noreturns to its callers. (Martin
Jambor CCed).
Marking wait_for_panic() as noreturn (__noreturn), of course, fixes it
then. Now I don't know what the right fix should be. Should we mark all
these sites as noreturn, or is it ok for the kernel to rely on GCC
behaviour in this case? Could we teach objtool to recognize this?
Miroslav
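A reduced userspace illustration of the behaviour analysed above; the
file and function names are made up. Compile with "gcc -O2 -S t.c" and
again with "gcc -O2 -fno-ipa-pure-const -S t.c" and compare the code
generated for f():

#include <stdlib.h>

/* Not annotated noreturn; only the ipa-pure-const pass lets GCC
 * discover that this never returns and propagate that to callers. */
static void wait_then_die(void)
{
	exit(1);	/* exit() itself is declared noreturn */
}

int f(int flag)
{
	if (flag > 1)
		wait_then_die();
	return 4;
}

Without the pass, GCC keeps the code path after the call alive; in the
kernel, objtool independently concludes that such a call never returns,
which is how the unreachable instruction warnings arise.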
On Wed, Jan 22, 2020 at 11:09:56AM +0100, Miroslav Benes wrote:
>
> > > > At this point, I only see downsides of -flive-patching, at least until
> > > > we actually have real upstream code which needs it.
> > >
> > > Can you explain this? The option makes GCC avoid optimizations which
> > > are difficult to detect and would make live patching unsafe. I consider it
> > > useful as it is, so if you shared the other downsides and what you meant
> > > by real upstream code, we could discuss it.
> >
> > Only SLES needs it right? Why inflict it on other livepatch users? By
> > "real upstream code" I mean there's no (documented) way to create live
> > patches using the method which relies on this flag. So I don't see any
> > upstream benefits for having it enabled.
>
> I'd put it differently. SLES and upstream need it; RHEL, or anyone using
> kpatch-build, does not.
I'm confused about why you think upstream needs it.
Is all the tooling available somewhere? Is there documentation
available which describes how to build patches using that method from
start to finish? Are there actual users other than SUSE?
BTW, kpatch-build has a *lot* of users other than RHEL. All its tooling
and documentation are available on Github.
> It is perfectly fine to prepare live patches just from the source code
> using upstream live patching infrastructure.
Do you mean the dangerous method used by the livepatch sample code which
completely ignores interprocedural optimizations? I wouldn't call that
perfectly fine.
> After all, SLES is nothing other than upstream here. We were creating live
> patches manually for quite a long time and only recently we have been
> using Nicolai's klp-ccp automation (https://github.com/SUSE/klp-ccp).
>
> So, everyone using upstream directly relies on the flag, which seems to be
> a clear benefit to me. Reverting the patch would be a step back.
Who exactly is "everyone using upstream"?
From what I can tell, kpatch-build is the only known way (to those
outside of SUSE) to make safe patches for an upstream kernel. And it
doesn't need this flag and the problems associated with it: performance,
LTO incompatibility, clang incompatibility (I think?), the GCC dead code
issue.
--
Josh
On Wed, Jan 22, 2020 at 04:05:27PM +0100, Miroslav Benes wrote:
> I started looking at some btrfs reports and then found out those were
> already fixed.
> https://lore.kernel.org/linux-btrfs/[email protected]/
>
> arch/x86/kernel/cpu/mce/core.o: warning: objtool: mce_panic()+0x11b: unreachable instruction
> was next...
>
> Broken code (-fno-ipa-pure-const):
> ...
> 1186: e8 a5 fe ff ff callq 1030 <wait_for_panic>
> 118b: e9 23 ff ff ff jmpq 10b3 <mce_panic+0x43>
> </end of function>
>
> Working code (-fipa-pure-const):
> 753: e8 88 fe ff ff callq 5e0 <wait_for_panic>
> 758: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
> 75f: 00
>
> mce_panic() has:
> if (atomic_inc_return(&mce_panicked) > 1)
> wait_for_panic();
> barrier();
>
> bust_spinlocks(1);
>
> jmpq in the broken code goes to bust_spinlocks(1), because GCC does not
> know that wait_for_panic() is noreturn... because it is not.
> wait_for_panic() calls panic() unconditionally in the end, which is
> noreturn.
>
> So the question is how the ipa-pure-const optimization knows about panic()'s
> noreturn. The answer is that this is exactly one of the things the
> optimization does: it propagates inner noreturns to its callers. (Martin
> Jambor CCed).
>
> Marking wait_for_panic() as noreturn (__noreturn), of course, fixes it
> then. Now I don't know what the right fix should be. Should we mark all
> these sites as noreturn, or is it ok for the kernel to rely on GCC
> behaviour in this case? Could we teach objtool to recognize this?
Thanks for looking at it. I came to a similar conclusion and I already
had the manual noreturns added (see patch below) before I realized that
-flive-patching was the culprit.
The patch works, but the problem is that more warnings will pop up in
the future and it'll be my job to fix them...
Global noreturns are already a pain today. There's no way for objtool
to know whether GCC considered a function to be noreturn, so we
already have to keep a hard-coded list of global noreturns in objtool.
It's been a constant source of annoyance and this will add to that.
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 8d91b0428af1..8a8696b32120 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -192,7 +192,7 @@ static void ttm_bo_add_mem_to_lru(struct ttm_buffer_object *bo,
}
}
-static void ttm_bo_ref_bug(struct kref *list_kref)
+static void __noreturn ttm_bo_ref_bug(struct kref *list_kref)
{
BUG();
}
diff --git a/drivers/message/fusion/mptbase.h b/drivers/message/fusion/mptbase.h
index 813d46311f6a..2932ecef4dcf 100644
--- a/drivers/message/fusion/mptbase.h
+++ b/drivers/message/fusion/mptbase.h
@@ -945,7 +945,7 @@ extern int mpt_raid_phys_disk_get_num_paths(MPT_ADAPTER *ioc,
u8 phys_disk_num);
extern int mpt_set_taskmgmt_in_progress_flag(MPT_ADAPTER *ioc);
extern void mpt_clear_taskmgmt_in_progress_flag(MPT_ADAPTER *ioc);
-extern void mpt_halt_firmware(MPT_ADAPTER *ioc);
+extern void mpt_halt_firmware(MPT_ADAPTER *ioc) __noreturn;
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb8bd0258360..4db39fef3b56 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -655,7 +655,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
return prealloc;
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void __noreturn extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
struct inode *inode = tree->private_data;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d897a8e5e430..b7a94b1739ae 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -321,7 +321,7 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
return NULL;
}
-static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+static void __noreturn backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{
struct btrfs_fs_info *fs_info = NULL;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 18639c069263..3ee230a3dee2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -175,7 +175,7 @@ extern void __init cred_init(void);
* check for validity of credentials
*/
#ifdef CONFIG_DEBUG_CREDENTIALS
-extern void __invalid_creds(const struct cred *, const char *, unsigned);
+extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned);
extern void __validate_process_creds(struct task_struct *,
const char *, unsigned);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1879884238e..44ca6000b5f1 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -86,7 +86,7 @@ static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
-extern void do_group_exit(int);
+extern void __noreturn do_group_exit(int);
extern void exit_files(struct task_struct *);
extern void exit_itimers(struct signal_struct *);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 973a71f4bc89..29024c578997 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -98,8 +98,8 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
* Keep out of line to prevent kernel bloat.
* __builtin_return_address is not used because it is not always reliable.
*/
-static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
- const char msg[])
+static void __noreturn
+skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[])
{
pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b6da413bcbd6..ac8807732b10 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -145,6 +145,9 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"machine_real_restart",
"rewind_stack_do_exit",
"kunit_try_catch_throw",
+ "__invalid_creds",
+ "do_group_exit",
+ "mpt_halt_firmware",
};
if (!func)
Hello,
On Wed, Jan 22 2020, Josh Poimboeuf wrote:
> Global noreturns are already a pain today. There's no way for objtool
> to know whether GCC considered a function to be noreturn,
You should be able to get a good idea with -Wsuggest-attribute=noreturn:
$ cat a.c
int __attribute__((noreturn)) my_abort (void)
{
__builtin_abort ();
}
int foo (void)
{
return my_abort ();
}
int bar (int flag)
{
if (flag)
foo ();
return 4;
}
$ gcc -S -O2 -Wsuggest-attribute=noreturn a.c
a.c: In function ‘foo’:
a.c:6:5: warning: function might be candidate for attribute ‘noreturn’ [-Wsuggest-attribute=noreturn]
6 | int foo (void)
| ^~~
GCC 9 and newer even have -fdiagnostics-format=json if you are into that
kind of thing.
Hope this helps a little,
Martin
On Wed, 22 Jan 2020, Josh Poimboeuf wrote:
> On Wed, Jan 22, 2020 at 11:09:56AM +0100, Miroslav Benes wrote:
> >
> > > > > At this point, I only see downsides of -flive-patching, at least until
> > > > > we actually have real upstream code which needs it.
> > > >
> > > > Can you explain this? The option makes GCC avoid optimizations which
> > > > are difficult to detect and would make live patching unsafe. I consider it
> > > > useful as it is, so if you shared the other downsides and what you meant
> > > > by real upstream code, we could discuss it.
> > >
> > > Only SLES needs it right? Why inflict it on other livepatch users? By
> > > "real upstream code" I mean there's no (documented) way to create live
> > > patches using the method which relies on this flag. So I don't see any
> > > upstream benefits for having it enabled.
> >
> > I'd put it differently. SLES and upstream need it; RHEL, or anyone using
> > kpatch-build, does not.
>
> I'm confused about why you think upstream needs it.
>
> Is all the tooling available somewhere? Is there documentation
> available which describes how to build patches using that method from
> start to finish? Are there actual users other than SUSE?
>
> BTW, kpatch-build has a *lot* of users other than RHEL. All its tooling
> and documentation are available on Github.
>
> > It is perfectly fine to prepare live patches just from the source code
> > using upstream live patching infrastructure.
>
> Do you mean the dangerous method used by the livepatch sample code which
> completely ignores interprocedural optimizations? I wouldn't call that
> perfectly fine.
>
> > After all, SLES is nothing other than upstream here. We were creating live
> > patches manually for quite a long time and only recently we have been
> > using Nicolai's klp-ccp automation (https://github.com/SUSE/klp-ccp).
> >
> > So, everyone using upstream directly relies on the flag, which seems to be
> > a clear benefit to me. Reverting the patch would be a step back.
>
> Who exactly is "everyone using upstream"?
>
> From what I can tell, kpatch-build is the only known way (to those
> outside of SUSE) to make safe patches for an upstream kernel. And it
> doesn't need this flag and the problems associated with it: performance,
> LTO incompatibility, clang incompatibility (I think?), the GCC dead code
> issue.
I don't think we have anything special at SUSE that is not generally available...
...and I don't think it is really important to discuss that or reply
to the above, because there is a legitimate use case which relies on the
flag. We decided to support different use cases right at the beginning.
I understand it currently complicates things for objtool, but objtool is
sensitive to GCC code generation by definition. "Issues" appear with every
new GCC version. I see no difference here and luckily it is not so
difficult to fix it.
I am happy to help with acting on those objtool warning reports you
mentioned in the other email. Just Cc me where appropriate. We will take a
look.
Regards
Miroslav
On Tue, Jan 28, 2020 at 10:28:07AM +0100, Miroslav Benes wrote:
> I don't think we have anything special at SUSE that is not generally available...
>
> ...and I don't think it is really important to discuss that or reply
> to the above, because there is a legitimate use case which relies on the
> flag. We decided to support different use cases right at the beginning.
>
> I understand it currently complicates things for objtool, but objtool is
> sensitive to GCC code generation by definition. "Issues" appear with every
> new GCC version. I see no difference here and luckily it is not so
> difficult to fix it.
>
> I am happy to help with acting on those objtool warning reports you
> mentioned in the other email. Just Cc me where appropriate. We will take a
> look.
As I said, the objtool warnings aren't even the main issue.
There are N users[*] of CONFIG_LIVEPATCH, where N is perhaps dozens.
For N-1 users, they have to suffer ALL the drawbacks, with NONE of the
benefits.
And, even if they wanted those benefits, they have no idea how to get
them because the patch creation process isn't documented.
And, there's no direct upstream usage of the flag, i.e. the only user
does so in a distro which can easily modify KCFLAGS in the spec file.
As best as I can tell, these are facts, which you seem to keep glossing
over. Did I get any of the facts wrong?
[*] The term 'user' describes the creator/distributor of the
live patches.
--
Josh
On Tue 2020-01-28 09:00:14, Josh Poimboeuf wrote:
> On Tue, Jan 28, 2020 at 10:28:07AM +0100, Miroslav Benes wrote:
> > I don't think we have anything special at SUSE that is not generally available...
> >
> > ...and I don't think it is really important to discuss that or reply
> > to the above, because there is a legitimate use case which relies on the
> > flag. We decided to support different use cases right at the beginning.
> >
> > I understand it currently complicates things for objtool, but objtool is
> > sensitive to GCC code generation by definition. "Issues" appear with every
> > new GCC version. I see no difference here and luckily it is not so
> > difficult to fix it.
> >
> > I am happy to help with acting on those objtool warning reports you
> > mentioned in the other email. Just Cc me where appropriate. We will take a
> > look.
>
> As I said, the objtool warnings aren't even the main issue.
Great.
Anyway, I think that we might make your life easier with using
the proposed -Wsuggest-attribute=noreturn.
Also it might be possible to create the list of global
noreturn functions using some gcc tool, similar to the way we get
the list of functions that need to be livepatched explicitly
because of the problematic optimizations.
It sounds like a win-win approach.
> There are N users[*] of CONFIG_LIVEPATCH, where N is perhaps dozens.
> For N-1 users, they have to suffer ALL the drawbacks, with NONE of the
> benefits.
You wrote in the other mail:
> The problems associated with it: performance, LTO incompatibility,
> clang incompatibility (I think?), the GCC dead code issue.
The SUSE performance team did extensive testing and did not find
any real performance issues. It was discussed when the option
was enabled upstream.
Are the other problems affecting real life usage, please?
Could you be more specific about them, please?
> And, even if they wanted those benefits, they have no idea how to get
> them because the patch creation process isn't documented.
I do not understand this. All the sample modules and selftests are
using source-based livepatches. It is actually the only way that is
documented at all. Sure, the documentation might get improved.
Patches are welcome.
The option is not currently needed by the selftests only because
there is no selftest for this type of problem. But the problems
are real. They would actually deserve selftests. Again, patches
are welcome.
My understanding is that source-based livepatches are the future.
N-1 users are just waiting until the 1 user develops more helper
tools for this. I would really like to hear about some serious problems
before we do this step back in upstream.
Best Regards,
Petr
On Tue, Jan 28, 2020 at 04:40:46PM +0100, Petr Mladek wrote:
> On Tue 2020-01-28 09:00:14, Josh Poimboeuf wrote:
> > On Tue, Jan 28, 2020 at 10:28:07AM +0100, Miroslav Benes wrote:
> > > I don't think we have anything special at SUSE that is not generally available...
> > >
> > > ...and I don't think it is really important to discuss that or reply
> > > to the above, because there is a legitimate use case which relies on the
> > > flag. We decided to support different use cases right at the beginning.
> > >
> > > I understand it currently complicates things for objtool, but objtool is
> > > sensitive to GCC code generation by definition. "Issues" appear with every
> > > new GCC version. I see no difference here and luckily it is not so
> > > difficult to fix it.
> > >
> > > I am happy to help with acting on those objtool warning reports you
> > > mentioned in the other email. Just Cc me where appropriate. We will take a
> > > look.
> >
> > As I said, the objtool warnings aren't even the main issue.
>
> Great.
>
> Anyway, I think that we might make your life easier with using
> the proposed -Wsuggest-attribute=noreturn.
Maybe. Though if I understand correctly, this doesn't help for any of
the new warnings because they're for static functions, and this only
warns about global functions.
> Also it might be possible to create the list of global
> noreturn functions using some gcc tool, similar to the way we get
> the list of functions that need to be livepatched explicitly
> because of the problematic optimizations.
>
> It sounds like a win-win approach.
I don't quite get how that could be done in an automated way, but ideas
about how to implement it would certainly be welcome.
> > There are N users[*] of CONFIG_LIVEPATCH, where N is perhaps dozens.
> > For N-1 users, they have to suffer ALL the drawbacks, with NONE of the
> > benefits.
>
> You wrote in the other mail:
>
> > The problems associated with it: performance, LTO incompatibility,
> > clang incompatibility (I think?), the GCC dead code issue.
>
> The SUSE performance team did extensive testing and did not find
> any real performance issues. It was discussed when the option
> was enabled upstream.
>
> Are the other problems affecting real life usage, please?
> Could you be more specific about them, please?
The original commit mentioned 1-3% scheduler degradation. And I'd
expect things to worsen over time as interprocedural optimizations
improve.
Also, LTO is coming whether we like it or not. As is Clang. Those are
real-world things which will need to work with livepatching sooner or
later.
> > And, even if they wanted those benefits, they have no idea how to get
> > them because the patch creation process isn't documented.
>
> I do not understand this. All the sample modules and selftests are
> using source-based livepatches.
We're talking in circles. Have you read the thread?
The samples are a (dangerous) joke. With or without -flive-patching.
> It is actually the only way that is documented at all. Sure, the
> documentation might get improved. Patches are welcome.
Are you suggesting for *me* to send documentation for how *you* build
patches?
> The option is not currently needed by the selftests only because there
> is no selftest for this type of problem. But the problems are real.
> They would actually deserve selftests. Again, patches are welcome.
>
> My understanding is that source-based livepatches are the future.
I think that still remains to be seen.
> N-1 users are just waiting until the 1 user develops more helper tools
> for this.
No. N-1 users have no idea how to make (safe) source-based patches in
the first place. And if *you* don't need the tools, why would anyone
else? Why not document the process and encourage the existence of other
users so they can get involved and help with the tooling?
> I would really like to hear about some serious problems
> before we do this step back in upstream.
Sometimes you need to take 1 step back before you can take 2 steps
forward. I regret ACKing the original patch. It was too early.
--
Josh
On Tue, 28 Jan 2020, Josh Poimboeuf wrote:
> > Anyway, I think that we might make your life easier with using the
> > proposed -Wsuggest-attribute=noreturn.
>
> Maybe. Though if I understand correctly, this doesn't help for any of
> the new warnings because they're for static functions, and this only
> warns about global functions.
Could you please provide a pointer where those have been
reported/analyzed?
For the cases I've seen so far, it has always been gcc deciding under
certain circumstances not to propagate __attribute__((__noreturn__)) from
callee to caller even in the cases when caller unconditionally called
callee.
AFAIU, the behavior is (and always will be) dependent on the state of gcc
optimizations, and therefore I don't see any other way than adding
__noreturn annotations transitively everywhere in order to silence objtool.
So those cases have to be fixed anyway.
What are the other cases please? Either I have completely missed those, or
they haven't been mentioned in this thread.
Thanks,
--
Jiri Kosina
SUSE Labs
On Wed, Jan 29, 2020 at 01:46:55AM +0100, Jiri Kosina wrote:
> On Tue, 28 Jan 2020, Josh Poimboeuf wrote:
>
> > > Anyway, I think that we might make your life easier with using the
> > > proposed -Wsuggest-attribute=noreturn.
> >
> > Maybe. Though if I understand correctly, this doesn't help for any of
> > the new warnings because they're for static functions, and this only
> > warns about global functions.
>
> Could you please provide a pointer where those have been
> reported/analyzed?
>
> For the cases I've seen so far, it has always been gcc deciding under
> certain circumstances not to propagate __attribute__((__noreturn__)) from
> callee to caller even in the cases when caller unconditionally called
> callee.
>
> AFAIU, the behavior is (and always will be) dependent on the state of gcc
> optimizations, and therefore I don't see any other way than adding
> __noreturn annotations transitively everywhere in order to silence objtool.
>
> So those cases have to be fixed anyway.
>
> What are the other cases please? Either I have completely missed those, or
> they haven't been mentioned in this thread.
For example, see:
https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git/commit/?h=objtool-fixes&id=6265238af90b395a1e5e5032a41f012a552d8a9e
Many of those callees are static noreturns, for which we've *never*
needed annotations. Disabling -fipa-pure-const has apparently changed
that.
-Wsuggest-attribute=noreturn doesn't seem to suggest annotations for
static functions, probably because most reasonable setups use -O2 which
allows GCC to detect them.
--
Josh
On Tue, 28 Jan 2020, Josh Poimboeuf wrote:
> > For the cases I've seen so far, it has always been gcc deciding under
> > certain circumstances not to propagate __attribute__((__noreturn__)) from
> > callee to caller even in the cases when caller unconditionally called
> > callee.
> >
> > AFAIU, the behavior is (and always will be) dependent on the state of gcc
> > optimizations, and therefore I don't see any other way than adding
> > __noreturn annotations transitively everywhere in order to silence objtool.
> >
> > So those cases have to be fixed anyway.
> >
> > What are the other cases please? Either I have completely missed those, or
> > they haven't been mentioned in this thread.
>
> For example, see:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git/commit/?h=objtool-fixes&id=6265238af90b395a1e5e5032a41f012a552d8a9e
>
> Many of those callees are static noreturns, for which we've *never*
> needed annotations. Disabling -fipa-pure-const has apparently changed
> that.
For some reason I thought you were talking about static inlines, sorry for
the noise.
Yeah, so I agree with you -- whether we need those annotations depends on
compiler implementation of optimizations, and most importantly on (the
current state of) internal implementation of specific optimizations in
gcc.
Leaving live patching completely aside for the sake of this discussion for
now -- I believe we either fully rely on gcc to propagate the 'noreturn'
property throughout the call stack upward, or we don't.
If we don't, then we do need the annotations (both the global and static
ones), and problem solved.
If we do, well, where is the 'this is *the* behavior of any current/future
clang^Wcompiler' invariant guarantee?
Thanks,
--
Jiri Kosina
SUSE Labs
> > > There are N users[*] of CONFIG_LIVEPATCH, where N is perhaps dozens.
> > > For N-1 users, they have to suffer ALL the drawbacks, with NONE of the
> > > benefits.
> >
> > You wrote in the other mail:
> >
> > > The problems associated with it: performance, LTO incompatibility,
> > > clang incompatibility (I think?), the GCC dead code issue.
> >
> > The SUSE performance team did extensive testing and did not find
> > any real performance issues. It was discussed when the option
> > was enabled upstream.
> >
> > Are the other problems affecting real life usage, please?
> > Could you be more specific about them, please?
>
> The original commit mentioned 1-3% scheduler degradation. And I'd
> expect things to worsen over time as interprocedural optimizations
> improve.
Or maybe not.
Anyway, -flive-patching does not disable all interprocedural
optimizations, far from it. Only a subset of optimizations whose use in
the Linux kernel is reportedly not that prominent (compared to heavily
C++-template-based code bases). Reportedly, because we did some tests,
but nothing exhaustive. So I'd leave any expectations aside for now.
The fact is that -fno-ipa-pure-const caused the objtool issue. One could
argue that it should be fixed anyway, because it relies on GCC internal
implementation which could easily change, and we luckily found it out
thanks to -flive-patching. But you pointed out that was not even the main
problem here, so I'd leave it for the separate subthread which Jiri
started.
Regarding the scheduler degradation (the hackbench performance
degradation, to be clear), it might be interesting to find out what
really changed there: which disabled optimization caused it, and how.
Maybe it could be regained if proven again (because it may have changed,
right?).
It all sounds artificial to me though. I am not saying the degradation is
not there, but many people also lived with frame pointers enabled for
quite a long time and no one seemed to be bothered. And that was even more
serious, because the decline was bigger and it was measurable in many
workflows, not just a scheduler microbenchmark. That is why Petr asked
about real-life reports, I guess.
> Also, LTO is coming whether we like it or not. As is Clang. Those are
> real-world things which will need to work with livepatching sooner or
> later.
Yes, but we are not there yet. Once a user has a problem with that, we will
try to solve it.
LTO might not be a big problem. The number of ipa clones would probably
grow, but that is not directly dangerous. It remains to be seen.
I don't know much about Clang.
> > > And, even if they wanted those benefits, they have no idea how to get
> > > them because the patch creation process isn't documented.
> >
> > I do not understand this. All the sample modules and selftests are
> > using source based livepatches.
>
> We're talking in circles. Have you read the thread?
>
> The samples are a (dangerous) joke. With or without -flive-patching.
How come?
In my opinion, the samples and selftests try to show the way to prepare a
(simple, yes) live patch. We try to ensure it always works (selftests
should).
After all, there is not much more we do at SUSE to prepare a live patch
(a rough sketch of the result follows the list below):
1. take a patch and put all touched functions in a live patch
2. if the functions cannot be patched, patch their callers
3. do the function closure and/or add references (relocations or
kallsyms trick) so it can all be compiled.
4. done
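As promised above, a rough sketch of what step 1 boils down to -- modelled
on samples/livepatch/livepatch-sample.c, with purely illustrative function
names -- a live patch module is not much more than this:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/livepatch.h>

/* Replacement body for the function the original fix touched. */
static int livepatch_fixed_foo(void)
{
        /* the fixed implementation goes here */
        return 0;
}

static struct klp_func funcs[] = {
        {
                .old_name = "foo",              /* illustrative name */
                .new_func = livepatch_fixed_foo,
        }, { }
};

static struct klp_object objs[] = {
        {
                /* .name = NULL patches vmlinux; use a module name otherwise */
                .funcs = funcs,
        }, { }
};

static struct klp_patch patch = {
        .mod = THIS_MODULE,
        .objs = objs,
};

static int livepatch_init(void)
{
        return klp_enable_patch(&patch);
}

static void livepatch_exit(void)
{
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");

Steps 2 and 3 are then only about deciding which functions have to be
listed in funcs[].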
See? Samples and selftests are not different. Our live patches are not
different (https://kernel.suse.com/cgit/kernel-livepatch/). Can we
implement the samples and selftests without -flive-patching? No, not
really. Or we could, but no guarantees they would work.
For 2., we use -fdump-ipa-clones and Martin Liska's tool
(https://github.com/marxin/kgraft-analysis-tool) to parse the output.
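For a feel of what the tool looks for, a sketch of mine (the names are
made up): with -O2, gcc may specialize a static function into a clone
such as bar.constprop.0, and a live patch replacing only bar() would miss
the clone:

/* Illustrative only; the names are made up. */
static int bar(int x, int flag)
{
        return flag ? x * 2 : x;
}

int baz(int x)
{
        /*
         * gcc may constant-propagate flag=1 here and emit a
         * specialized clone (e.g. bar.constprop.0) that baz() calls
         * instead of the original bar().
         */
        return bar(x, 1);
}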
Yes, sometimes it is more complicated. The source-based approach allows
us to cope with that quite well. But that is case by case and cannot
easily be documented.
Do we lack the documentation of our approach? Definitely. We are moving to
klp-ccp automation now (https://github.com/SUSE/klp-ccp) and once done
completely, we will hopefully have some documentation. CCing Nicolai if he
wants to add something.
Should it be upstream? I don't know. I don't think so. For the same reason
kpatch-build documentation is not upstream either. Use cases of the
infrastructure differ. Maybe there are users who use it in a completely
different way. I don't know. In fact, it does not matter to me. I think we
should support all of them if they make sense.
And that is my message, which (in my opinion) makes more sense. Definitely
more sense than the "kpatch-build is the only safe way to prepare a live
patch" mantra you are trying to sell here for whatever reason. I don't
agree with it.
> > It is actually the only somewhat documented way. Sure, the
> > documentation might get improved. Patches are welcome.
>
> Are you suggesting for *me* to send documentation for how *you* build
> patches?
I don't think that is what Petr meant (he will definitely correct me). If
you think there is space for improvement in our upstream documentation
of the infrastructure, you are welcome to send patches. The space is
definitely there.
> > The option is not currently needed by the selftests only because there
> > is no selftest for this type of problem. But the problems are real.
> > They would actually deserve selftests. Again, patches are welcome.
> >
> > My understanding is that source-based livepatches are the future.
>
> I think that still remains to be seen.
>
> > N-1 users are just waiting until the 1 user develops more helper tools
> > for this.
>
> No. N-1 users have no idea how to make (safe) source-based patches in
> the first place. And if *you* don't need the tools, why would anyone
> else? Why not document the process and encourage the existence of other
> users so they can get involved and help with the tooling?
I replied to this one above. You are right that we should document our
approach better. I think it is off topic for the thread and the problem
here.
Regards
Miroslav
On Wed, Jan 29, 2020 at 01:28:30PM +0100, Miroslav Benes wrote:
> The fact is that -fno-ipa-pure-const caused the objtool issue. One could
> argue that it should be fixed anyway, because it relies on GCC internal
> implementation which could easily change, and we luckily found it out
> thanks to -flive-patching. But you pointed out that was not even the main
> problem here, so I'd leave it for the separate subthread which Jiri
> started.
It's not an objtool "issue". The warnings were correct. And objtool
*has* to rely on GCC internals.
And why would this particular internal implementation ever change
(detecting static noreturns)? I don't see why optimizing the call
interface to a pure/const static function would break GCC's implicit
noreturn detection anyway. It smells like a GCC bug.
> Regarding the scheduler degradation -- a hackbench performance
> degradation, to be clear. It might be interesting to find out what really
> changed there: which disabled optimization caused it, and how. Maybe it
> could be won back if it is proven again (because it may have changed,
> right?).
Fixing the scheduler performance regression would be a good thing to
have done *before* merging the patch.
> It all sounds artificial to me, though. I am not saying the degradation
> is not there, but many people also lived with frame pointers enabled for
> quite a long time and no one seemed to be bothered. And that was even more
> serious, because the decline was bigger and it was measurable in many
> workloads, not just a scheduler microbenchmark. That is why Petr asked
> about real life reports, I guess.
Many people were happy to get rid of frame pointers.
> > The samples are a (dangerous) joke. With or without -flive-patching.
>
> How come?
>
> In my opinion, the samples and selftests try to show the way to prepare a
> (simple, yes) live patch. We try to ensure it always works (selftests
> should).
>
> After all, there is not much more we do at SUSE to prepare a live patch
> (a rough sketch of the result follows the list below):
>
> 1. take a patch and put all touched functions in a live patch
> 2. if the functions cannot be patched, patch their callers
> 3. do the function closure and/or add references (relocations or
> kallsyms trick) so it can all be compiled.
> 4. done
>
> See? Samples and selftests are not different.
How much ABI optimization analysis was done before creating the samples?
(hint: none)
And how would somebody using the samples as a guide know to do all that?
> Do we lack the documentation of our approach? Definitely. We are moving to
> klp-ccp automation now (https://github.com/SUSE/klp-ccp) and once done
> completely, we will hopefully have some documentation. CCing Nicolai if he
> wants to add something.
>
> Should it be upstream? I don't know. I don't think so. For the same reason
> kpatch-build documentation is not upstream either. Use cases of the
> infrastructure differ. Maybe there are users who use it in a completely
> different way. I don't know. In fact, it does not matter to me. I think we
> should support all of them if they make sense.
Of course the documentation should be in-tree. Otherwise the samples
are *very* misleading.
> And that is my message, which (in my opinion) makes more sense. Definitely
> more sense than the "kpatch-build is the only safe way to prepare a live
> patch" mantra you are trying to sell here for whatever reason. I don't
> agree with it.
Of course I didn't say that.
The only thing I'm trying to "sell" is that this flag has several
drawbacks and no benefits for the upstream community. Why do you keep
dancing around that unavoidable fact?
> > > It is actually the only somewhat documented way. Sure, the
> > > documentation might get improved. Patches are welcome.
> >
> > Are you suggesting for *me* to send documentation for how *you* build
> > patches?
>
> I don't think that is what Petr meant (he will definitely correct me). If
> you think there is space for improvement in our upstream documentation
> of the infrastructure, you are welcome to send patches. The space is
> definitely there.
If you want to use the -flive-patching flag for CONFIG_LIVEPATCH, then
yes, there's a huge gap in the documentation. I don't understand why
you seem to be suggesting that I'm the one who's qualified to write that
documentation.
> > > N-1 users are just waiting until the 1 user develops more helper tools
> > > for this.
> >
> > No. N-1 users have no idea how to make (safe) source-based patches in
> > the first place. And if *you* don't need the tools, why would anyone
> > else? Why not document the process and encourage the existence of other
> > users so they can get involved and help with the tooling?
>
> I replied to this one above. You are right that we should document our
> approach better. I think it is off topic for the thread and the problem
> here.
It's actually very much on-topic. It's one of the main reasons why I
wanted to revert the patch. Surely that's clear by now?
In retrospect, the prerequisites for merging it should have been:
1) Document how source-based patches can be safely generated;
2) Fix the scheduler performance regression;
3) Figure out if there are any other regressions by detecting which
function interfaces are affected by the flag and seeing if they're
in hot paths;
4) Provide a way for the N-1 users to opt out;
5) Fix the objtool warnings (or is it a GCC bug?);
6) Make -flive-patching compatible with LTO (or at least acknowledge
that it should and will be done soon);
7) At least make it build- or runtime-incompatible with Clang-built
kernels to prevent people from assuming it's safe.
If you don't want to revert the patch, then address my concerns instead
of minimizing and deflecting at every opportunity.
--
Josh
On Wed 2020-01-29 09:59:51, Josh Poimboeuf wrote:
> In retrospect, the prerequisites for merging it should have been:
OK, let me do one more move in this game.
> 1) Document how source-based patches can be safely generated;
I agree that the information is really scattered over many files
in Documentation/livepatch/. Anyway, there are a lot of useful
hints:
+ structure and behavior of the livepatch module, a link
to a sample, and limitations are described in livepatch.rst
+ many other gotchas are described in the other files:
callbacks, module-elf-format, cumulative-patches,
system-state.
Yes, it would be great to have a better structure and more information.
But do not get me wrong. Anyone, Joe definitely, is able to create
a livepatch from sources using this information.
Anyone could play with it, ask questions, and improve the
documentation. Better documentation would help, but it is
not a blocker, definitely.
> 2) Fix the scheduler performance regression;
The optimizations are disabled only when livepatching is enabled.
I would consider this the price of the feature. There are
many things like this.
As was said, it was 1-3 percent in a scheduler microbenchmark.
It would make sense to fix it only if it caused such a regression
in real workloads. Do you have any?
> 3) Figure out if there are any other regressions by detecting which
> function interfaces are affected by the flag and seeing if they're
> in hot paths;
IMHO, benchmarks are much more effective, and we spent non-trivial
resources on running them.
> 4) Provide a way for the N-1 users to opt out;
AFAIK, the only price is the 1-3 percent scheduler performance degradation.
If you really do not want to pay this price, let's make it configurable.
But the option is definitely needed when source livepatches are used.
There is no other reasonable way to detect and work around these
problems. For this, it has to be in the upstream kernel. It is in line
with the effort to make livepatching less and less error prone.
And please, let's stop playing these multi-user games. There is at least
one known user of source-based livepatches. By coincidence, it is also
a big contributor to this subsystem. Adding an extra option into
CFLAGS is quite error prone. You can imagine how complicated a kernel
rpm spec file gets with multiple kernel flavors. The only safe way
is to have the optimization tied to the CONFIG option in the
kernel sources.
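For reference, the wiring in question is just a couple of lines in the
top-level Makefile (quoted from memory, so take it as a sketch); the flag
is added only when the config option is set, and cc-option keeps it from
breaking compilers that do not know it:

ifdef CONFIG_LIVEPATCH
KBUILD_CFLAGS += $(call cc-option, -flive-patching=inline-clone)
endif

An opt-out as suggested above could simply gate these lines on a
separate (hypothetical) Kconfig knob instead of CONFIG_LIVEPATCH itself.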
> 5) Fix the objtool warnings (or is it a GCC bug?);
Nobody was aware of them. I wonder if they even existed at that time.
We have a simple fix now. Let's continue in the thread started by
Jikos to see if we can get a better solution.
> 6) Make -flive-patching compatible with LTO (or at least acknowledge
> that it should and will be done soon);
Is LTO officially supported upstream?
Are all patches and features tested for LTO compatibility?
Is there any simple way to build and run an LTO kernel?
> 7) At least make it build- or runtime-incompatible with Clang-built
> kernels to prevent people from assuming it's safe.
Same questions as for LTO.
> If you don't want to revert the patch, then address my concerns instead
> of minimizing and deflecting at every opportunity.
I would really like to keep focusing on realistic problems and
realistic solutions:
+ make the optimization configurable if you insist on it
+ fix the objtool warnings
Anything else is out of scope of this thread from my POV.
Best Regards,
Petr
On Thu, Jan 30, 2020 at 10:53:46AM +0100, Petr Mladek wrote:
> On Wed 2020-01-29 09:59:51, Josh Poimboeuf wrote:
> > In retrospect, the prerequisites for merging it should have been:
>
> OK, let me do one more move in this game.
>
>
> > 1) Document how source-based patches can be safely generated;
>
> I agree that the information is really scattered over many files
> in Documentation/livepatch/.
Once again you're blithely ignoring my point and pretending I'm saying
something else. And you did that again further down in the email, but
what's the point of arguing if you're not going to listen.
This has nothing to do with the organization of the existing
documentation. When did I say that?
Adding the -flive-patching flag doesn't remove *all*
function-ABI-breaking optimizations. It's only a partial solution. The
rest of the solution involves tooling and processes which need to be
documented. But you already know that.
If we weren't co-maintainers I would have reverted the patch days ago.
I've tried to give you all the benefit of the doubt. But you seem to be
playing company politics.
I would ask that you please put on your upstream hats and stop playing
politics. If the patch creation process is a secret, then by all means,
keep it secret. But then keep your GCC flag to yourself.
--
Josh
On Thu 2020-01-30 08:17:33, Josh Poimboeuf wrote:
> On Thu, Jan 30, 2020 at 10:53:46AM +0100, Petr Mladek wrote:
> > On Wed 2020-01-29 09:59:51, Josh Poimboeuf wrote:
> > > In retrospect, the prerequisites for merging it should have been:
> >
> > OK, let me do one more move in this game.
> >
> >
> > > 1) Document how source-based patches can be safely generated;
> >
> > I agree that the information is really scattered over many files
> > in Documentation/livepatch/.
>
> Once again you're blithely ignoring my point and pretending I'm saying
> something else. And you did that again further down in the email, but
> what's the point of arguing if you're not going to listen.
I have exactly the same feeling, only the other way around.
> I would ask that you please put on your upstream hats and stop playing
> politics. If the patch creation process is a secret, then by all means,
> keep it secret. But then keep your GCC flag to yourself.
The thing is that we do not have any magic secret.
Best Regards,
Petr