Date: Mon, 18 Dec 2017 18:51:00 +0900
From: Masami Hiramatsu
To: Josef Bacik
Cc: rostedt@goodmis.org, mingo@redhat.com, davem@davemloft.net,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, ast@kernel.org,
	kernel-team@fb.com, daniel@iogearbox.net, linux-btrfs@vger.kernel.org,
	darrick.wong@oracle.com, Josef Bacik
Subject: Re: [PATCH v10 3/5] bpf: add a bpf_override_function helper
Message-Id: <20171218185100.4a8174488dfebf93d28899c3@kernel.org>
In-Reply-To: <1513365176-6744-4-git-send-email-josef@toxicpanda.com>
References: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>
	<1513365176-6744-4-git-send-email-josef@toxicpanda.com>

On Fri, 15 Dec 2017 14:12:54 -0500
Josef Bacik wrote:

> From: Josef Bacik
> 
> Error injection is sloppy and very ad-hoc.  BPF could fill this niche
> perfectly with its kprobe functionality.  We could make sure errors are
> only triggered in specific call chains that we care about with very
> specific situations.  Accomplish this with the bpf_override_function
> helper.  This will modify the probed caller's return value to the
> specified value and set the PC to an override function that simply
> returns, bypassing the originally probed function.  This gives us a
> nice clean way to implement systematic error injection for all of our
> code paths.

OK, got it. I think the error_injectable function list should be defined
in kernel/trace/bpf_trace.c, because only bpf calls it and has to care
about its safety.

[...]

> diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
> index 8dc0161cec8f..1ea748d682fd 100644
> --- a/arch/x86/kernel/kprobes/ftrace.c
> +++ b/arch/x86/kernel/kprobes/ftrace.c
> @@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
>  	p->ainsn.boostable = false;
>  	return 0;
>  }
> +
> +asmlinkage void override_func(void);
> +asm(
> +	".type override_func, @function\n"
> +	"override_func:\n"
> +	"	ret\n"
> +	".size override_func, .-override_func\n"
> +);
> +
> +void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
> +{
> +	regs->ip = (unsigned long)&override_func;
> +}
> +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);

Calling this "override_function" is not very meaningful, since it is just
a function which returns immediately. So I think the combination of
just_return_func() and arch_bpf_override_func_just_return() would be
better; a rough sketch is below.

Moreover, arch/x86/kernel/kprobes/ftrace.c is an architecture-dependent
implementation of kprobes, not of bpf. Hmm, would arch/x86/net/bpf_jit_comp.c
be a better place? (why don't we have an arch/x86/kernel/bpf.c?)

[..]
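For reference, here is a minimal sketch of the renaming I have in mind
(untested, and where it should live is still the open question above):

/* A function which does nothing but return, used as the override target. */
asmlinkage void just_return_func(void);
asm(
	".type just_return_func, @function\n"
	"just_return_func:\n"
	"	ret\n"
	".size just_return_func, .-just_return_func\n"
);

/* Make the probed function "just return" by rewriting the saved ip. */
void arch_bpf_override_func_just_return(struct pt_regs *regs)
{
	regs->ip = (unsigned long)&just_return_func;
}
NOKPROBE_SYMBOL(arch_bpf_override_func_just_return);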
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 492700c5fb4d..91f4b57dab82 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -42,6 +42,7 @@ struct trace_kprobe {
>  	(offsetof(struct trace_kprobe, tp.args) +	\
>  	(sizeof(struct probe_arg) * (n)))
>  
> +DEFINE_PER_CPU(int, bpf_kprobe_override);
>  
>  static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
>  {
> @@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
>  	return nhit;
>  }
>  
> +int trace_kprobe_ftrace(struct trace_event_call *call)
> +{
> +	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
> +	return kprobe_ftrace(&tk->rp.kp);
> +}
> +
> +int trace_kprobe_error_injectable(struct trace_event_call *call)
> +{
> +	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
> +	unsigned long addr;
> +
> +	if (tk->symbol) {
> +		addr = (unsigned long)
> +			kallsyms_lookup_name(trace_kprobe_symbol(tk));
> +		addr += tk->rp.kp.offset;

If the tk is already registered, you don't need to look up the address
again; you can just use kp.addr (see the sketch further below). Anyway,
kprobe_ftrace() also requires that the kprobe is already registered.

> +	} else {
> +		addr = (unsigned long)tk->rp.kp.addr;
> +	}
> +	return within_kprobe_error_injection_list(addr);
> +}
> +
>  static int register_kprobe_event(struct trace_kprobe *tk);
>  static int unregister_kprobe_event(struct trace_kprobe *tk);
>  
> @@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
>  #ifdef CONFIG_PERF_EVENTS
>  
>  /* Kprobe profile handler */
> -static void
> +static int
>  kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
>  {
>  	struct trace_event_call *call = &tk->tp.call;
> @@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
>  	int size, __size, dsize;
>  	int rctx;
>  
> -	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
> -		return;
> +	if (bpf_prog_array_valid(call)) {
> +		int ret;
> +
> +		ret = trace_call_bpf(call, regs);
> +
> +		/*
> +		 * We need to check and see if we modified the pc of the
> +		 * pt_regs, and if so clear the kprobe and return 1 so that we
> +		 * don't do the instruction skipping.  Also reset our state so
> +		 * we are clean the next pass through.
> +		 */
> +		if (__this_cpu_read(bpf_kprobe_override)) {
> +			__this_cpu_write(bpf_kprobe_override, 0);
> +			reset_current_kprobe();

OK, I will fix this issue (reset kprobe and preempt-enable) by removing
jprobe soon. (Currently I am waiting for the removal of the
{tcp,sctp,dccp}_probe code, which are the only users of jprobe in the
kernel.)
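BTW, assuming the kprobe is already registered at that point, the
error-injectable check above could be reduced to something like this
(just an untested sketch):

int trace_kprobe_error_injectable(struct trace_event_call *call)
{
	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;

	/* kp.addr already holds symbol + offset, resolved at registration. */
	return within_kprobe_error_injection_list((unsigned long)tk->rp.kp.addr);
}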
Thank you,

> +			return 1;
> +		}
> +		if (!ret)
> +			return 0;
> +	}
>  
>  	head = this_cpu_ptr(call->perf_events);
>  	if (hlist_empty(head))
> -		return;
> +		return 0;
>  
>  	dsize = __get_data_size(&tk->tp, regs);
>  	__size = sizeof(*entry) + tk->tp.size + dsize;
> @@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
>  
>  	entry = perf_trace_buf_alloc(size, NULL, &rctx);
>  	if (!entry)
> -		return;
> +		return 0;
>  
>  	entry->ip = (unsigned long)tk->rp.kp.addr;
>  	memset(&entry[1], 0, dsize);
>  	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
>  	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
>  			      head, NULL);
> +	return 0;
>  }
>  NOKPROBE_SYMBOL(kprobe_perf_func);
>  
> @@ -1275,16 +1315,24 @@ static int kprobe_register(struct trace_event_call *event,
>  static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
>  {
>  	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
> +	int ret = 0;
>  
>  	raw_cpu_inc(*tk->nhit);
>  
>  	if (tk->tp.flags & TP_FLAG_TRACE)
>  		kprobe_trace_func(tk, regs);
>  #ifdef CONFIG_PERF_EVENTS
> -	if (tk->tp.flags & TP_FLAG_PROFILE)
> -		kprobe_perf_func(tk, regs);
> +	if (tk->tp.flags & TP_FLAG_PROFILE) {
> +		ret = kprobe_perf_func(tk, regs);
> +		/*
> +		 * The ftrace kprobe handler leaves it up to us to re-enable
> +		 * preemption here before returning if we've modified the ip.
> +		 */
> +		if (ret)
> +			preempt_enable_no_resched();
> +	}
>  #endif
> -	return 0;	/* We don't tweek kernel, so just return 0 */
> +	return ret;
>  }
>  NOKPROBE_SYMBOL(kprobe_dispatcher);
>  
> diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
> index fb66e3eaa192..5e54d748c84c 100644
> --- a/kernel/trace/trace_probe.h
> +++ b/kernel/trace/trace_probe.h
> @@ -252,6 +252,8 @@ struct symbol_cache;
>  unsigned long update_symbol_cache(struct symbol_cache *sc);
>  void free_symbol_cache(struct symbol_cache *sc);
>  struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
> +int trace_kprobe_ftrace(struct trace_event_call *call);
> +int trace_kprobe_error_injectable(struct trace_event_call *call);
>  #else
>  /* uprobes do not support symbol fetch methods */
>  #define fetch_symbol_u8			NULL
> @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
>  {
>  	return NULL;
>  }
> +
> +static inline int trace_kprobe_ftrace(struct trace_event_call *call)
> +{
> +	return 0;
> +}
> +
> +static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_KPROBE_EVENTS */
>  
>  struct probe_arg {
> -- 
> 2.7.5
> 


-- 
Masami Hiramatsu