2017-12-06 00:34:41

by Yonghong Song

Subject: Re: [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86

Hi, Ingo and Peter,

Could you take a look at this patch and, if there are no
objections, merge it into tip? The patch has been reviewed by
Oleg Nesterov.

Thanks!

Yonghong

On 11/30/17 4:12 PM, Yonghong Song wrote:
> Uprobe is a tracing mechanism for userspace programs.
> A typical uprobe incurs the overhead of two traps:
> the first trap is caused by the breakpoint insn that
> replaces the original insn, and the second trap executes
> the original, displaced insn in user space.
>
> To reduce this overhead, the kernel provides hooks
> for architectures to emulate the original insn and
> skip the second trap. On x86, emulation is already
> done for certain branch insns.
>
> This patch extends the emulation to "push <reg>"
> insns, which typically appear at the beginning of a
> function. For example, the bcc repo
> (https://github.com/iovisor/bcc) provides tools to
> measure function latency (funclatency), detect memory
> leaks (memleak), etc. These tools place uprobes at the
> beginning of a function and possibly uretprobes at the
> end of the function. This patch reduces the trap
> overhead for such a uprobe from 2 to 1.
>
> Without this patch, a uretprobe typically incurs
> three traps. With this patch, if the function starts
> with a "push" insn, the number of traps is reduced
> from 3 to 2.
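>
> As a side note (not part of the patch), here is a minimal userspace
> sketch of the idea, assuming a 64-bit task. The struct and names
> below are made up purely for illustration; the actual patch decodes
> the insn in push_setup_xol_ops() and performs the state change in
> push_emulate_op()/emulate_push_stack() on struct pt_regs and user
> memory.
>
> /*
>  * Illustration only: "push <reg>" is the single-byte opcode 0x50+reg,
>  * optionally preceded by the REX.B prefix 0x41 to select r8..r15.
>  * Emulating it means: decrement sp by the word size, write the
>  * register value at the new sp, and advance ip past the insn,
>  * so no second trap is needed.
>  */
> #include <stdio.h>
>
> struct toy_regs {
> 	unsigned long gpr[16];	/* 0..7: rax..rdi, 8..15: r8..r15 */
> 	unsigned long sp, ip;
> };
>
> static int decode_push(const unsigned char *insn, int len)
> {
> 	int base = 0, idx = 0;
>
> 	if (len == 2 && insn[0] == 0x41) {	/* REX.B: r8..r15 */
> 		base = 8;
> 		idx = 1;
> 	} else if (len != 1)
> 		return -1;
> 	if (insn[idx] < 0x50 || insn[idx] > 0x57)
> 		return -1;
> 	return base + (insn[idx] - 0x50);	/* register number */
> }
>
> int main(void)
> {
> 	unsigned char insn[] = { 0x55 };	/* push %rbp */
> 	struct toy_regs r = { .sp = 0x7ffffffff000, .ip = 0x400500 };
> 	int reg = decode_push(insn, sizeof(insn));
>
> 	if (reg >= 0) {
> 		/* what the emulation does instead of the second trap */
> 		r.sp -= sizeof(unsigned long);	/* make room on the stack */
> 		/* the register value r.gpr[reg] would be copied to
> 		   user address r.sp here */
> 		r.ip += sizeof(insn);		/* skip over the push insn */
> 		printf("emulated push of reg %d, sp=%#lx ip=%#lx\n",
> 		       reg, r.sp, r.ip);
> 	}
> 	return 0;
> }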
>
> An experiment was conducted on two local VMs, a
> Fedora 26 64-bit VM and a 32-bit VM, each with 4
> processors and 4GB of memory, booted with the latest
> tip repo (plus this patch). The host is a MacBook
> with an Intel i7 processor.
>
> The test program looks like
> #include <stdio.h>
> #include <stdlib.h>
> #include <time.h>
> #include <sys/time.h>
>
> static void test() __attribute__((noinline));
> void test() {}
> int main() {
> 	struct timeval start, end;
>
> 	gettimeofday(&start, NULL);
> 	for (int i = 0; i < 1000000; i++) {
> 		test();
> 	}
> 	gettimeofday(&end, NULL);
>
> 	printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec)
> 	       - (start.tv_sec * 1000000 + start.tv_usec)));
> 	return 0;
> }
>
> The program is compiled without optimization, and
> the first insn of function "test" is "push %rbp".
> The host is relatively idle.
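>
> As a quick sanity check of that claim (an illustration only, separate
> from the timing test above), the first code byte of test() can be
> inspected from the program itself; 0x55 is the encoding of "push %rbp":
>
> #include <stdio.h>
>
> static void test() __attribute__((noinline));
> void test() {}
>
> int main() {
> 	unsigned char first = *(unsigned char *)(unsigned long)test;
>
> 	printf("first byte of test(): 0x%02x%s\n", first,
> 	       first == 0x55 ? " (push %rbp)" : "");
> 	return 0;
> }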
>
> Before the test run, the probe is inserted as below. For the uprobe:
> echo 'p <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
> echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
> and for uretprobe:
> echo 'r <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
> echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
>
> Unit: microseconds (usec) per loop iteration
>
> x86_64          W/ this patch    W/O this patch
> uprobe               1.55              3.1
> uretprobe            2.0               3.6
>
> x86_32          W/ this patch    W/O this patch
> uprobe               1.41              3.5
> uretprobe            1.75              4.0
>
> You can see that this patch significantly reduces the overhead:
> 50% for uprobe and 44% for uretprobe on x86_64, and even more
> on x86_32.
>
> Signed-off-by: Yonghong Song <[email protected]>
> Reviewed-by: Oleg Nesterov <[email protected]>
> ---
> arch/x86/include/asm/uprobes.h | 4 ++
> arch/x86/kernel/uprobes.c | 107 +++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 107 insertions(+), 4 deletions(-)
>
> Changelogs:
> v4 -> v5:
> . No code change from v4.
> . Rebased on top of tip and added Reviewed-by from Oleg.
> v3 -> v4:
> . Reverted most of the v3 change, as 32-bit emulation does not really work
>   on the x86_64 platform: emulate_push_stack() would need to account
>   for a 32-bit app on a 64-bit platform. A separate effort is ongoing to
>   address this issue.
> v2 -> v3:
> . Do not emulate 32-bit applications on x86_64 platforms.
> v1 -> v2:
> . Address Oleg's comments
>
> diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> index 74f4c2f..d8bfa98 100644
> --- a/arch/x86/include/asm/uprobes.h
> +++ b/arch/x86/include/asm/uprobes.h
> @@ -53,6 +53,10 @@ struct arch_uprobe {
>  			u8	fixups;
>  			u8	ilen;
>  		}			defparam;
> +		struct {
> +			u8	reg_offset;	/* to the start of pt_regs */
> +			u8	ilen;
> +		}			push;
>  	};
>  };
>
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index a3755d2..85c7ef2 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  	return 0;
>  }
>
> -static int push_ret_address(struct pt_regs *regs, unsigned long ip)
> +static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
>  {
>  	unsigned long new_sp = regs->sp - sizeof_long();
>
> -	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
> +	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
>  		return -EFAULT;
>
>  	regs->sp = new_sp;
> @@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
>  		regs->ip += correction;
>  	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
>  		regs->sp += sizeof_long(); /* Pop incorrect return address */
> -		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
> +		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
>  			return -ERESTART;
>  	}
>  	/* popf; tell the caller to not touch TF */
> @@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  		 *
>  		 * But there is corner case, see the comment in ->post_xol().
>  		 */
> -		if (push_ret_address(regs, new_ip))
> +		if (emulate_push_stack(regs, new_ip))
>  			return false;
>  	} else if (!check_jmp_cond(auprobe, regs)) {
>  		offs = 0;
> @@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  	return true;
>  }
>
> +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
> +{
> +	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
> +
> +	if (emulate_push_stack(regs, *src_ptr))
> +		return false;
> +	regs->ip += auprobe->push.ilen;
> +	return true;
> +}
> +
>  static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>  {
>  	BUG_ON(!branch_is_call(auprobe));
> @@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
> .post_xol = branch_post_xol_op,
> };
>
> +static const struct uprobe_xol_ops push_xol_ops = {
> + .emulate = push_emulate_op,
> +};
> +
> /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
> static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
> {
> @@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>  	return 0;
>  }
>
> +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
> +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
> +{
> +	u8 opc1 = OPCODE1(insn), reg_offset = 0;
> +
> +	if (opc1 < 0x50 || opc1 > 0x57)
> +		return -ENOSYS;
> +
> +	if (insn->length > 2)
> +		return -ENOSYS;
> +	if (insn->length == 2) {
> +		/* only support rex_prefix 0x41 (x64 only) */
> +#ifdef CONFIG_X86_64
> +		if (insn->rex_prefix.nbytes != 1 ||
> +		    insn->rex_prefix.bytes[0] != 0x41)
> +			return -ENOSYS;
> +
> +		switch (opc1) {
> +		case 0x50:
> +			reg_offset = offsetof(struct pt_regs, r8);
> +			break;
> +		case 0x51:
> +			reg_offset = offsetof(struct pt_regs, r9);
> +			break;
> +		case 0x52:
> +			reg_offset = offsetof(struct pt_regs, r10);
> +			break;
> +		case 0x53:
> +			reg_offset = offsetof(struct pt_regs, r11);
> +			break;
> +		case 0x54:
> +			reg_offset = offsetof(struct pt_regs, r12);
> +			break;
> +		case 0x55:
> +			reg_offset = offsetof(struct pt_regs, r13);
> +			break;
> +		case 0x56:
> +			reg_offset = offsetof(struct pt_regs, r14);
> +			break;
> +		case 0x57:
> +			reg_offset = offsetof(struct pt_regs, r15);
> +			break;
> +		}
> +#else
> +		return -ENOSYS;
> +#endif
> +	} else {
> +		switch (opc1) {
> +		case 0x50:
> +			reg_offset = offsetof(struct pt_regs, ax);
> +			break;
> +		case 0x51:
> +			reg_offset = offsetof(struct pt_regs, cx);
> +			break;
> +		case 0x52:
> +			reg_offset = offsetof(struct pt_regs, dx);
> +			break;
> +		case 0x53:
> +			reg_offset = offsetof(struct pt_regs, bx);
> +			break;
> +		case 0x54:
> +			reg_offset = offsetof(struct pt_regs, sp);
> +			break;
> +		case 0x55:
> +			reg_offset = offsetof(struct pt_regs, bp);
> +			break;
> +		case 0x56:
> +			reg_offset = offsetof(struct pt_regs, si);
> +			break;
> +		case 0x57:
> +			reg_offset = offsetof(struct pt_regs, di);
> +			break;
> +		}
> +	}
> +
> +	auprobe->push.reg_offset = reg_offset;
> +	auprobe->push.ilen = insn->length;
> +	auprobe->ops = &push_xol_ops;
> +	return 0;
> +}
> +
>  /**
>   * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
>   * @mm: the probed address space.
> @@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
>  	if (ret != -ENOSYS)
>  		return ret;
>
> +	ret = push_setup_xol_ops(auprobe, &insn);
> +	if (ret != -ENOSYS)
> +		return ret;
> +
>  	/*
>  	 * Figure out which fixups default_post_xol_op() will need to perform,
>  	 * and annotate defparam->fixups accordingly.
>