Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751932AbcCIBZJ (ORCPT ); Tue, 8 Mar 2016 20:25:09 -0500 Received: from mail.kernel.org ([198.145.29.136]:52060 "EHLO mail.kernel.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750850AbcCIBZC (ORCPT ); Tue, 8 Mar 2016 20:25:02 -0500 From: Andy Lutomirski To: x86@kernel.org Cc: linux-kernel@vger.kernel.org, Borislav Petkov , "musl@lists.openwall.com" , Andy Lutomirski Subject: [RFC PATCH] x86/vdso/32: Add AT_SYSINFO cancellation helpers Date: Tue, 8 Mar 2016 17:24:52 -0800 Message-Id: <06079088639eddd756e2092b735ce4a682081308.1457486598.git.luto@kernel.org> X-Mailer: git-send-email 2.5.0 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 11476 Lines: 333 musl implements system call cancellation in an unusual but clever way. When a thread issues a cancellable syscall, musl issues the syscall through a special thunk that looks roughly like this: cancellable_syscall: test whether a cancel is queued jnz cancel_me int $0x80 end_cancellable_syscall: If a pthread cancellation signal hits with cancellable_syscall <= EIP < end_cancellable_syscall, then the signal interrupted a cancellation point before the syscall in question started. If so, it rewrites the calling context to skip the syscall and simulate a -EINTR return. The caller will detect this simulated -EINTR or an actual -EINTR and handle a possible cancellation event. This technique doesn't work if int $0x80 is replaced by a call to AT_SYSINFO: the signal handler can no longer tell whether it's interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was called from. Add minimal helpers so that musl's signal handler can learn the status of a possible pending AT_SYSINFO invocation and, if it hasn't entered the kernel yet, abort it without needing to parse the vdso DWARF unwind data. 
Signed-off-by: Andy Lutomirski --- musl people- Does this solve your AT_SYSINFO cancellation problem? I'd like to make sure it survives an actual implementation before I commit to the ABI. x86 people- Are you okay with this idea? arch/x86/entry/vdso/Makefile | 3 +- arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++ arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 + tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++-- 4 files changed, 171 insertions(+), 7 deletions(-) create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index b88846471247..465052b49603 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -130,7 +130,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o -targets += vdso32/vclock_gettime.o +targets += vdso32/vclock_gettime.o vdso32/cancellation_helpers.o KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) @@ -150,6 +150,7 @@ $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/cancellation_helpers.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/system_call.o \ $(obj)/vdso32/sigreturn.o diff --git a/arch/x86/entry/vdso/vdso32/cancellation_helpers.c b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c new file mode 100644 index 000000000000..3cb2e88baec6 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016 Andrew Lutomirski + * Subject to the GNU Public License, v.2 + * + * This provides helpers to enable libc implementations to cancel + * interrupted AT_SYSINFO invocations without needing to parse the + * DWARF unwinding instructions. 
+ */ + +#include +#include + +extern char __kernel_vsyscall[] __attribute__((visibility("hidden"))); +extern char int80_landing_pad[] __attribute__((visibility("hidden"))); + +static unsigned long *pending_syscall_retaddr_ptr(const void *context) +{ + const struct ucontext_ia32 *uc = context; + unsigned long ctx_eip = uc->uc_mcontext.ip; + unsigned long offset_into_vsyscall; + unsigned long *retaddr; + + /* + * An AT_SYSINFO system call is pending if and only if we're in + * __kernel_vsyscall before int80_landing_pad. If we're at + * int80_landing_pad or beyond, we've finished the system call + * and are on our way out. + * + * If we're at int80_landing_pad-2, then either we're using the + * int $0x80 slow path because we have no fast system call + * support or we are restarting a fast system call. Either way, + * the system call is still pending. + */ + + if (ctx_eip < (unsigned long)__kernel_vsyscall || + ctx_eip >= (unsigned long)int80_landing_pad) + return NULL; + + /* + * The first three instructions of __kernel_vsyscall are one-byte + * pushes. + */ + offset_into_vsyscall = (ctx_eip - (unsigned long)__kernel_vsyscall); + retaddr = (unsigned long *)uc->uc_mcontext.sp; + if (offset_into_vsyscall < 3) + retaddr += offset_into_vsyscall; + else + retaddr += 3; + + /* + * GCC (correctly) fails to deduce that retaddr can't be NULL + * in the success path. Helping it out reduces code size. + */ + if (!retaddr) + __builtin_unreachable(); + + return retaddr; +} + +/* + * If context is a sigcontext for a pending AT_SYSINFO syscall, returns + * the return address of that syscall. Otherwise returns -1UL. + */ +unsigned long __vdso_pending_syscall_return_address(const void *context) +{ + unsigned long *retaddr = pending_syscall_retaddr_ptr(context); + return retaddr ? *retaddr : -1UL; +} + +/* + * If context is a sigcontext for a pending AT_SYSINFO syscall, then + * this will pop off the call frame and point the context to + * AT_SYSINFO's return address. 
ESP will contain whatever value it had + * immediately prior to the call instruction (i.e. ESP acts as though + * the system call returned normally). EAX will be set to -EINTR. All + * other GPRs will be clobbered. __vdso_abort_pending_syscall will + * return 0. + * + * If context is a valid sigcontext that does not represent a pending + * AT_SYSINFO syscall, then __vdso_abort_pending_syscall returns + * -EINVAL. + * + * If context is not a valid sigcontext at all, behavior is undefined. + */ +long __vdso_abort_pending_syscall(void *context) +{ + struct ucontext_ia32 *uc = context; + unsigned long *retaddr = pending_syscall_retaddr_ptr(context); + + if (!retaddr) + return -EINVAL; + + uc->uc_mcontext.ip = *retaddr; + uc->uc_mcontext.sp = (unsigned long)(retaddr + 1); + + /* + * Clobber GPRs -- we don't want to implement full unwinding, and we + * don't want userspace to start expecting anything about the final + * state of the GPRs. + * + * (There really are subtleties here. EAX can be clobbered by + * syscall restart, and register limitations mean that the + * saved context has at least one of the argument registers + * used for a different purpose by the calling sequence just + * prior to kernel entry. In the current implementation, that + * register is EBP, but it could change.) 
+ */ + uc->uc_mcontext.ax = -EINTR; + uc->uc_mcontext.bx = 0xFFFFFFFF; + uc->uc_mcontext.cx = 0xFFFFFFFF; + uc->uc_mcontext.dx = 0xFFFFFFFF; + uc->uc_mcontext.si = 0xFFFFFFFF; + uc->uc_mcontext.di = 0xFFFFFFFF; + uc->uc_mcontext.bp = 0xFFFFFFFF; + return 0; +} diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 31056cf294bf..f04e8bd30755 100644 --- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -25,6 +25,8 @@ VERSION __vdso_clock_gettime; __vdso_gettimeofday; __vdso_time; + __vdso_pending_syscall_return_address; + __vdso_abort_pending_syscall; }; LINUX_2.5 { diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 00a26a82fa98..7c649b4b6834 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -35,6 +35,7 @@ int main() #include #include #include +#include #include #include #include @@ -88,8 +89,12 @@ static unsigned long sysinfo; static bool got_sysinfo = false; static unsigned long return_address; +static unsigned long (*vdso_pending_syscall_return_address)( + const void *context); + struct unwind_state { unsigned long ip; /* trap source */ + unsigned long ax; /* ax at call site */ int depth; /* -1 until we hit the trap source */ }; @@ -115,7 +120,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque) unsigned long ebp = _Unwind_GetGR(ctx, 5); unsigned long esi = _Unwind_GetGR(ctx, 6); unsigned long edi = _Unwind_GetGR(ctx, 7); - bool ok = (eax == SYS_getpid || eax == getpid()) && + bool ok = (eax == SYS_break || eax == -ENOSYS) && ebx == 1 && ecx == 2 && edx == 3 && esi == 4 && edi == 5 && ebp == 6; @@ -125,6 +130,8 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque) (ok ? 
"OK" : "FAIL"), eax, ebx, ecx, edx, esi, edi, ebp); + state->ax = eax; + return _URC_NORMAL_STOP; } else { state->depth++; @@ -137,6 +144,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) ucontext_t *ctx = (ucontext_t *)ctx_void; struct unwind_state state; unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP]; + unsigned long reported_return_address = 0; if (!got_sysinfo && ip == sysinfo) { got_sysinfo = true; @@ -148,8 +156,15 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) ip, return_address); } - if (!got_sysinfo) - return; /* Not there yet */ + if (!got_sysinfo) { + if (vdso_pending_syscall_return_address && + vdso_pending_syscall_return_address(ctx_void) != -1UL) { + printf("[FAIL]\t__vdso_pending_syscall_return_address incorrectly detected a pending syscall\n"); + nerrs++; + } + + return; /* We haven't started AT_SYSINFO yet */ + } if (ip == return_address) { ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF; @@ -157,11 +172,32 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) return; } - printf("\tSIGTRAP at 0x%lx\n", ip); + if (vdso_pending_syscall_return_address) { + reported_return_address = + vdso_pending_syscall_return_address(ctx_void); + if (reported_return_address != -1UL) + printf("\tSIGTRAP at 0x%lx, pending syscall will return to 0x%lx\n", + ip, reported_return_address); + else + printf("\tSIGTRAP at 0x%lx, no syscall pending\n", ip); + } else { + printf("\tSIGTRAP at 0x%lx\n", ip); + } state.ip = ip; state.depth = -1; _Unwind_Backtrace(trace_fn, &state); + + if (vdso_pending_syscall_return_address) { + unsigned long expected = + (state.ax == SYS_break ? 
return_address : -1UL); + if (reported_return_address != expected) { + printf("[FAIL]\t __vdso_pending_syscall_return_address returned 0x%lx; expected 0x%lx\n", reported_return_address, expected); + nerrs++; + } else { + printf("[OK]\t __vdso_pending_syscall_return_address returned the correct value\n"); + } + } } int main() @@ -177,12 +213,21 @@ int main() info.dli_fname, info.dli_fbase); } + void *vdso = dlopen("linux-gate.so.1", RTLD_NOW); + if (vdso) + vdso_pending_syscall_return_address = dlsym(vdso, "__vdso_pending_syscall_return_address"); + sethandler(SIGTRAP, sigtrap, 0); - syscall(SYS_getpid); /* Force symbol binding without TF set. */ + syscall(SYS_break); /* Force symbol binding without TF set. */ printf("[RUN]\tSet TF and check a fast syscall\n"); set_eflags(get_eflags() | X86_EFLAGS_TF); - syscall(SYS_getpid, 1, 2, 3, 4, 5, 6); + + /* + * We need a harmless syscall that will never return its own syscall + * nr. SYS_break is not implemented and returns -ENOSYS. + */ + syscall(SYS_break, 1, 2, 3, 4, 5, 6); if (!got_sysinfo) { set_eflags(get_eflags() & ~X86_EFLAGS_TF); -- 2.5.0