musl implements system call cancellation in an unusual but clever way.
When a thread issues a cancellable syscall, musl issues the syscall
through a special thunk that looks roughly like this:
cancellable_syscall:
test whether a cancel is queued
jnz cancel_me
int $0x80
end_cancellable_syscall:
If a pthread cancellation signal hits with
cancellable_syscall <= EIP < end_cancellable_syscall, then the
signal interrupted a cancellation point before the syscall in
question started. If so, it rewrites the calling context to skip
the syscall and simulate a -EINTR return. The caller will detect
this simulated -EINTR or an actual -EINTR and handle a possible
cancellation event.
This technique doesn't work if int $0x80 is replaced by a call to
AT_SYSINFO: the signal handler can no longer tell whether it's
interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
called from.
Add minimal helpers so that musl's signal handler can learn the
status of a possible pending AT_SYSINFO invocation and, if it hasn't
entered the kernel yet, abort it without needing to parse the vdso
DWARF unwind data.
Signed-off-by: Andy Lutomirski <[email protected]>
---
musl people-
Does this solve your AT_SYSINFO cancellation problem? I'd like to
make sure it survives an actual implementation before I commit to the ABI.
x86 people-
Are you okay with this idea?
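For reference, a libc cancellation signal handler might use these
helpers roughly as follows. This is an untested sketch; only the
__vdso_* symbol and the "linux-gate.so.1" lookup come from this patch,
and everything else (the names, the cancel-flag check) is made up:

#include <dlfcn.h>
#include <signal.h>

static long (*abort_pending_syscall)(void *ctx);

static int cancellation_requested(void)
{
	return 1;	/* stand-in for the libc's per-thread flag */
}

static void cancel_handler(int sig, siginfo_t *si, void *ctx)
{
	if (!cancellation_requested())
		return;

	/* If we interrupted AT_SYSINFO before kernel entry, rewrite
	 * the context so the call returns -EINTR without running. */
	if (abort_pending_syscall && abort_pending_syscall(ctx) == 0)
		return;

	/* Otherwise we interrupted ordinary code, or a syscall that
	 * entered the kernel and will return a real -EINTR itself. */
}

static void resolve_vdso_helpers(void)
{
	void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);

	if (vdso)
		abort_pending_syscall = (long (*)(void *))
			dlsym(vdso, "__vdso_abort_pending_syscall");
}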
arch/x86/entry/vdso/Makefile | 3 +-
arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
4 files changed, 171 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index b88846471247..465052b49603 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,7 +130,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/cancellation_helpers.o
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
@@ -150,6 +150,7 @@ $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(obj)/vdso32.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/cancellation_helpers.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/system_call.o \
$(obj)/vdso32/sigreturn.o
diff --git a/arch/x86/entry/vdso/vdso32/cancellation_helpers.c b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
new file mode 100644
index 000000000000..3cb2e88baec6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Andrew Lutomirski
+ * Subject to the GNU Public License, v.2
+ *
+ * This provides helpers to enable libc implementations to cancel
+ * interrupted AT_SYSINFO invocations without needing to parse the
+ * DWARF unwinding instructions.
+ */
+
+#include <asm/signal.h>
+#include <asm/sigframe.h>
+
+extern char __kernel_vsyscall[] __attribute__((visibility("hidden")));
+extern char int80_landing_pad[] __attribute__((visibility("hidden")));
+
+static unsigned long *pending_syscall_retaddr_ptr(const void *context)
+{
+ const struct ucontext_ia32 *uc = context;
+ unsigned long ctx_eip = uc->uc_mcontext.ip;
+ unsigned long offset_into_vsyscall;
+ unsigned long *retaddr;
+
+ /*
+ * An AT_SYSINFO system call is pending if and only if we're in
+ * __kernel_vsyscall before int80_landing_pad. If we're at
+ * int80_landing_pad or beyond, we've finished the system call
+ * and are on our way out.
+ *
+ * If we're at int80_landing_pad-2, then either we're using the
+ * int $0x80 slow path because we have no fast system call
+ * support or we are restarting a fast system call. Either way,
+ * the system call is still pending.
+ */
+
+ if (ctx_eip < (unsigned long)__kernel_vsyscall ||
+ ctx_eip >= (unsigned long)int80_landing_pad)
+ return NULL;
+
+ /*
+ * The first three instructions of __kernel_vsyscall are one-byte
+ * pushes.
+ */
+ offset_into_vsyscall = (ctx_eip - (unsigned long)__kernel_vsyscall);
+ retaddr = (unsigned long *)uc->uc_mcontext.sp;
+ if (offset_into_vsyscall < 3)
+ retaddr += offset_into_vsyscall;
+ else
+ retaddr += 3;
+
+ /*
+ * GCC (correctly) fails to deduce that retaddr can't be NULL
+ * in the success path. Helping it out reduces code size.
+ */
+ if (!retaddr)
+ __builtin_unreachable();
+
+ return retaddr;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, returns
+ * the return address of that syscall. Otherwise returns -1UL.
+ */
+unsigned long __vdso_pending_syscall_return_address(const void *context)
+{
+ unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+ return retaddr ? *retaddr : -1UL;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, then
+ * this will pop off the call frame and point the context to
+ * AT_SYSINFO's return address. ESP will contain whatever value it had
+ * immediately prior to the call instruction (i.e. ESP acts as though
+ * the system call returned normally). EAX will be set to -EINTR. All
+ * other GPRs will be clobbered. __vdso_abort_pending_syscall will
+ * return 0.
+ *
+ * If context is a valid sigcontext that does not represent a pending
+ * AT_SYSINFO syscall, then __vdso_abort_pending_syscall returns
+ * -EINVAL.
+ *
+ * If context is not a valid sigcontext at all, behavior is undefined.
+ */
+long __vdso_abort_pending_syscall(void *context)
+{
+ struct ucontext_ia32 *uc = context;
+ unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+
+ if (!retaddr)
+ return -EINVAL;
+
+ uc->uc_mcontext.ip = *retaddr;
+ uc->uc_mcontext.sp = (unsigned long)(retaddr + 1);
+
+ /*
+ * Clobber GPRs -- we don't want to implement full unwinding, and we
+ * don't want userspace to start expecting anything about the final
+ * state of the GPRs.
+ *
+ * (There really are subtleties here. EAX can be clobbered by
+ * syscall restart, and register limitations mean that the
+ * saved context has at least one of the argument registers
+ * used for a different purpose by the calling sequence just
+ * prior to kernel entry. In the current implementation, that
+ * register is EBP, but it could change.)
+ */
+ uc->uc_mcontext.ax = -EINTR;
+ uc->uc_mcontext.bx = 0xFFFFFFFF;
+ uc->uc_mcontext.cx = 0xFFFFFFFF;
+ uc->uc_mcontext.dx = 0xFFFFFFFF;
+ uc->uc_mcontext.si = 0xFFFFFFFF;
+ uc->uc_mcontext.di = 0xFFFFFFFF;
+ uc->uc_mcontext.bp = 0xFFFFFFFF;
+ return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..f04e8bd30755 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,6 +25,8 @@ VERSION
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
+ __vdso_pending_syscall_return_address;
+ __vdso_abort_pending_syscall;
};
LINUX_2.5 {
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
index 00a26a82fa98..7c649b4b6834 100644
--- a/tools/testing/selftests/x86/unwind_vdso.c
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -35,6 +35,7 @@ int main()
#include <syscall.h>
#include <unistd.h>
#include <string.h>
+#include <errno.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <signal.h>
@@ -88,8 +89,12 @@ static unsigned long sysinfo;
static bool got_sysinfo = false;
static unsigned long return_address;
+static unsigned long (*vdso_pending_syscall_return_address)(
+ const void *context);
+
struct unwind_state {
unsigned long ip; /* trap source */
+ unsigned long ax; /* ax at call site */
int depth; /* -1 until we hit the trap source */
};
@@ -115,7 +120,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
unsigned long ebp = _Unwind_GetGR(ctx, 5);
unsigned long esi = _Unwind_GetGR(ctx, 6);
unsigned long edi = _Unwind_GetGR(ctx, 7);
- bool ok = (eax == SYS_getpid || eax == getpid()) &&
+ bool ok = (eax == SYS_break || eax == -ENOSYS) &&
ebx == 1 && ecx == 2 && edx == 3 &&
esi == 4 && edi == 5 && ebp == 6;
@@ -125,6 +130,8 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
(ok ? "OK" : "FAIL"),
eax, ebx, ecx, edx, esi, edi, ebp);
+ state->ax = eax;
+
return _URC_NORMAL_STOP;
} else {
state->depth++;
@@ -137,6 +144,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
ucontext_t *ctx = (ucontext_t *)ctx_void;
struct unwind_state state;
unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
+ unsigned long reported_return_address = 0;
if (!got_sysinfo && ip == sysinfo) {
got_sysinfo = true;
@@ -148,8 +156,15 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
ip, return_address);
}
- if (!got_sysinfo)
- return; /* Not there yet */
+ if (!got_sysinfo) {
+ if (vdso_pending_syscall_return_address &&
+ vdso_pending_syscall_return_address(ctx_void) != -1UL) {
+ printf("[FAIL]\t__vdso_pending_syscall_return_address incorrectly detected a pending syscall\n");
+ nerrs++;
+ }
+
+ return; /* We haven't started AT_SYSINFO yet */
+ }
if (ip == return_address) {
ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
@@ -157,11 +172,32 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
return;
}
- printf("\tSIGTRAP at 0x%lx\n", ip);
+ if (vdso_pending_syscall_return_address) {
+ reported_return_address =
+ vdso_pending_syscall_return_address(ctx_void);
+ if (reported_return_address != -1UL)
+ printf("\tSIGTRAP at 0x%lx, pending syscall will return to 0x%lx\n",
+ ip, reported_return_address);
+ else
+ printf("\tSIGTRAP at 0x%lx, no syscall pending\n", ip);
+ } else {
+ printf("\tSIGTRAP at 0x%lx\n", ip);
+ }
state.ip = ip;
state.depth = -1;
_Unwind_Backtrace(trace_fn, &state);
+
+ if (vdso_pending_syscall_return_address) {
+ unsigned long expected =
+ (state.ax == SYS_break ? return_address : -1UL);
+ if (reported_return_address != expected) {
+ printf("[FAIL]\t __vdso_pending_syscall_return_address returned 0x%lx; expected 0x%lx\n", reported_return_address, expected);
+ nerrs++;
+ } else {
+ printf("[OK]\t __vdso_pending_syscall_return_address returned the correct value\n");
+ }
+ }
}
int main()
@@ -177,12 +213,21 @@ int main()
info.dli_fname, info.dli_fbase);
}
+ void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);
+ if (vdso)
+ vdso_pending_syscall_return_address = dlsym(vdso, "__vdso_pending_syscall_return_address");
+
sethandler(SIGTRAP, sigtrap, 0);
- syscall(SYS_getpid); /* Force symbol binding without TF set. */
+ syscall(SYS_break); /* Force symbol binding without TF set. */
printf("[RUN]\tSet TF and check a fast syscall\n");
set_eflags(get_eflags() | X86_EFLAGS_TF);
- syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+
+ /*
+ * We need a harmless syscall that will never return its own syscall
+ * nr. SYS_break is not implemented and returns -ENOSYS.
+ */
+ syscall(SYS_break, 1, 2, 3, 4, 5, 6);
if (!got_sysinfo) {
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
--
2.5.0
* Andy Lutomirski <[email protected]> wrote:
> musl implements system call cancellation in an unusual but clever way.
So I'm sceptical about the concept.
Could someone remind me why cancellation points matter to user-space?
I know the pthread APIs and semantics that are behind it, I just don't see how it
can be truly utilized for any meaningful programmatic property: for example the
moment you add any sort of ad-hoc printf() based tracing or any other spontaneous
logging IO to your application, you add in a lot of potential cancellation points
into various places in your user-space logic ...
It's _very_ easy to add inadvertent cancellation points to the code in practice, so
using the default pthread cancellation model and relying on what is a cancellation
point is crazy and very libc dependent in general. POSIX seems to be pretty vague
about it as well. So unless you make heavy use of pthread_setcancelstate() to
explicitly mark your work atoms, it's a really bad interface to rely on.
And if you are using pthread_setcancelstate(), instead of relying on cancellation,
then you are not really using the built-in cancellation points but have to spike
your code with pthread_testcancel(). In that case, why not just use your own
explicit 'cancellation' points in a few strategic places - which is mostly just a
simple flag really. That's what most worker thread models that I've seen use.
I suspect more complex runtimes like java runtimes couldn't care less, so it's
really something that only libc using C/C++ code cares about.
> When a thread issues a cancellable syscall, musl issues the syscall
> through a special thunk that looks roughly like this:
>
> cancellable_syscall:
> test whether a cancel is queued
> jnz cancel_me
> int $0x80
> end_cancellable_syscall:
>
> If a pthread cancellation signal hits with
> cancellable_syscall <= EIP < end_cancellable_syscall, then the
> signal interrupted a cancellation point before the syscall in
> question started. If so, it rewrites the calling context to skip
> the syscall and simulate a -EINTR return. The caller will detect
> this simulated -EINTR or an actual -EINTR and handle a possible
> cancellation event.
Why is so much complexity added to avoid a ~3-instruction window where
cancellation is tested? Cancellation at work atom boundaries is a fundamentally
'polling' model anyway, and signal delivery is asynchronous, with a fundamental
IPI delay if it's cross-CPU.
> This technique doesn't work if int $0x80 is replaced by a call to
> AT_SYSINFO: the signal handler can no longer tell whether it's
> interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
> called from.
>
> Add minimal helpers so that musl's signal handler can learn the
> status of a possible pending AT_SYSINFO invocation and, if it hasn't
> entered the kernel yet, abort it without needing to parse the vdso
> DWARF unwind data.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
>
> musl people-
>
> Does this solve your AT_SYSINFO cancellation problem? I'd like to
> make sure it survives an actual implementation before I commit to the ABI.
>
> x86 people-
>
> Are you okay with this idea?
>
>
> arch/x86/entry/vdso/Makefile | 3 +-
> arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
> arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
> tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
> 4 files changed, 171 insertions(+), 7 deletions(-)
> create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c
I'd really like to see a cost/benefit analysis here! Some before/after explanation
- exactly what is not possible today (in practical terms), what are the practical
effects of not being able to do that, and how would the bright future look like?
Thanks,
Ingo
* Ingo Molnar <[email protected]> [2016-03-09 09:56:31 +0100]:
> * Andy Lutomirski <[email protected]> wrote:
>
> > musl implements system call cancellation in an unusual but clever way.
>
> So I'm sceptical about the concept.
>
> Could someone remind me why cancellation points matter to user-space?
>
because of standards.
> I know the pthread APIs and semantics that are behind it, I just don't see how it
> can be truly utilized for any meaningful programmatic property: for example the
> moment you add any sort of ad-hoc printf() based tracing or any other spontaneous
> logging IO to your application, you add in a lot of potential cancellation points
> into various places in your user-space logic ...
>
> It's _very_ easy to add inadvertent cancellation points to the code in practice, so
> using the default pthread cancellation model and relying on what is a cancellation
> point is crazy and very libc dependent in general. POSIX seems to be pretty vague
> about it as well. So unless you make heavy use of pthread_setcancelstate() to
> explicitly mark your work atoms, it's a really bad interface to rely on.
>
if the canceled thread only executes code that expects cancellation then
it should work (code that does not expect cancellation won't have cancellation
cleanup handlers set up and thus cancelling it can cause problems).
> And if you are using pthread_setcancelstate(), instead of relying on cancellation,
> then you are not really using the built-in cancellation points but have to spike
> your code with pthread_testcancel(). In that case, why not just use your own
> explicit 'cancellation' points in a few strategic places - which is mostly just a
> simple flag really. That's what most worker thread models that I've seen use.
>
the point of cancellation is to be able to kill a thread that is in a
blocking syscall. i don't see how a flag helps with that, this is hard
to do without libc help, hence pthread_cancel exists.
> I suspect more complex runtimes like java runtimes couldn't care less, so it's
> really something that only libc using C/C++ code cares about.
>
c++ code cannot be cancelled if it uses non-pod objects or c++ threads.
(destructor vs cancellation cleanup semantics is undefined)
this is for posix conforming code.
> > When a thread issues a cancellable syscall, musl issues the syscall
> > through a special thunk that looks roughly like this:
> >
> > cancellable_syscall:
> > test whether a cancel is queued
> > jnz cancel_me
> > int $0x80
> > end_cancellable_syscall:
> >
> > If a pthread cancellation signal hits with
> > cancellable_syscall <= EIP < end_cancellable_syscall, then the
> > signal interrupted a cancellation point before the syscall in
> > question started. If so, it rewrites the calling context to skip
> > the syscall and simulate a -EINTR return. The caller will detect
> > this simulated -EINTR or an actual -EINTR and handle a possible
> > cancellation event.
>
> Why is so much complexity added to avoid a ~3-instruction window where
> cancellation is tested? Cancellation at work atom boundaries is a fundamentally
> 'polling' model anyway, and signal delivery is asynchronous, with a fundamental
> IPI delay if it's cross-CPU.
>
to avoid the race when the thread is cancelled after the test but before
the syscall, see http://ewontfix.com/16/
> > This technique doesn't work if int $0x80 is replaced by a call to
> > AT_SYSINFO: the signal handler can no longer tell whether it's
> > interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
> > called from.
> >
> > Add minimal helpers so that musl's signal handler can learn the
> > status of a possible pending AT_SYSINFO invocation and, if it hasn't
> > entered the kernel yet, abort it without needing to parse the vdso
> > DWARF unwind data.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> >
> > musl people-
> >
> > Does this solve your AT_SYSINFO cancellation problem? I'd like to
> > make sure it survives an actual implementation before I commit to the ABI.
> >
> > x86 people-
> >
> > Are you okay with this idea?
> >
> >
> > arch/x86/entry/vdso/Makefile | 3 +-
> > arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
> > arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
> > tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
> > 4 files changed, 171 insertions(+), 7 deletions(-)
> > create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c
>
> I'd really like to see a cost/benefit analysis here! Some before/after explanation
> - exactly what is not possible today (in practical terms), what are the practical
> effects of not being able to do that, and how would the bright future look like?
>
> Thanks,
>
> Ingo
* Szabolcs Nagy <[email protected]> [2016-03-09 12:34:50 +0100]:
> * Ingo Molnar <[email protected]> [2016-03-09 09:56:31 +0100]:
> > Why is so much complexity added to avoid a ~3-instruction window where
> > cancellation is tested? Cancellation at work atom boundaries is a fundamentally
> > 'polling' model anyway, and signal delivery is asynchronous, with a fundamental
> > IPI delay if it's cross-CPU.
> >
>
> to avoid the race when the thread is cancelled after the test but before
> the syscall, see http://ewontfix.com/16/
>
wrong link
http://ewontfix.com/2/
On Tue, Mar 8, 2016 at 5:24 PM, Andy Lutomirski <[email protected]> wrote:
> musl implements system call cancellation in an unusual but clever way.
> When a thread issues a cancellable syscall, musl issues the syscall
> through a special thunk that looks roughly like this:
>
FWIW, this patch fails disastrously on 64-bit kernels. I fixed it,
but it needs kbuild changes. I'll send those out to the maintainers.
--Andy
On Wed, Mar 9, 2016 at 3:34 AM, Szabolcs Nagy <[email protected]> wrote:
>>
>> Could someone remind me why cancellation points matter to user-space?
>
> because of standards.
So quite frankly, if we have to do kernel support for this, then let's
do it right, instead of just perpetuating a hack that was done in user
space in a new way.
We already have support for cancelling blocking system calls early: we
do it for fatal signals (exactly because we know that it's ok to
return -EINTR without failing POSIX semantics - the dying thread will
never actually *see* the -EINTR because it's dying).
I suspect that what you guys want is the same semantics as a fatal
signal (return early with -EINTR), but without the actual fatality
(you want to do cleanup in the cancelled thread).
I suspect that we could fairly easily give those kinds of semantics.
We could add a new flag to the sigaction (sa_flags) that says "this
signal interrupts even uninterruptible system calls".
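Roughly, with an invented flag name (no such flag exists today):

#include <signal.h>

#define SA_INTERRUPT_ALWAYS	0x04000000	/* invented value */

static void cancel_handler(int sig)
{
	/* run thread cancellation cleanup; the interrupted syscall
	 * was already aborted with -EINTR */
}

static void install(void)
{
	struct sigaction sa = { 0 };

	sa.sa_handler = cancel_handler;
	sa.sa_flags = SA_INTERRUPT_ALWAYS;
	sigaction(SIGRTMIN, &sa, 0);
}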
Would that be good for you?
And if not, can you explain the exact semantics you need? There might
be some reason why you cannot reserve a particular signal for this,
for example, but I'd like to know more precisely..
Because this "let's compare addresses" seems just excessively hacky.
It's a clever little hack when you're doing user space and don't want
to rely on kernel changes, but now that Andy is actually trying to
push kernel changes it just turns disgusting.
Linus
On Wed, Mar 9, 2016 at 11:47 AM, Linus Torvalds
<[email protected]> wrote:
> On Wed, Mar 9, 2016 at 3:34 AM, Szabolcs Nagy <[email protected]> wrote:
>>>
>>> Could someone remind me why cancellation points matter to user-space?
>>
>> because of standards.
>
> So quite frankly, if we have to do kernel support for this, then let's
> do it right, instead of just perpetuating a hack that was done in user
> space in a new way.
>
> We already have support for cancelling blocking system calls early: we
> do it for fatal signals (exactly because we know that it's ok to
> return -EINTR without failing POSIX semantics - the dying thread will
> never actually *see* the -EINTR because it's dying).
>
> I suspect that what you guys want is the same semantics as a fatal
> signal (return early with -EINTR), but without the actual fatality
> (you want to do cleanup in the cancelled thread).
>
How safe would this be in a multithreaded process? For example, if
open() gets canceled in the "killable" sense, is it guaranteed that no
file descriptor will be allocated?
> I suspect that we could fairly easily give those kinds of semantics.
> We could add a new flag to the sigaction (sa_flags) that says "this
> signal interrupts even uninterruptible system calls".
>
> Would that be good for you?
>
> And if not, can you explain the exact semantics you need? There might
> be some reason why you cannot reserve a particular signal for this,
> for example, but I'd like to know more precisely..
>
> Because this "let's compare addresses" seems just excessively hacky.
> It's a clever little hack when you're doing user space and don't want
> to rely on kernel changes, but now that Andy is actually trying to
> push kernel changes it just turns disgusting.
>
Let me try to summarize my understanding of the semantics.
Thread A sends thread B a signal. Thread B wants to ignore the signal
and defer handling unless it's either in a particular syscall and
returns -EINTR or unless the thread is about to do the syscall.
This would all be trivial if there were a way to set up a signal that
is *only* delivered in response to a syscall, no? SA_ONLY_IN_SYSCALL,
perhaps?
Frankly, I'm a bit surprised that musl didn't take the approach of
"pthread cancellation is not such a great idea -- let's just not
support it".
> Linus
--
Andy Lutomirski
AMA Capital Management, LLC
On Wed, Mar 9, 2016 at 9:58 AM, Andy Lutomirski <[email protected]> wrote:
> On Tue, Mar 8, 2016 at 5:24 PM, Andy Lutomirski <[email protected]> wrote:
>> musl implements system call cancellation in an unusual but clever way.
>> When a thread issues a cancellable syscall, musl issues the syscall
>> through a special thunk that looks roughly like this:
>>
>
> FWIW, this patch fails disastrously on 64-bit kernels. I fixed it,
> but it needs kbuild changes. I'll send those out to the maintainers.
This version should be okay:
https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=fed6d35d3941bc53896ab80b5c8d68d54cc00347
You'll need the parent, too, if you want to test. I'm going to give
the 0day bot a good long chew, since the parent change is a little bit
scary.
--Andy
On Wed, Mar 9, 2016 at 12:57 PM, Andy Lutomirski <[email protected]> wrote:
>
> How safe would this be in a multithreaded process? For example, if
> open() gets canceled in the "killable" sense, is it guaranteed that no
> file descriptor will be allocated?
Not all system calls can be killed; we only do the usual cases. A
system call has to have the proper EINTR logic in place, so it's not
like we kill system calls at any random point.
> Let me try to summarize my understanding of the semantics.
>
> Thread A sends thread B a signal. Thread B wants to ignore the signal
> and defer handling unless it's either in a particular syscall and
> returns -EINTR or unless the thread is about to do the syscall.
Note that for the kernel, we don't actually have to use a signal for
this at all. Our existing "cancel system calls" code only works for
fatal signals, but that's just a trivial implementation issue.
We could add a system call that just sets a cancel flag in another
thread, and we'd just use that cancel flag to say "abort the currently
executing system call with EINTR" - in all the same places we
currently do that "fatal_signal_pending()" thing.
You'd still have to have all the user-space logic to do the
cancellation cleanup etc. But now you could actually cancel a write()
system call in the *middle*, which is currently just not an option.
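As a sketch (SYS_thread_cancel is a name invented here; nothing like
it exists yet):

#include <sys/syscall.h>
#include <unistd.h>

#define SYS_thread_cancel	999	/* invented syscall number */

/* Set a per-thread cancel flag: the target's current and future
 * interruptible syscalls return -EINTR, checked in the same places
 * as fatal_signal_pending() today. */
static long thread_cancel(pid_t tid)
{
	return syscall(SYS_thread_cancel, tid);
}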
Linus
On Wed, Mar 09, 2016 at 11:47:30AM -0800, Linus Torvalds wrote:
> On Wed, Mar 9, 2016 at 3:34 AM, Szabolcs Nagy <[email protected]> wrote:
> >>
> >> Could someone remind me why cancellation points matter to user-space?
> >
> > because of standards.
>
> So quite frankly, if we have to do kernel support for this, then let's
> do it right, instead of just perpetuating a hack that was done in user
> space in a new way.
>
> We already have support for cancelling blocking system calls early: we
> do it for fatal signals (exactly because we know that it's ok to
> return -EINTR without failing POSIX semantics - the dying thread will
> never actually *see* the -EINTR because it's dying).
>
> I suspect that what you guys want is the same semantics as a fatal
> signal (return early with -EINTR), but without the actual fatality
> (you want to do cleanup in the cancelled thread).
No, the semantics need to be identical to EINTR -- you can't cancel an
operation where some work has already been done. This is both a POSIX
requirement and a conceptual requirement. When a thread is cancelled,
the process is not terminating abnormally; it's continuing. It needs
to be able to know whether some work was completed, because that
changes what the cleanup code needs to do in order for a consistent
state to be maintained. This is most critical with syscalls that
allocate or free resources -- open, close, recvmsg accepting file
descriptors, etc. -- but it can even matter for reads and writes.
This is the whole reason we need a race-free cancellation rather than
the buggy implementation glibc historically used (which they are in
the process of fixing too).
Anyway, in the case where some but not all work was completed already
at the time the cancellation request was made, the function needs to
return and report whatever was successful.
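A tiny sketch of what that means for a wrapper (cancellation_pending()
is a stand-in for the libc's internal flag):

#include <errno.h>
#include <pthread.h>
#include <unistd.h>

static int cancellation_pending(void)
{
	return 0;	/* stub; the real check is libc-internal */
}

static ssize_t cancellable_write(int fd, const void *buf, size_t len)
{
	ssize_t n = write(fd, buf, len);

	/* Act on cancellation only if no work was done; a partial
	 * count must reach the caller so cleanup stays consistent. */
	if (n < 0 && errno == EINTR && cancellation_pending())
		pthread_exit(PTHREAD_CANCELED);
	return n;
}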
> I suspect that we could fairly easily give those kinds of semantics.
> We could add a new flag to the sigaction (sa_flags) that says "this
> signal interrupts even uninterruptible system calls".
This would not help, because whether the system call should be
cancellable is a function of the caller, not the system call; some
syscalls are cancellable when used in one place but not in others.
Also it does not solve the race condition; it's possible that the
signal is delivered _after_ userspace checks the cancellation flag,
but _before_ the syscall is made. Thus we need a way to probe whether
the program counter is in a range between the userspace flag check and
the syscall instruction.
I believe a new kernel cancellation API with a sticky cancellation
flag (rather than a signal), and a flag or'd onto the syscall number
to make it cancellable at the call point, could work, but then
userspace needs to support fairly different old and new kernel APIs in
order to be able to run on old kernels while also taking advantage of
new ones, and it's not clear to me that it would actually be
worthwhile to do so. I could see doing it for a completely new syscall
API, but as a second syscall API for a system that already has one it
seems gratuitous. From my perspective the existing approach (checking
program counter from signal handler) is very clean and simple. After
all it made enough sense that I was able to convince the glibc folks
to adopt it.
Rich
* Andy Lutomirski <[email protected]> wrote:
> Let me try to summarize my understanding of the semantics.
>
> Thread A sends thread B a signal. Thread B wants to ignore the signal and defer
> handling unless it's either in a particular syscall and returns -EINTR or unless
> the thread is about to do the syscall.
s/the syscall/an interruptible syscall/
The fundamental intention is to essentially allow the asynchronous killing
(cancellation) of pthread threads without corrupting user-space data structures
such as malloc() state.
There's a long list of system calls listed in pthreads(7) that must be cancellation
points, plus an even longer list of system calls and libc APIs that may be
cancellation points.
On glibc signal 32 (the first RT signal) is used as the cancellation signal.
But I guess you knew all this already!
So my original thinking was this:
| What surprises me is why Musl even bothers with trying to detect system calls
| that are about to be executed. Cancellation is a fundamentally polling-type
| API, a very small, 2-3 instructions window to 'miss' the current system call
| has no practical latency effect - so why does it even attempt to detect that
| RIP range? Why doesn't Musl just check the cancellation flag (activated by
| signal 32) and be content? Am I misunderstanding something about it?
... and when I wrote that up I realized the detail that I missed: it's a
problematic race if the thread starts a long-lived blocking system call (such as
accept()), shortly after the cancellation signal has been sent.
So the signal-32 handler _has_ to check the RIP and make sure that the system call
is not about to be executed - cancellation might be delayed indefinitely
otherwise. It's essentially needed for correctness.
Linus's suggestion to allow system calls to be more interruptible via a new SA_
flag also makes sense, but that is a latency improvement change - while the aspect
I was wondering about was a fundamental correctness detail.
So I withdraw my objection regarding AT_SYSINFO cancellation helpers. User-space
needs to have a signal-atomic way to prevent system calls from being started after
a cancellation signal has been received.
Thanks,
Ingo
* Rich Felker <[email protected]> wrote:
> [...]
>
> I believe a new kernel cancellation API with a sticky cancellation flag (rather
> than a signal), and a flag or'd onto the syscall number to make it cancellable
> at the call point, could work, but then userspace needs to support fairly
> different old and new kernel APIs in order to be able to run on old kernels
> while also taking advantage of new ones, and it's not clear to me that it would
> actually be worthwhile to do so. I could see doing it for a completely new
> syscall API, but as a second syscall API for a system that already has one it
> seems gratuitous. From my perspective the existing approach (checking program
> counter from signal handler) is very clean and simple. After all it made enough
> sense that I was able to convince the glibc folks to adopt it.
I concur with your overall analysis, but things get a bit messy once we consider
AT_SYSINFO which is a non-atomic mix of user-space and kernel-space code. Trying
to hand cancellation status through that results in extra complexity:
arch/x86/entry/vdso/Makefile | 3 +-
arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
4 files changed, 171 insertions(+), 7 deletions(-)
So instead of a sticky cancellation flag, we could introduce a sticky cancellation
signal.
A 'sticky signal' is not cleared from signal_pending() when the signal handler
executes, but it's automatically blocked so no signal handler recursion occurs.
(A sticky signal could still be cleared via a separate mechanism, by the
cancellation cleanup code.)
Such a 'sticky cancellation signal' would, in the racy situation, cause new
blocking system calls to immediately return with -EINTR. Non-blocking syscalls
could still be used. (So the cancellation signal handler itself would still have
access to various fundamental system calls.)
I think this would avoid messy coupling between the kernel's increasingly more
varied system call entry code and C libraries.
Sticky signals could be requested via a new SA_ flag.
What do you think?
Thanks,
Ingo
On Thu, Mar 10, 2016 at 12:16:46PM +0100, Ingo Molnar wrote:
>
> * Rich Felker <[email protected]> wrote:
>
> > [...]
> >
> > I believe a new kernel cancellation API with a sticky cancellation flag (rather
> > than a signal), and a flag or'd onto the syscall number to make it cancellable
> > at the call point, could work, but then userspace needs to support fairly
> > different old and new kernel APIs in order to be able to run on old kernels
> > while also taking advantage of new ones, and it's not clear to me that it would
> > actually be worthwhile to do so. I could see doing it for a completely new
> > syscall API, but as a second syscall API for a system that already has one it
> > seems gratuitous. From my perspective the existing approach (checking program
> > counter from signal handler) is very clean and simple. After all it made enough
> > sense that I was able to convince the glibc folks to adopt it.
>
> I concur with your overall analysis, but things get a bit messy once we consider
> AT_SYSINFO which is a non-atomic mix of user-space and kernel-space code. Trying
> to hand cancellation status through that results in extra complexity:
>
> arch/x86/entry/vdso/Makefile | 3 +-
> arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
> arch/x86/entry/vdso/vdso32/vdso32.lds.S | 2 +
> tools/testing/selftests/x86/unwind_vdso.c | 57 +++++++++--
> 4 files changed, 171 insertions(+), 7 deletions(-)
>
> So instead of a sticky cancellation flag, we could introduce a sticky cancellation
> signal.
>
> A 'sticky signal' is not cleared from signal_pending() when the signal handler
> executes, but it's automatically blocked so no signal handler recursion occurs.
> (A sticky signal could still be cleared via a separate mechanism, by the
> cancellation cleanup code.)
>
> Such a 'sticky cancellation signal' would, in the racy situation, cause new
> blocking system calls to immediately return with -EINTR. Non-blocking syscalls
> could still be used. (So the cancellation signal handler itself would still have
> access to various fundamental system calls.)
>
> I think this would avoid messy coupling between the kernel's increasingly more
> varied system call entry code and C libraries.
>
> Sticky signals could be requested via a new SA_ flag.
>
> What do you think?
This still doesn't address the issue that the code making the syscall
needs to be able to control whether it's cancellable or not. Not only
do some syscalls whose public functions are cancellation points need
to be used internally in non-cancellable ways; there's also the
pthread_setcancelstate interface that allows deferring cancellation so
that it's possible to call functions which are cancellation points
without invoking cancellation.
Ideally all syscalls would be like pselect/ppoll and take a sigset_t
to unmask/remask atomically with respect to the syscall action. Then
implementing cancellation (as well as using EINTR race-free) would be
trivial. But this is obviously not a practical change to make.
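For reference, the race-free shape those interfaces allow (with
SIGRTMIN standing in for a reserved cancellation signal):

#define _GNU_SOURCE
#include <errno.h>
#include <poll.h>
#include <signal.h>

/* The cancellation signal stays blocked everywhere else and is
 * unmasked atomically only for the duration of the wait itself. */
static int wait_cancellable(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	sigset_t waitmask;

	sigprocmask(SIG_SETMASK, 0, &waitmask);
	sigdelset(&waitmask, SIGRTMIN);

	if (ppoll(&pfd, 1, 0, &waitmask) < 0 && errno == EINTR)
		return -1;	/* cancellation signal arrived */
	return 0;
}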
From my standpoint the simplest and cleanest solution is for vdso to
provide a predicate function that takes a ucontext_t and returns
true/false for whether it represents a state prior to entering (or
reentering, for restart state) the vdso syscall. If vdso exports this
symbol libc can use vdso syscall with cancellation. If not, it can
just fallback to straight inline syscall like now.
Rich
* Rich Felker <[email protected]> wrote:
> > So instead of a sticky cancellation flag, we could introduce a sticky
> > cancellation signal.
> >
> > A 'sticky signal' is not cleared from signal_pending() when the signal handler
> > executes, but it's automatically blocked so no signal handler recursion
> > occurs. (A sticky signal could still be cleared via a separate mechanism, by
> > the cancellation cleanup code.)
> >
> > Such a 'sticky cancellation signal' would, in the racy situation, cause new
> > blocking system calls to immediately return with -EINTR. Non-blocking syscalls
> > could still be used. (So the cancellation signal handler itself would still
> > have access to various fundamental system calls.)
> >
> > I think this would avoid messy coupling between the kernel's increasingly more
> > varied system call entry code and C libraries.
> >
> > Sticky signals could be requested via a new SA_ flag.
> >
> > What do you think?
>
> This still doesn't address the issue that the code making the syscall needs to
> be able to control whether it's cancellable or not. Not only do some syscalls
> whose public functions are cancellation points need to be used internally in
> non-cancellable ways; there's also the pthread_setcancelstate interface that
> allows deferring cancellation so that it's possible to call functions which are
> cancellation points without invoking cancellation.
I don't think there's a problem - but I might be wrong:
One way I think it would work is the following: a sticky signal is not the
cancellation flag - it's a helper construct to implement the flag in user-space in
a race-free way.
Say you have RT signal-32 as the cancellation signal, and it's a sticky signal.
When pthread_cancel() wants to cancel another thread, it first (atomically) sets
the desired cancel state of the target thread. If that state signals that the
thread is cancellable right now, and that we initiated its cancellation, then we
send signal-32. I.e. the signal only ever gets sent if the thread is in a
cancellable state.
libc internal functions and the pthread_setcancelstate() API can temporarily
change the cancel state of a thread to non-cancellable - but pthread_cancel()
observes those state transitions.
The 'sticky' nature of signal-32 will make a difference in the following race
condition, if the cancellation flag is checked before a system call by the C
library, and signal-32 arrives before the system call is executed. In that case
the 'sticky' nature of the signal makes sure that all subsequent system calls
return immediately.
The sticky signal is only ever sent when the thread is in cancellable state - and
if the target thread notices the cancellation request before the signal arrives,
it first waits for its arrival before executing any new system calls (as part of
the teardown, etc.).
So the C library never has to do complex work with a sticky signal pending.
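In rough code, with every name below invented for illustration:

#include <signal.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SIGCANCEL SIGRTMIN	/* whichever RT signal libc reserves */

enum { CANCEL_ENABLED, CANCEL_DISABLED, CANCEL_REQUESTED };

struct thread { pid_t tid; _Atomic int cancelstate; };

static long cancel_thread(struct thread *t)
{
	int expected = CANCEL_ENABLED;

	/* Send the sticky signal only if the target is cancellable
	 * right now and we won the ENABLED -> REQUESTED transition. */
	if (atomic_compare_exchange_strong(&t->cancelstate, &expected,
					   CANCEL_REQUESTED))
		return syscall(SYS_tgkill, getpid(), t->tid, SIGCANCEL);
	return 0;
}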
Does that make more sense to you?
> From my standpoint the simplest and cleanest solution is for vdso to provide a
> predicate function that takes a ucontext_t and returns true/false for whether it
> represents a state prior to entering (or reentering, for restart state) the vdso
> syscall. If vdso exports this symbol libc can use vdso syscall with
> cancellation. If not, it can just fallback to straight inline syscall like now.
Offering such a flag pushes unreasonable conceptual overhead into the vDSO proper
in the long run: right now it might be easy to implement because the code paths
are relatively simple and we can generate the flag passively via RIP checking -
but if the vDSO grows more complex interfaces in the future, we'd essentially have
to track our entry/exit state dynamically which sucks ...
I think the real solution is to push all such overhead to the cancellation API
side: it can track its state, and it can use sticky signals to make sure blocking
system calls return immediately once a cancellation is in progress.
Thanks,
Ingo
On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
>
> * Rich Felker <[email protected]> wrote:
>
> > > So instead of a sticky cancellation flag, we could introduce a sticky
> > > cancellation signal.
> > >
> > > A 'sticky signal' is not cleared from signal_pending() when the signal handler
> > > executes, but it's automatically blocked so no signal handler recursion
> > > occurs. (A sticky signal could still be cleared via a separate mechanism, by
> > > the cancellation cleanup code.)
> > >
> > > Such a 'sticky cancellation signal' would, in the racy situation, cause new
> > > blocking system calls to immediately return with -EINTR. Non-blocking syscalls
> > > could still be used. (So the cancellation signal handler itself would still
> > > have access to various fundamental system calls.)
> > >
> > > I think this would avoid messy coupling between the kernel's increasingly more
> > > varied system call entry code and C libraries.
> > >
> > > Sticky signals could be requested via a new SA_ flag.
> > >
> > > What do you think?
> >
> > This still doesn't address the issue that the code making the syscall needs to
> > be able to control whether it's cancellable or not. Not only do some syscalls
> > whose public functions are cancellation points need to be used internally in
> > non-cancellable ways; there's also the pthread_setcancelstate interface that
> > allows deferring cancellation so that it's possible to call functions which are
> > cancellation points without invoking cancellation.
>
> I don't think there's a problem - but I might be wrong:
>
> One way I think it would work is the following: a sticky signal is not the
> cancellation flag - it's a helper construct to implement the flag in user-space in
> a race-free way.
>
> Say you have RT signal-32 as the cancellation signal, and it's a sticky signal.
>
> When pthread_cancel() wants to cancel another thread, it first (atomically) sets
> the desired cancel state of the target thread. If that state signals that the
> thread is cancellable right now, and that we initiated its cancellation, then we
> send signal-32. I.e. the signal only ever gets sent if the thread is in a
> cancellable state.
>
> libc internal functions and the pthread_setcancelstate() API can temporarily
> change the cancel state of a thread to non-cancellable - but pthread_cancel()
> observes those state transitions.
>
> The 'sticky' nature of signal-32 will make a difference in the following race
> condition, if the cancellation flag is checked before a system call by the C
> library, and signal-32 arrives before the system call is executed. In that case
> the 'sticky' nature of the signal makes sure that all subsequent system calls
> return immediately.
>
> The sticky signal is only ever sent when the thread is in cancellable state - and
> if the target thread notices the cancellation request before the signal arrives,
> it first waits for its arrival before executing any new system calls (as part of
> the teardown, etc.).
>
> So the C library never has to do complex work with a sticky signal pending.
>
> Does that make more sense to you?
No, it doesn't work. Cancellability of the target thread at the time
of the cancellation request (when you would decide whether or not to
send the signal) has no relation to cancellability at the time of
calling the cancellation point. Consider 2 threads A and B and the
following sequence of events:
1. A has cancellation enabled
2. B calls pthread_cancel(A) and sets sticky pending signal
3. A disables cancellation
4. A calls cancellation point and syscall wrongly gets interrupted
This can be solved with more synchronization in pthread_cancel and
pthread_setcancelstate, but it seems costly. pthread_setcancelstate
would have to clear pending sticky cancellation signals, and any
internal non-cancellable syscalls would have to be made using the same
mechanism (effectively calling pthread_setcancelstate). A naive
implementation of such clearing would involve a syscall itself,
defeating the purpose of using the vdso syscall at all (since an extra
syscall costs a lot more than the cycles you save from sysenter vs int
$0x80). It should be possible to track the state of the pending signal
in userspace, so that syscalls to clear it can be avoided except when
it's actually pending, but this requires some very tricky locking to
implement since most of these syscalls have to be async-signal-safe
but would also need to be using locking that synchronizes with the
thread calling pthread_cancel. At worst, implementing such locking
would require blocking all signals before taking the lock, which would
again introduce the requirement of more syscalls.
> > From my standpoint the simplest and cleanest solution is for vdso to provide a
> > predicate function that takes a ucontext_t and returns true/false for whether it
> > represents a state prior to entering (or reentering, for restart state) the vdso
> > syscall. If vdso exports this symbol libc can use vdso syscall with
> > cancellation. If not, it can just fallback to straight inline syscall like now.
>
> Offering such a flag pushes unreasonable conceptual overhead into the vDSO proper
> in the long run: right now it might be easy to implement because the code paths
> are relatively simple and we can generate the flag passively via RIP checking -
> but if the vDSO grows more complex interfaces in the future, we'd essentially have
> to track our entry/exit state dynamically which sucks ...
I don't see what you think it would grow. We're not talking about all
functionality in the vdso, only the vdso syscall/sysenter replacement
(AT_SYSINFO) to be used in place of int $0x80. The only way it would
get more complex is if whole syscalls were being fast-pathed in
userspace, but I think it was already determined that this approach
was wrong and that the vdso should export public symbols (like
__vdso_clock_gettime) instead of transparently fast-pathing them in
userspace via the AT_SYSINFO function. Also, any function that would
be a candidate for fast-pathing in userspace would be a
non-cancellation-point anyway.
Rich
* Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
> On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
> >
> > * Rich Felker <[email protected]> wrote:
> >
> > > > So instead of a sticky cancellation flag, we could introduce a sticky
> > > > cancellation signal.
> > > >
> > > > A 'sticky signal' is not cleared from signal_pending() when the signal handler
> > > > executes, but it's automatically blocked so no signal handler recursion
> > > > occurs. (A sticky signal could still be cleared via a separate mechanism, by
> > > > the cancellation cleanup code.)
> > > >
> > > > Such a 'sticky cancellation signal' would, in the racy situation, cause new
> > > > blocking system calls to immediately return with -EINTR. Non-blocking syscalls
> > > > could still be used. (So the cancellation signal handler itself would still
> > > > have access to various fundamental system calls.)
> > > >
> > > > I think this would avoid messy coupling between the kernel's increasingly more
> > > > varied system call entry code and C libraries.
> > > >
> > > > Sticky signals could be requested via a new SA_ flag.
> > > >
> > > > What do you think?
> > >
> > > This still doesn't address the issue that the code making the syscall needs to
> > > be able to control whether it's cancellable or not. Not only do some syscalls
> > > whose public functions are cancellation points need to be used internally in
> > > non-cancellable ways; there's also the pthread_setcancelstate interface that
> > > allows deferring cancellation so that it's possible to call functions which are
> > > cancellation points without invoking cancellation.
> >
> > I don't think there's a problem - but I might be wrong:
> >
> > One way I think it would work is the following: a sticky signal is not the
> > cancellation flag - it's a helper construct to implement the flag in user-space in
> > a race-free way.
> >
> > Say you have RT signal-32 as the cancellation signal, and it's a sticky signal.
> >
> > When pthread_cancel() wants to cancel another thread, it first (atomically) sets
> > the desired cancel state of the target thread. If that state signals that the
> > thread is cancellable right now, and that we initiated its cancellation, then we
> > send signal-32. I.e. the signal only ever gets sent if the thread is in a
> > cancellable state.
> >
> > libc internal functions and the pthread_setcancelstate() API can temporarily
> > change the cancel state of a thread to non-cancellable - but pthread_cancel()
> > observes those state transitions.
> >
> > The 'sticky' nature of signal-32 will make a difference in the following race
> > condition, if the cancellation flag is checked before a system call by the C
> > library, and signal-32 arrives before the system call is executed. In that case
> > the 'sticky' nature of the signal makes sure that all subsequent system calls
> > return immediately.
> >
> > The sticky signal is only ever sent when the thread is in cancellable state - and
> > if the target thread notices the cancellation request before the signal arrives,
> > it first waits for its arrival before executing any new system calls (as part of
> > the teardown, etc.).
> >
> > So the C library never has to do complex work with a sticky signal pending.
> >
> > Does that make more sense to you?
>
> No, it doesn't work. Cancellability of the target thread at the time
> of the cancellation request (when you would decide whether or not to
> send the signal) has no relation to cancellability at the time of
> calling the cancellation point. Consider 2 threads A and B and the
> following sequence of events:
>
> 1. A has cancellation enabled
> 2. B calls pthread_cancel(A) and sets sticky pending signal
> 3. A disables cancellation
> 4. A calls cancellation point and syscall wrongly gets interrupted
>
> This can be solved with more synchronization in pthread_cancel and
> pthread_setcancelstate, but it seems costly. pthread_setcancelstate
> would have to clear pending sticky cancellation signals, and any
> internal non-cancellable syscalls would have to be made using the same
> mechanism (effectively calling pthread_setcancelstate). A naive
> implementation of such clearing would involve a syscall itself,
i think a syscall in setcancelstate in case of pending sticky signal
is not that bad given that cancellation is very rarely used.
however maintaining two completely different cancellation designs
is expensive and only the current one works on old kernels.
> defeating the purpose of using the vdso syscall at all (since an extra
> syscall costs a lot more than the cycles you save from sysenter vs int
> $0x80). It should be possible to track the state of the pending signal
> in userspace, so that syscalls to clear it can be avoided except when
> it's actually pending, but this requires some very tricky locking to
> implement since most of these syscalls have to be async-signal-safe
> but would also need to be using locking that synchronizes with the
> thread calling pthread_cancel. At worst, implementing such locking
> would require blocking all signals before taking the lock, which would
> again introduce the requirement of more syscalls.
>
> > > From my standpoint the simplest and cleanest solution is for vdso to provide a
> > > predicate function that takes a ucontext_t and returns true/false for whether it
> > > represents a state prior to entering (or reentering, for restart state) the vdso
> > > syscall. If vdso exports this symbol libc can use vdso syscall with
> > > cancellation. If not, it can just fallback to straight inline syscall like now.
> >
> > Offering such a flag pushes unreasonable conceptual overhead into the vDSO proper
> > in the long run: right now it might be easy to implement because the code paths
> > are relatively simple and we can generate the flag passively via RIP checking -
> > but if the vDSO grows more complex interfaces in the future, we'd essentially have
> > to track our entry/exit state dynamically which sucks ...
>
> I don't see what you think it would grow. We're not talking about all
> functionality in the vdso, only the vdso syscall/sysenter replacement
> (AT_SYSINFO) to be used in place of int $0x80. The only way it would
> get more complex is if whole syscalls were being fast-pathed in
> userspace, but I think it was already determined that this approach
> was wrong and that the vdso should export public symbols (like
> __vdso_clock_gettime) instead of transparently fast-pathing them in
> userspace via the AT_SYSINFO function. Also, any function that would
> be a candidate for fast-pathing in userspace would be a
> non-cancellation-point anyway.
>
> Rich
On Fri, Mar 11, 2016 at 01:18:54AM +0100, Szabolcs Nagy wrote:
> * Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
> > On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
> > >
> > > * Rich Felker <[email protected]> wrote:
> > >
> > > > > So instead of a sticky cancellation flag, we could introduce a sticky
> > > > > cancellation signal.
> > > > >
> > > > > A 'sticky signal' is not cleared from signal_pending() when the signal handler
> > > > > executes, but it's automatically blocked so no signal handler recursion
> > > > > occurs. (A sticky signal could still be cleared via a separate mechanism, by
> > > > > the cancellation cleanup code.)
> > > > >
> > > > > Such a 'sticky cancellation signal' would, in the racy situation, cause new
> > > > > blocking system calls to immediately return with -EINTR. Non-blocking syscalls
> > > > > could still be used. (So the cancellation signal handler itself would still
> > > > > have access to various fundamental system calls.)
> > > > >
> > > > > I think this would avoid messy coupling between the kernel's increasingly more
> > > > > varied system call entry code and C libraries.
> > > > >
> > > > > Sticky signals could be requested via a new SA_ flag.
> > > > >
> > > > > What do you think?
> > > >
> > > > This still doesn't address the issue that the code making the syscall needs to
> > > > be able to control whether it's cancellable or not. Not only do some syscalls
> > > > whose public functions are cancellation points need to be used internally in
> > > > non-cancellable ways; there's also the pthread_setcancelstate interface that
> > > > allows deferring cancellation so that it's possible to call functions which are
> > > > cancellation points without invoking cancellation.
> > >
> > > I don't think there's a problem - but I might be wrong:
> > >
> > > One way I think it would work is the following: a sticky signal is not the
> > > cancellation flag - it's a helper construct to implement the flag in user-space in
> > > a race-free way.
> > >
> > > Say you have RT signal-32 as the cancellation signal, and it's a sticky signal.
> > >
> > > When pthread_cancel() wants to cancel another thread, it first (atomically) sets
> > > the desired cancel state of the target thread. If that state signals that the
> > > thread is cancellable right now, and that we initiated its cancellation, then we
> > > send signal-32. I.e. the signal only ever gets sent if the thread is in a
> > > cancellable state.
> > >
> > > libc internal functions and the pthread_setcancelstate() API can temporarily
> > > change the cancel state of a thread to non-cancellable - but pthread_cancel()
> > > observes those state transitions.
> > >
> > > The 'sticky' nature of signal-32 will make a difference in the following race
> > > condition, if the cancellation flag is checked before a system call by the C
> > > library, and signal-32 arrives before the system call is executed. In that case
> > > the 'sticky' nature of the signal makes sure that all subsequent system calls
> > > return immediately.
> > >
> > > The sticky signal is only ever sent when the thread is in cancellable state - and
> > > if the target thread notices the cancellation request before the signal arrives,
> > > it first waits for its arrival before executing any new system calls (as part of
> > > the teardown, etc.).
> > >
> > > So the C library never has to do complex work with a sticky signal pending.
> > >
> > > Does that make more sense to you?
> >
> > No, it doesn't work. Cancellability of the target thread at the time
> > of the cancellation request (when you would decide whether or not to
> > send the signal) has no relation to cancellability at the time of
> > calling the cancellation point. Consider 2 threads A and B and the
> > following sequence of events:
> >
> > 1. A has cancellation enabled
> > 2. B calls pthread_cancel(A) and sets sticky pending signal
> > 3. A disables cancellation
> > 4. A calls cancellation point and syscall wrongly gets interrupted
> >
> > This can be solved with more synchronization in pthread_cancel and
> > pthread_setcancelstate, but it seems costly. pthread_setcancelstate
> > would have to clear pending sticky cancellation signals, and any
> > internal non-cancellable syscalls would have to be made using the same
> > mechanism (effectively calling pthread_setcancelstate). A naive
> > implementation of such clearing would involve a syscall itself,
>
> I think a syscall in setcancelstate in the case of a pending sticky
> signal is not that bad, given that cancellation is very rarely used.
I agree, but it's not clear to me whether you could eliminate syscalls
in the case where it's not pending, since AS-safe lock machinery is
hard to get right. I don't see a way it can be done with just atomics
because the syscall that sends the signal cannot be atomic with the
memory operation setting a flag, which suggests a lock is needed, and
then there are all sorts of issues to deal with.
> however, maintaining two completely different cancellation designs
> is expensive, and only the current one works on old kernels.
Indeed. I think it would be hard to justify supporting a new one in
musl unless there's some easy way to isolate the complexity of having
both, being that the vdso syscall is of marginal value to begin with
anyway...
Rich
On Thu, Mar 10, 2016 at 4:48 PM, Rich Felker <[email protected]> wrote:
> On Fri, Mar 11, 2016 at 01:18:54AM +0100, Szabolcs Nagy wrote:
>> * Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
>> > On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
>> > >
>> > > * Rich Felker <[email protected]> wrote:
>> > >
>> > > > > So instead of a sticky cancellation flag, we could introduce a sticky
>> > > > > cancellation signal.
>> > > > >
>> > > > > A 'sticky signal' is not cleared from signal_pending() when the signal handler
>> > > > > executes, but it's automatically blocked so no signal handler recursion
>> > > > > occurs. (A sticky signal could still be cleared via a separate mechanism, by
>> > > > > the cancellation cleanup code.)
>> > > > >
>> > > > > Such a 'sticky cancellation signal' would, in the racy situation, cause new
>> > > > > blocking system calls to immediately return with -EINTR. Non-blocking syscalls
>> > > > > could still be used. (So the cancellation signal handler itself would still
>> > > > > have access to various fundamental system calls.)
>> > > > >
>> > > > > I think this would avoid messy coupling between the kernel's increasingly more
>> > > > > varied system call entry code and C libraries.
>> > > > >
>> > > > > Sticky signals could be requested via a new SA_ flag.
>> > > > >
>> > > > > What do you think?
>> > > >
>> > > > This still doesn't address the issue that the code making the syscall needs to
>> > > > be able to control whether it's cancellable or not. Not only do some syscalls
>> > > > whose public functions are cancellation points need to be used internally in
>> > > > non-cancellable ways; there's also the pthread_setcancelstate interface that
>> > > > allows deferring cancellation so that it's possible to call functions which are
>> > > > cancellation points without invoking cancellation.
>> > >
>> > > I don't think there's a problem - but I might be wrong:
>> > >
>> > > One way I think it would work is the following: a sticky signal is not the
>> > > cancellation flag - it's a helper construct to implement the flag in user-space in
>> > > a race-free way.
>> > >
>> > > Say you have RT signal-32 as the cancellation signal, and it's a sticky signal.
>> > >
>> > > When pthread_cancel() wants to cancel another thread, it first (atomically) sets
>> > > the desired cancel state of the target thread. If that state signals that the
>> > > thread is cancellable right now, and that we initiated its cancellation, then we
>> > > send signal-32. I.e. the signal only ever gets sent if the thread is in a
>> > > cancellable state.
>> > >
>> > > libc internal functions and the pthread_setcancelstate() API can temporarily
>> > > change the cancel state of a thread to non-cancellable - but pthread_cancel()
>> > > observes those state transitions.
>> > >
>> > > The 'sticky' nature of signal-32 will make a difference in the following race
>> > > condition, if the cancellation flag is checked before a system call by the C
>> > > library, and signal-32 arrives before the system call is executed. In that case
>> > > the 'sticky' nature of the signal makes sure that all subsequent system calls
>> > > return immediately.
>> > >
>> > > The sticky signal is only ever sent when the thread is in cancellable state - and
>> > > if the target thread notices the cancellation request before the signal arrives,
>> > > it first waits for its arrival before executing any new system calls (as part of
>> > > the teardown, etc.).
>> > >
>> > > So the C library never has to do complex work with a sticky signal pending.
>> > >
>> > > Does that make more sense to you?
>> >
>> > No, it doesn't work. Cancellability of the target thread at the time
>> > of the cancellation request (when you would decide whether or not to
>> > send the signal) has no relation to cancellability at the time of
>> > calling the cancellation point. Consider 2 threads A and B and the
>> > following sequence of events:
>> >
>> > 1. A has cancellation enabled
>> > 2. B calls pthread_cancel(A) and sets sticky pending signal
>> > 3. A disables cancellation
>> > 4. A calls cancellation point and syscall wrongly gets interrupted
>> >
>> > This can be solved with more synchronization in pthread_cancel and
>> > pthread_setcancelstate, but it seems costly. pthread_setcancelstate
>> > would have to clear pending sticky cancellation signals, and any
>> > internal non-cancellable syscalls would have to be made using the same
>> > mechanism (effectively calling pthread_setcancelstate). A naive
>> > implementation of such clearing would involve a syscall itself,
>>
>> I think a syscall in setcancelstate in the case of a pending sticky
>> signal is not that bad, given that cancellation is very rarely used.
>
> I agree, but it's not clear to me whether you could eliminate syscalls
> in the case where it's not pending, since AS-safe lock machinery is
> hard to get right. I don't see a way it can be done with just atomics
> because the syscall that sends the signal cannot be atomic with the
> memory operation setting a flag, which suggests a lock is needed, and
> then there are all sorts of issues to deal with.
>
>> however, maintaining two completely different cancellation designs
>> is expensive, and only the current one works on old kernels.
>
> Indeed. I think it would be hard to justify supporting a new one in
> musl unless there's some easy way to isolate the complexity of having
> both, being that the vdso syscall is of marginal value to begin with
> anyway...
I would argue that vdso syscalls are of considerably more than
marginal utility. They are vastly faster. The difference isn't
subtle.
However... while it seems straightforward that a pthread cancellation
implementation should be correct, is there any reason that it needs to
be fast? After all, musl could always do:
if (this thread is cancellable right now)
        use int $0x80 and eat the performance hit
else
        call AT_SYSINFO
Aside from a branch, this adds minimal overhead to sane programs, and
programs crazy enough to try to use pthread cancellation get penalized
on x86_32.
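[Editor's note: rendered as C, that branch might look like the sketch
below. The asm is illustrative x86-32 only, and 'sysinfo' stands for
the AT_SYSINFO value saved from the auxiliary vector at startup.]

#include <elf.h>	/* AT_SYSINFO */
#include <sys/auxv.h>

static unsigned long sysinfo;	/* set once: sysinfo = getauxval(AT_SYSINFO); */

static long raw_syscall3(long nr, long a, long b, long c, int cancellable)
{
	long ret;
	if (cancellable || !sysinfo) {
		/* fixed, known EIP: the cancellation signal handler can
		   test whether it interrupted this exact instruction */
		__asm__ volatile ("int $0x80"
			: "=a"(ret)
			: "a"(nr), "b"(a), "c"(b), "d"(c)
			: "memory");
	} else {
		/* fast path through the vDSO's AT_SYSINFO entry point */
		__asm__ volatile ("call *%[vs]"
			: "=a"(ret)
			: [vs] "m"(sysinfo), "a"(nr), "b"(a), "c"(b), "d"(c)
			: "memory");
	}
	return ret;
}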
If I read it right, that's what musl already does. But I could be
reading it wrong:
	if ((st=(self=__pthread_self())->canceldisable)
	    && (st==PTHREAD_CANCEL_DISABLE || nr==SYS_close))
		return __syscall(nr, u, v, w, x, y, z);
	... slow path ...
Figuring out what "(st=(self=__pthread_self())->canceldisable) && ..."
does and why requires more cross-referencing than I care to do right
now. Shouldn't that "&&" at least be a "," or perhaps just be a
separate statement? On glibc, at least, PTHREAD_CANCEL_DISABLE == 1,
so this looks nearly tautological.
--Andy
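[Editor's note: to answer the question above, as far as I can
reconstruct from musl's sources (this reading is mine, not confirmed in
the thread): canceldisable is not a boolean. Besides 0 (enabled) and
PTHREAD_CANCEL_DISABLE (1), musl appears to have an internal "masked"
state, so the && is not tautological:]

	/* st==0: cancellation enabled -> take the cancellable slow path.
	   st==PTHREAD_CANCEL_DISABLE: plain direct syscall.
	   st==internal "masked" state: plain direct syscall only for
	   close(), which must really close the fd; other syscalls still
	   take the slow path so they can report the cancellation. */
	if ((st=(self=__pthread_self())->canceldisable)
	    && (st==PTHREAD_CANCEL_DISABLE || nr==SYS_close))
		return __syscall(nr, u, v, w, x, y, z);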
* Rich Felker <[email protected]> [2016-03-10 19:48:59 -0500]:
> On Fri, Mar 11, 2016 at 01:18:54AM +0100, Szabolcs Nagy wrote:
> > * Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
> > > On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
> > > >
> > > > The sticky signal is only ever sent when the thread is in cancellable state - and
> > > > if the target thread notices the cancellation request before the signal arrives,
^^^^^^...
> > > > it first waits for its arrival before executing any new system calls (as part of
^^^^^^...
> > > > the teardown, etc.).
> > > >
> > > > So the C library never has to do complex work with a sticky signal pending.
> > > >
> > > > Does that make more sense to you?
> > >
> > > No, it doesn't work. Cancellability of the target thread at the time
> > > of the cancellation request (when you would decide whether or not to
> > > send the signal) has no relation to cancellability at the time of
> > > calling the cancellation point. Consider 2 threads A and B and the
> > > following sequence of events:
> > >
> > > 1. A has cancellation enabled
> > > 2. B calls pthread_cancel(A) and sets sticky pending signal
> > > 3. A disables cancellation
> > > 4. A calls cancellation point and syscall wrongly gets interrupted
> > >
> > > This can be solved with more synchronization in pthread_cancel and
> > > pthread_setcancelstate, but it seems costly. pthread_setcancelstate
> > > would have to clear pending sticky cancellation signals, and any
> > > internal non-cancellable syscalls would have to be made using the same
> > > mechanism (effectively calling pthread_setcancelstate). A naive
> > > implementation of such clearing would involve a syscall itself,
> >
> > I think a syscall in setcancelstate in the case of a pending sticky
> > signal is not that bad, given that cancellation is very rarely used.
>
> I agree, but it's not clear to me whether you could eliminate syscalls
> in the case where it's not pending, since AS-safe lock machinery is
> hard to get right. I don't see a way it can be done with just atomics
> because the syscall that sends the signal cannot be atomic with the
> memory operation setting a flag, which suggests a lock is needed, and
> then there are all sorts of issues to deal with.
>
I think this is not a problem, and the marked text above hints at a
solution: just call pause() to wait for the sticky signal if
self->cancelstate indicates that there is one coming or pending.
t->cancelstate always has to be atomically modified, but sending
the sticky signal can be delayed (it does not have to be atomic with
the memory op).
(of course there might be other caveats, and it certainly needs
more atomic ops and more state than the current design)
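[Editor's note: a minimal sketch of the setcancelstate side of this
idea, with invented names (CANCEL_SIG_PENDING exists nowhere); note
that the follow-up message below retracts the idea because pause() can
be woken by an unrelated signal.]

#include <stdatomic.h>
#include <unistd.h>

#define CANCEL_SIG_PENDING 0x100	/* hypothetical flag bit */

struct pthread { _Atomic int cancelstate; };

static void setcancelstate_sketch(struct pthread *self, int new_state)
{
	int old = atomic_exchange(&self->cancelstate, new_state);
	if (old & CANCEL_SIG_PENDING)
		pause();	/* wait for the in-flight sticky signal;
				   racy: any other signal ends pause() too */
}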
* Szabolcs Nagy <[email protected]> [2016-03-11 02:39:47 +0100]:
> * Rich Felker <[email protected]> [2016-03-10 19:48:59 -0500]:
> > On Fri, Mar 11, 2016 at 01:18:54AM +0100, Szabolcs Nagy wrote:
> > > * Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
> > > > On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
> > > > >
> > > > > The sticky signal is only ever sent when the thread is in cancellable state - and
> > > > > if the target thread notices the cancellation request before the signal arrives,
> ^^^^^^...
> > > > > it first waits for its arrival before executing any new system calls (as part of
> ^^^^^^...
> > > > > the teardown, etc.).
> > > > >
> > > > > So the C library never has to do complex work with a sticky signal pending.
> > > > >
> > > > > Does that make more sense to you?
> > > >
> > > > No, it doesn't work. Cancellability of the target thread at the time
> > > > of the cancellation request (when you would decide whether or not to
> > > > send the signal) has no relation to cancellability at the time of
> > > > calling the cancellation point. Consider 2 threads A and B and the
> > > > following sequence of events:
> > > >
> > > > 1. A has cancellation enabled
> > > > 2. B calls pthread_cancel(A) and sets sticky pending signal
> > > > 3. A disables cancellation
> > > > 4. A calls cancellation point and syscall wrongly gets interrupted
> > > >
> > > > This can be solved with more synchronization in pthread_cancel and
> > > > pthread_setcancelstate, but it seems costly. pthread_setcancelstate
> > > > would have to clear pending sticky cancellation signals, and any
> > > > internal non-cancellable syscalls would have to be made using the same
> > > > mechanism (effectively calling pthread_setcancelstate). A naive
> > > > implementation of such clearing would involve a syscall itself,
> > >
> > > I think a syscall in setcancelstate in the case of a pending sticky
> > > signal is not that bad, given that cancellation is very rarely used.
> >
> > I agree, but it's not clear to me whether you could eliminate syscalls
> > in the case where it's not pending, since AS-safe lock machinery is
> > hard to get right. I don't see a way it can be done with just atomics
> > because the syscall that sends the signal cannot be atomic with the
> > memory operation setting a flag, which suggests a lock is needed, and
> > then there are all sorts of issues to deal with.
> >
>
> I think this is not a problem, and the marked text above hints at a
> solution: just call pause() to wait for the sticky signal if
> self->cancelstate indicates that there is one coming or pending.
>
> t->cancelstate always has to be atomically modified, but sending
> the sticky signal can be delayed (it does not have to be atomic with
> the memory op).
>
I take this back: if a signal arrives between the check of
self->cancelstate and the pause() in setcancelstate, that can
cause problems (the sticky signal will end up hitting something
other than the pause()).
On Fri, Mar 11, 2016 at 02:39:47AM +0100, Szabolcs Nagy wrote:
> * Rich Felker <[email protected]> [2016-03-10 19:48:59 -0500]:
> > On Fri, Mar 11, 2016 at 01:18:54AM +0100, Szabolcs Nagy wrote:
> > > * Rich Felker <[email protected]> [2016-03-10 18:28:20 -0500]:
> > > > On Thu, Mar 10, 2016 at 07:03:31PM +0100, Ingo Molnar wrote:
> > > > >
> > > > > The sticky signal is only ever sent when the thread is in cancellable state - and
> > > > > if the target thread notices the cancellation request before the signal arrives,
> ^^^^^^...
> > > > > it first waits for its arrival before executing any new system calls (as part of
> ^^^^^^...
> > > > > the teardown, etc.).
> > > > >
> > > > > So the C library never has to do complex work with a sticky signal pending.
> > > > >
> > > > > Does that make more sense to you?
> > > >
> > > > No, it doesn't work. Cancellability of the target thread at the time
> > > > of the cancellation request (when you would decide whether or not to
> > > > send the signal) has no relation to cancellability at the time of
> > > > calling the cancellation point. Consider 2 threads A and B and the
> > > > following sequence of events:
> > > >
> > > > 1. A has cancellation enabled
> > > > 2. B calls pthread_cancel(A) and sets sticky pending signal
> > > > 3. A disables cancellation
> > > > 4. A calls cancellation point and syscall wrongly gets interrupted
> > > >
> > > > This can be solved with more synchronization in pthread_cancel and
> > > > pthread_setcancelstate, but it seems costly. pthread_setcancelstate
> > > > would have to clear pending sticky cancellation signals, and any
> > > > internal non-cancellable syscalls would have to be made using the same
> > > > mechanism (effectively calling pthread_setcancelstate). A naive
> > > > implementation of such clearing would involve a syscall itself,
> > >
> > > I think a syscall in setcancelstate in the case of a pending sticky
> > > signal is not that bad, given that cancellation is very rarely used.
> >
> > I agree, but it's not clear to me whether you could eliminate syscalls
> > in the case where it's not pending, since AS-safe lock machinery is
> > hard to get right. I don't see a way it can be done with just atomics
> > because the syscall that sends the signal cannot be atomic with the
> > memory operation setting a flag, which suggests a lock is needed, and
> > then there are all sorts of issues to deal with.
>
> I think this is not a problem, and the marked text above hints at a
> solution: just call pause() to wait for the sticky signal if
> self->cancelstate indicates that there is one coming or pending.
There are multiple problems with this approach, at least:
- pause does not 'consume' the signal; sigwaitinfo might.
- pause might return on a different signal that happens to arrive
between setting the flag and sending the cancel signal
- If the thread calling pthread_cancel is interrupted by a signal
after setting the flag but before sending the signal, the target
thread may be arbitrarily delayed; in complex cases it may even
deadlock. This should be easy to solve though by having
pthread_cancel run with signals masked.
> t->cancelstate always has to be atomically modified, but sending
> the sticky signal can be delayed (it does not have to be atomic with
> the memory op).
Right.
> (of course there might be other caveats, and it certainly needs
> more atomic ops and more state than the current design)
I think it might be possible to do this by having pthread_cancel run
with signals blocked and having sigwaitinfo consume the sticky signal
if the atomically set cancellation-pending flag was seen, but I haven't
thought about all the corner cases of signal handlers and nested
cancellation points. POSIX may make the behavior of the affected cases
undefined, though. So I think solving this might be plausible, but
nontrivial.
Rich
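[Editor's note: the "consume" step Rich mentions could look roughly
like this. SIGCANCEL is a hypothetical reserved signal, and the
precondition is the one stated above: the signal is blocked in this
thread, so sigwaitinfo() can dequeue it. This is a sketch of the one
idea, not a solution to the corner cases listed.]

#include <signal.h>

#define SIGCANCEL (SIGRTMIN+0)	/* hypothetical reserved RT signal */

static void consume_sticky_cancel(void)
{
	sigset_t set;
	siginfo_t si;

	sigemptyset(&set);
	sigaddset(&set, SIGCANCEL);
	/* with SIGCANCEL blocked, this dequeues ("consumes") the pending
	   sticky signal instead of letting its handler run */
	while (sigwaitinfo(&set, &si) < 0)
		;	/* retry if interrupted by some other signal */
}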
* Rich Felker <[email protected]> wrote:
> No, it doesn't work. Cancellability of the target thread at the time
> of the cancellation request (when you would decide whether or not to
> send the signal) has no relation to cancellability at the time of
> calling the cancellation point. Consider 2 threads A and B and the
> following sequence of events:
>
> 1. A has cancellation enabled
> 2. B calls pthread_cancel(A) and sets sticky pending signal
> 3. A disables cancellation
> 4. A calls cancellation point and syscall wrongly gets interrupted
As I (tried to!) explain when describing the cancellation signal, if a
cancellation signal is in flight, it must be waited for in the unlikely event of
cancellation being disabled in the small window where the signal is sent.
So in your above example, it would do:
> 1. A has cancellation enabled
> 2. B calls pthread_cancel(A) and sets sticky pending signal
> 3. A disables cancellation
3b. Notices that a cancellation request is pending, waits for it,
and clears the sticky signal.
4. A calls cancellation point and syscall correctly executes
5. Once A enables cancellation again, the cancellation propagates.
So I still see no problem.
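[Editor's note: Ingo's protocol, rendered as a sketch of the
pthread_cancel() side. All names (CANCEL_REQUESTED, CANCEL_DISABLED,
SIGCANCEL, struct tcb) are invented for illustration; the point is only
that the request flag is set atomically first, and the sticky signal is
sent only if the target was observed to be cancellable.]

#include <pthread.h>
#include <signal.h>
#include <stdatomic.h>

#define CANCEL_REQUESTED 1		/* hypothetical state bits */
#define CANCEL_DISABLED  2
#define SIGCANCEL (SIGRTMIN+0)		/* hypothetical sticky signal */

struct tcb { _Atomic int cancelstate; pthread_t id; };

static void pthread_cancel_sketch(struct tcb *t)
{
	/* atomically record the request and observe the target's state */
	int old = atomic_fetch_or(&t->cancelstate, CANCEL_REQUESTED);
	if (!(old & CANCEL_DISABLED))
		pthread_kill(t->id, SIGCANCEL);	/* sticky: stays pending */
}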
> This can be solved with more synchronization in pthread_cancel and
> pthread_setcancelstate, but it seems costly. [...]
An active signal round trip in itself is very costly (thousands of cycles), a
thread exit is tens of thousands of cycles, and this is a 'slow path' anyway, and
the window is small in any case.
It's just a correctness synchronization to make sure no sticky signal is pending,
not a real performance concern in practice.
Thanks,
Ingo
* Ingo Molnar <[email protected]> [2016-03-11 10:33:47 +0100]:
> * Rich Felker <[email protected]> wrote:
>
> > No, it doesn't work. Cancellability of the target thread at the time
> > of the cancellation request (when you would decide whether or not to
> > send the signal) has no relation to cancellability at the time of
> > calling the cancellation point. Consider 2 threads A and B and the
> > following sequence of events:
> >
> > 1. A has cancellation enabled
> > 2. B calls pthread_cancel(A) and sets sticky pending signal
> > 3. A disables cancellation
> > 4. A calls cancellation point and syscall wrongly gets interrupted
>
> As I (tried to!) describe it when describing the cancellation signal, if a
> cancellation signal is in flight, it must be waited for in the unlikely event of
> cancellation being disabled in the small window where the signal is sent.
>
> So in your above example, it would do:
>
> > 1. A has cancellation enabled
> > 2. B calls pthread_cancel(A) and sets sticky pending signal
Blocking signals here is OK.
> > 3. A disables cancellation
Blocking signals here is not OK. (libc changes the cancel state in
many places; there should be no syscall in that path.)
> 3b. Notices that a cancellation request is pending, waits for it,
> and clears the sticky signal.
setcancelstate can be reentered between 'noticing' and 'waiting'
if it is interrupted by a signal. The state change from
expect-pending-signal to no-pending-signal cannot be atomic wrt
sigwaitinfo unless signals are blocked.
What I didn't think about yesterday is that it is OK and possible
to only block signals if there was a cancel. (It is not trivial,
since all the cancel-related state changes have to be atomic and
there are at least canceled, signaled, cancelstate and canceltype,
which have to fit into 32 bits and be managed together.)
> 4. A calls cancellation point and syscall correctly executes
> 5. Once A enables cancellation again, the cancellation propagates.
>
> So I still see no problem.
>
I think the sticky signal design would work, but it is more complex
than what we have, adds some atomic RMW ops into common code paths,
and is not backward compatible.
Not using vsyscalls for cancellation points sounds easier.
> > This can be solved with more synchronization in pthread_cancel and
> > pthread_setcancelstate, but it seems costly. [...]
>
> An active signal round trip in itself is very costly (thousands of cycles), a
> thread exit is tens of thousands of cycles, and this is a 'slow path' anyway, and
> the window is small in any case.
>
> It's just a correctness synchronization to make sure no sticky signal is pending,
> not a real performance concern in practice.
>
> Thanks,
>
> Ingo
On Fri, Mar 11, 2016 at 3:39 AM, Szabolcs Nagy <[email protected]> wrote:
>
> I think the sticky signal design would work, but it is more
> complex than what we have, adds some atomic RMW ops into
> common code paths, and is not backward compatible.
>
> Not using vsyscalls for cancellation points sounds easier.
Hmm. Ok, so I think I understand your needs, and your current model
does sound easier. But the cost of not using vsyscalls is really quite
high.
It sounds like the main worry is that some system calls are guaranteed
cancellation points, and if the signal slips in between your
cancellation point check and the system call, you lose that ability.
I'm assuming that if the "canceltype" is asynchronous, you never have
this problem, because the cancellation can be done in the signal
handler itself, which avoids the whole race.
Am I getting closer to understanding the particular semantics you are
looking for?
Because if that's the case, I wonder if what you really want is not
"sticky signals" as much as "synchronous signals" - ie the ability to
say that a signal shouldn't ever interrupt in random places, but only
at well-defined points (where a system call would be one such point -
are there others?)
So then you could make "pthread_setcanceltype()" just set that flag
for the cancellation signal, and just know that the signal itself will
always be deferred to such a synchronous point (ie system call entry).
We already have the ability to catch things at system call entry
(ptrace needs it, for example), so we could possibly make our signal
delivery have a mode where a signal does *not* cause user space
execution to be interrupted by a signal handler, but instead just sets
a bit in the thread info state that then causes the next system call
to take the signal.
Linus
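[Editor's note: from userspace, opting in to what Linus describes might
look like the following. SA_SYNCHRONOUS is the proposed flag and does
not exist; its value here is invented, and SIGCANCEL is a hypothetical
reserved signal.]

#include <signal.h>

#define SIGCANCEL (SIGRTMIN+0)		/* hypothetical reserved signal */
#define SA_SYNCHRONOUS 0x01000000	/* proposed flag, value invented */

static void install_cancel_handler(void (*h)(int, siginfo_t *, void *))
{
	struct sigaction sa = {0};
	sa.sa_sigaction = h;
	/* with the proposed flag, delivery would be deferred to the
	   thread's next system call entry instead of interrupting
	   arbitrary user code */
	sa.sa_flags = SA_SIGINFO | SA_RESTART | SA_SYNCHRONOUS;
	sigfillset(&sa.sa_mask);
	sigaction(SIGCANCEL, &sa, 0);
}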
On Fri, Mar 11, 2016 at 11:27 AM, Linus Torvalds
<[email protected]> wrote:
> On Fri, Mar 11, 2016 at 3:39 AM, Szabolcs Nagy <[email protected]> wrote:
>>
>> I think the sticky signal design would work, but it is more
>> complex than what we have, adds some atomic RMW ops into
>> common code paths, and is not backward compatible.
>>
>> Not using vsyscalls for cancellation points sounds easier.
>
> Hmm. Ok, so I think I understand your needs, and your current model
> does sound easier. But the cost of not using vsyscalls is really quite
> high.
>
> It sounds like the main worry is that some system calls are guaranteed
> cancellation points, and if the signal slips in between your
> cancellation point check and the system call, you lose that ability.
>
> I'm assuming that if the "canceltype" is asynchronous, you never have
> this problem, because the cancellation can be done in the signal
> handler itself, which avoids the whole race.
>
> Am I getting closer to understanding the particular semantics you are
> looking for?
>
> Because if that's the case, I wonder if what you really want is not
> "sticky signals" as much as "synchronous signals" - ie the ability to
> say that a signal shouldn't ever interrupt in random places, but only
> at well-defined points (where a system call would be one such point -
> are there others?)
>
> So then you could make "pthread_setcanceltype()" just set that flag
> for the cancellation signal, and just know that the signal itself will
> always be deferred to such a synchronous point (ie system call entry).
>
> We already have the ability to catch things at system call entry
> (ptrace needs it, for example), so we could possibly make our signal
> delivery have a mode where a signal does *not* cause user space
> execution to be interrupted by a signal handler, but instead just sets
> a bit in the thread info state that then causes the next system call
> to take the signal.
I think that this would almost work for musl, except that musl would
still need to be able to tell whether the syscall that eventually gets
interrupted is a cancellation point, which still may require some
ability to unwind from the vdso. The syscall handler can easily tell
the syscall number (it's in EAX), but it may need the effective EIP as
well.
--Andy
On Fri, Mar 11, 2016 at 11:30 AM, Andy Lutomirski <[email protected]> wrote:
>
> I think that this would almost work for musl, except that musl would
> still need to be able to tell whether the syscall that eventually gets
> interrupted is a cancellation point, which still may require some
> ability to unwind from the vdso. The syscall handler can easily tell
> the syscall number (it's in EAX), but it may need the effective EIP as
> well.
So having tried to read the POSIX manual pages on this, it looks like
there is a list of *minimal* cancellation points, but that saying "any
system call is a cancellation point" is also perfectly valid.
"An implementation may also mark other functions not specified in the
standard as cancellation points"
Of course, musl may have stricter ideas than that on cancellation
points. The "any system call" rule would make even trivial non-blocking
ones like "futex_wake()" and "getpid()" cancellation points. So
maybe "any system call" isn't acceptable.
But if it *is* acceptable, that would be a pretty simple kernel mod, I think.
And I could see others possibly wanting to use synchronous signal
handlers. It's not like musl is the only project ever to have had
races with signals...
Linus
On Fri, Mar 11, 2016 at 11:39 AM, Linus Torvalds
<[email protected]> wrote:
>
> "An implementation may also mark other functions not specified in the
> standard as cancellation points"
.. but that was from the Linux man-page. The open group has
"An implementation shall not introduce cancellation points into any
other functions specified in this volume of POSIX.1-2008"
So yeah, it looks like there would need to be some way to filter things.
Oh well.
Linus
* Linus Torvalds <[email protected]> wrote:
> [...]
>
> Because if that's the case, I wonder if what you really want is not "sticky
> signals" as much as "synchronous signals" - ie the ability to say that a signal
> shouldn't ever interrupt in random places, but only at well-defined points
> (where a system call would be one such point - are there others?)
Yes, I had similar 'deferred signal delivery' thoughts after having written up the
sticky signals approach, I just couldn't map all details of the semantics: see the
'internal libc functions' problem below.
If we can do this approach then there's another advantage as well: this way the C
library does not even have to poll for cancellation at syscall boundaries: i.e.
the regular system call fast path gets faster by 2-3 instructions as well.
> So then you could make "pthread_setcanceltype()" just set that flag for the
> cancellation signal, and just know that the signal itself will always be
> deferred to such a synchronous point (ie system call entry).
>
> We already have the ability to catch things at system call entry (ptrace needs
> it, for example), so we could possibly make our signal delivery have a mode
> where a signal does *not* cause user space execution to be interrupted by a
> signal handler, but instead just sets a bit in the thread info state that then
> causes the next system call to take the signal.
Yes, so this would need a bit of work, to handle the problem mentioned by Rich
Felker: "internal" libc APIs (such as name server lookups) may consist of a series
of complex system calls - some of which might be blocking. It should still be
possible to execute such 'internal' system calls undisturbed, even if a 'deferred'
signal is sent.
One workable solution I think would be to prepare the internal functions for
eventual interruption by the cancellation signal. They have to be restartable
anyway, because the application can send other signals. As long as the
interruption is only transient it should be fine.
And note that this approach would also be pretty fast on the libc side: none of
the 'fast' cancellation APIs would have to do anything complex like per call
signal blocking/unblocking or other complex signal operations. They would just
activate a straightforward new SA_ flag and rely on its semantics.
Thanks,
Ingo
* Linus Torvalds <[email protected]> wrote:
> On Fri, Mar 11, 2016 at 11:39 AM, Linus Torvalds
> <[email protected]> wrote:
> >
> > "An implementation may also mark other functions not specified in the
> > standard as cancellation points"
>
> .. but that was from the Linux man-page. The open group has
>
> "An implementation shall not introduce cancellation points into any
> other functions specified in this volume of POSIX.1-2008"
>
> So yeah, it looks like there would need to be some way to filter things.
>
> Oh well.
Is this really a big problem? Signals are asynchronous anyway, so if a C library
uses signal delivery for cancellation, it has to be ready to get the signal
delivered at the 'wrong' moment, for the wrong system call. The system call has to
be restarted in that case - or the interruption result has to be returned.
The _cancellation_ itself will then still be executed during the next suitable
cancellation point: which will be before doing the next cancellable system call
(or libc API).
So I think it can still all be made work with SA_SYNCHRONOUS.
It would only be a show stopper if Linux didn't cover all required system calls.
Covering _more_ system calls is not a problem AFAICS. But I might be missing
something ...
Thanks,
Ingo
(Argh: the Mail-Followup-To spam your mailer sets up is nasty!)
* Szabolcs Nagy <[email protected]> wrote:
> > 4. A calls cancellation point and syscall correctly executes
> > 5. Once A enables cancellation again, the cancellation propagates.
> >
> > So I still see no problem.
>
> I think the sticky signal design would work, but it is more
> complex than what we have, adds some atomic RMW ops into
> common code paths, and is not backward compatible.
Agreed about complexity, but note that the RMW ops shouldn't really be expensive
here, as this should be a well-cached flag. Especially compared to:
> Not using vsyscalls for cancellation points sounds easier.
... FYI not using vsyscalls has _far_ higher cost than using well-cached RMW ops.
So ... what do you think about Linus's SA_SYNCHRONOUS approach? I think it can be
made to work without much fuss.
There will still be different code paths on old and new kernels, but that's
unavoidable.
Thanks,
Ingo
On Sat, Mar 12, 2016 at 06:00:40PM +0100, Ingo Molnar wrote:
>
> * Linus Torvalds <[email protected]> wrote:
>
> > [...]
> >
> > Because if that's the case, I wonder if what you really want is not "sticky
> > signals" as much as "synchronous signals" - ie the ability to say that a signal
> > shouldn't ever interrupt in random places, but only at well-defined points
> > (where a system call would be one such point - are there others?)
>
> Yes, I had similar 'deferred signal delivery' thoughts after having written up the
> sticky signals approach, I just couldn't map all details of the semantics: see the
> 'internal libc functions' problem below.
>
> If we can do this approach then there's another advantage as well: this way the C
> library does not even have to poll for cancellation at syscall boundaries: i.e.
> the regular system call fast path gets faster by 2-3 instructions as well.
That is not a measurable benefit. You're talking about 2-3 cycles out
of 10k or more cycles (these are heavy blocking syscalls, not light
things like SYS_time or SYS_getpid).
> > So then you could make "pthread_setcanceltype()" just set that flag for the
> > cancellation signal, and just know that the signal itself will always be
> > deferred to such a synchronous point (ie system call entry).
> >
> > We already have the ability to catch things at system call entry (ptrace needs
> > it, for example), so we could possibly make our signal delivery have a mode
> > where a signal does *not* cause user space execution to be interrupted by a
> > signal handler, but instead just sets a bit in the thread info state that then
> > causes the next system call to take the signal.
>
> Yes, so this would need a bit of work, to handle the problem mentioned by Rich
> Felker: "internal" libc APIs (such as name server lookups) may consist of a series
> of complex system calls - some of which might be blocking. It should still be
> possible to execute such 'internal' system calls undisturbed, even if a 'deferred'
> signal is sent.
That's equivalent to setcancelstate(disabled), and actually the
mechanism we use for most "complex" functions since it's a lot simpler
and more maintainable to build these complex functions on top of public
APIs than direct inline syscalls or internal APIs that may change. In
musl, direct non-cancellable syscall variants are mainly used in
places where either it's just a single simple syscall (like close) or
where calling the public API is already impossible for namespace
reasons (e.g. inside stdio, which can't use POSIX namespace because
it's implementing ISO C not POSIX).
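[Editor's note: the mechanism Rich describes is the standard POSIX
pattern; a sketch:]

#include <pthread.h>

void complex_internal_operation(void)
{
	int old;

	/* make the whole sequence of syscalls non-cancellable */
	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old);

	/* ... a series of calls to public APIs that are cancellation
	   points, now guaranteed not to act on cancellation ... */

	/* restore; a pending cancel acts at the next cancellation point */
	pthread_setcancelstate(old, 0);
}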
> One workable solution I think would be to prepare the internal functions for
> eventual interruption by the cancellation signal. They have to be restartable
> anyway, because the application can send other signals. As long as the
> interruption is only transient it should be fine.
No, that does not work. EINTR from a non-restarting signal is a
specified, reportable error (despite being rather useless in practice
due to race conditions; of course you can solve those with repeated
signals and exponential backoff). We cannot just loop and retry on
spurious EINTR except in a few cases where EINTR is optional or not
used (like sem_wait).
> And note that this approach would also be pretty fast on the libc side: none of
> the 'fast' cancellation APIs would have to do anything complex like per call
> signal blocking/unblocking or other complex signal operations. They would just
> activate a straightforward new SA_ flag and rely on its semantics.
It's already fast, aside from not being able to use sysenter/syscall
instructions. I'm really frustrated that, again and again, we have
kernel folks with no experience with libc implementation trying to
redesign something that already has a simple zero-cost design that
works on all existing systems, and proposing things that have a mix of
immediately-obvious flaws and potential future problems we haven't
even thought of yet.
Even if your designs were ideal, we would end up with libc
implementing two good designs and switching them at runtime based on
kernel version, instead of just one good design. As it stands, every
alternative proposed so far is _more_ complex on the libc side, _more_
complex on the kernel side, _and_ on top of that, requires having two
implementations.
Rich
On Sat, Mar 12, 2016 at 06:05:09PM +0100, Ingo Molnar wrote:
>
> * Linus Torvalds <[email protected]> wrote:
>
> > On Fri, Mar 11, 2016 at 11:39 AM, Linus Torvalds
> > <[email protected]> wrote:
> > >
> > > "An implementation may also mark other functions not specified in the
> > > standard as cancellation points"
> >
> > .. but that was from the Linux man-page. The open group has
> >
> > "An implementation shall not introduce cancellation points into any
> > other functions specified in this volume of POSIX.1-2008"
> >
> > So yeah, it looks like there would need to be some way to filter things.
> >
> > Oh well.
>
> Is this really a big problem? Signals are asynchronous anyway, so if a C library
> uses signal delivery for cancellation, it has to be ready to get the signal
> delivered at the 'wrong' moment, for the wrong system call. The system call has to
> be restarted in that case - or the interruption result has to be returned.
The signals used for cancellation are not interrupting; the handler is
installed with SA_RESTART. If cancellation is disabled when the
handler is invoked, it does nothing at all. Otherwise, it first
modifies the saved signal mask to leave itself blocked after it returns
(the reason why involves complex nested-signal corner cases you
probably don't want to know about). Then, if the signal handler
determines the interrupted context is at a cancellation point, it
rewrites the saved program counter to act on cancellation rather than
restarting the syscall. If not, it does nothing else.
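[Editor's note: the shape of the handler Rich describes, heavily
simplified for 32-bit x86. __cp_begin/__cp_end/__cp_cancel mirror the
names musl uses to mark its cancellable-syscall region, but this is a
sketch, not musl's actual handler; the cancel-disabled check Rich
mentions is reduced to a comment.]

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <ucontext.h>

extern const char __cp_begin[], __cp_end[], __cp_cancel[];

static void cancel_handler(int sig, siginfo_t *si, void *ctx)
{
	ucontext_t *uc = ctx;
	uintptr_t ip = uc->uc_mcontext.gregs[REG_EIP];

	/* (the real handler returns immediately here if cancellation
	   is currently disabled) */

	/* leave the signal blocked once the handler returns */
	sigaddset(&uc->uc_sigmask, sig);

	if (ip >= (uintptr_t)__cp_begin && ip < (uintptr_t)__cp_end)
		/* interrupted before the syscall instruction executed:
		   redirect the return to the cancellation path instead
		   of letting SA_RESTART restart the syscall */
		uc->uc_mcontext.gregs[REG_EIP] = (uintptr_t)__cp_cancel;
}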
> The _cancellation_ itself will then still be executed during the next suitable
> cancellation point: which will be before doing the next cancellable system call
> (or libc API).
>
> So I think it can still all be made work with SA_SYNCHRONOUS.
>
> It would only be a show stopper if Linux didn't cover all required system calls.
> Covering _more_ system calls is not a problem AFAICS. But I might be missing
> something ...
You're missing a lot.
Rich
On Wed, Mar 9, 2016 at 1:19 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Mar 9, 2016 at 9:58 AM, Andy Lutomirski <[email protected]> wrote:
>> On Tue, Mar 8, 2016 at 5:24 PM, Andy Lutomirski <[email protected]> wrote:
>>> musl implements system call cancellation in an unusual but clever way.
>>> When a thread issues a cancellable syscall, musl issues the syscall
>>> through a special thunk that looks roughly like this:
>>>
>>
>> FWIW, this patch fails disastrously on 64-bit kernels. I fixed it,
>> but it needs kbuild changes. I'll send those out to the maintainers.
>
> This version should be okay:
>
> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=fed6d35d3941bc53896ab80b5c8d68d54cc00347
>
> You'll need the parent, too, if you want to test. I'm going to give
> the 0day bot a good long chew, since the parent change is a little bit
> scary.
Nope, that version was also not okay. But the version currently in
that branch has survived the kbuild bot for a while now.
Yikes, our build process for usermode code sucks.
--Andy
* Rich Felker <[email protected]> wrote:
> On Sat, Mar 12, 2016 at 06:00:40PM +0100, Ingo Molnar wrote:
> >
> > * Linus Torvalds <[email protected]> wrote:
> >
> > > [...]
> > >
> > > Because if that's the case, I wonder if what you really want is not "sticky
> > > signals" as much as "synchronous signals" - ie the ability to say that a signal
> > > shouldn't ever interrupt in random places, but only at well-defined points
> > > (where a system call would be one such point - are there others?)
> >
> > Yes, I had similar 'deferred signal delivery' thoughts after having written up the
> > sticky signals approach, I just couldn't map all details of the semantics: see the
> > 'internal libc functions' problem below.
> >
> > If we can do this approach then there's another advantage as well: this way the C
> > library does not even have to poll for cancellation at syscall boundaries: i.e.
> > the regular system call fast path gets faster by 2-3 instructions as well.
>
> That is not a measurable benefit. You're talking about 2-3 cycles out of 10k or
> more cycles (these are heavy blocking syscalls, not light things like SYS_time or
> SYS_getpid).
Huh? The list of 'must be' cancellable system calls includes key system calls
like:
open()
close()
read() variants
write() variants
poll()
select()
which can be and often are very lightweight. The list of 'may be cancellable'
system calls includes even more lightweight system calls.
I think you are confusing 'might block' with 'will block'. Most IO operations on a
modern kernel with modern hardware will not block!
You are scaring me ... :-(
Thanks,
Ingo
On Sat, Mar 12, 2016 at 07:48:36PM +0100, Ingo Molnar wrote:
>
> * Rich Felker <[email protected]> wrote:
>
> > On Sat, Mar 12, 2016 at 06:00:40PM +0100, Ingo Molnar wrote:
> > >
> > > * Linus Torvalds <[email protected]> wrote:
> > >
> > > > [...]
> > > >
> > > > Because if that's the case, I wonder if what you really want is not "sticky
> > > > signals" as much as "synchronous signals" - ie the ability to say that a signal
> > > > shouldn't ever interrupt in random places, but only at well-defined points
> > > > (where a system call would be one such point - are there others?)
> > >
> > > Yes, I had similar 'deferred signal delivery' thoughts after having written up the
> > > sticky signals approach, I just couldn't map all details of the semantics: see the
> > > 'internal libc functions' problem below.
> > >
> > > If we can do this approach then there's another advantage as well: this way the C
> > > library does not even have to poll for cancellation at syscall boundaries: i.e.
> > > the regular system call fast path gets faster by 2-3 instructions as well.
> >
> > That is not a measurable benefit. You're talking about 2-3 cycles out of 10k or
> > more cycles (these are heavy blocking syscalls, not light things like SYS_time or
> > SYS_getpid).
>
> Huh? The list of 'must be' cancellable system calls includes key system calls
> like:
>
> open()
> close()
> read() variants
> write() variants
> poll()
> select()
>
> which can be and often are very lightweight. The list of 'may be cancellable'
> system calls includes even more lightweight system calls.
>
> I think you are confusing 'might block' with 'will block'. Most IO operations on a
> modern kernel with modern hardware will not block!
No, I just mean syscalls that may block are generally heavy
operations. There may be a few exceptions (especially close in the
case where it's not the last fd for an open file), but I think you'd be
hard-pressed to find a case where 2-3 cycles is even 0.2% of the
syscall time. But my point was not to get derailed on an argument
about the exact performance (non-)benefits of "saving 2-3 cycles",
just to say this is not an interesting argument for one approach vs
another and that it's a distraction from other much-more-important
issues.
> You are scaring me ... :-(
I'm not sure how to interpret this, but if you really feel what I'm
writing is scary/hostile I'll try to convey my ideas differently.
Rich