A regression caused by 854a6ed56839 ("signal: Add restore_user_sigmask()")
caused users of epoll_pwait, io_pgetevents, and ppoll to notice a
latent problem in signal handling during these syscalls.
That patch (854a6ed56839) moved the signal_pending() check closer
to restoring of the user sigmask. But, it failed to update the error
code accordingly. From the userspace perspective, the patch increased
the time window for the signal discovery and subsequent delivery to the
userspace, but did not always adjust the errno afterwards. The behavior
before 854a6ed56839a was that the signals were dropped after the error
code was decided. This resulted in lost signals but the userspace did not
notice it as the syscalls had finished executing the core functionality
and the error codes returned notified success.
For all the syscalls that receive a sigmask from the userland,
the user sigmask is to be in effect through the syscall execution.
At the end of syscall, sigmask of the current process is restored
to what it was before the switch over to user sigmask.
But, for this to be true in practice, the sigmask should be restored
only at the the point we change the saved_sigmask. Anything before
that loses signals. And, anything after is just pointless as the
signal is already lost by restoring the sigmask.
Detailed issue discussion permalink:
https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/
Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
etc) only when there is no other error. If there is a signal and an error
like EINVAL, the syscalls return -EINVAL rather than the interrupted
error codes.
Reported-by: Eric Wong <[email protected]>
Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
Signed-off-by: Deepa Dinamani <[email protected]>
Cc: <[email protected]> # 5.0.x
Cc: <[email protected]> # 5.1.x
---
Changes since v1:
* updated the commit text for more context of the pre-existing condition
* added stable tags as requested
fs/aio.c | 24 ++++++++++++------------
fs/eventpoll.c | 14 ++++++++++----
fs/io_uring.c | 7 +++++--
fs/select.c | 37 +++++++++++++++++++++----------------
include/linux/signal.h | 2 +-
kernel/signal.c | 13 ++++++++++---
6 files changed, 59 insertions(+), 38 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1fa0e16..ebd2b1980161 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2095,7 +2095,7 @@ SYSCALL_DEFINE6(io_pgetevents,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
- int ret;
+ int ret, signal_detected;
if (timeout && unlikely(get_timespec64(&ts, timeout)))
return -EFAULT;
@@ -2108,8 +2108,8 @@ SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+ signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_detected && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2128,7 +2128,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
- int ret;
+ int ret, signal_detected;
if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
return -EFAULT;
@@ -2142,8 +2142,8 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+ signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_detected && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
- int ret;
+ int ret, signal_detected;
if (timeout && get_old_timespec32(&t, timeout))
return -EFAULT;
@@ -2206,8 +2206,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+ signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_detected && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2226,7 +2226,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
- int ret;
+ int ret, signal_detected;
if (timeout && get_timespec64(&t, timeout))
return -EFAULT;
@@ -2239,8 +2239,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+ signal_detected = restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_detected && !ret)
ret = -ERESTARTNOHAND;
return ret;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d87fcc..fe5a0724b417 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- int error;
+ int error, signal_detected;
sigset_t ksigmask, sigsaved;
/*
@@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
error = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
+
+ if (signal_detected && !error)
+ error = -EINTR;
return error;
}
@@ -2342,7 +2345,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
{
- long err;
+ long err, signal_detected;
sigset_t ksigmask, sigsaved;
/*
@@ -2355,7 +2358,10 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
err = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
+
+ if (signal_detected && !err)
+ err = -EINTR;
return err;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e11d77181398..b785c8d7efc4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2205,8 +2205,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (ret == -ERESTARTSYS)
ret = -EINTR;
- if (sig)
- restore_user_sigmask(sig, &sigsaved);
+ if (sig) {
+ signal_detected = restore_user_sigmask(sig, &sigsaved);
+ if (signal_detected && !ret)
+ ret = -EINTR;
+ }
return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
}
diff --git a/fs/select.c b/fs/select.c
index 6cbc9ff56ba0..da9cfea35159 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -732,7 +732,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
switch (type) {
@@ -760,7 +760,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
ret = core_sys_select(n, inp, outp, exp, to);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
+ if (signal_detected && !ret)
+ ret = -EINTR;
return ret;
}
@@ -1089,7 +1091,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
if (get_timespec64(&ts, tsp))
@@ -1106,10 +1108,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
/* We can restart this syscall, usually */
- if (ret == -EINTR)
+ if (ret == -EINTR || (signal_detected && !ret))
ret = -ERESTARTNOHAND;
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
@@ -1125,7 +1127,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
if (get_old_timespec32(&ts, tsp))
@@ -1142,10 +1144,10 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
/* We can restart this syscall, usually */
- if (ret == -EINTR)
+ if (ret == -EINTR || (signal_detected && !ret))
ret = -ERESTARTNOHAND;
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
@@ -1324,7 +1326,7 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
switch (type) {
@@ -1352,7 +1354,10 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
ret = compat_core_sys_select(n, inp, outp, exp, to);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
+
+ if (signal_detected && !ret)
+ ret = -EINTR;
return ret;
}
@@ -1408,7 +1413,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
if (get_old_timespec32(&ts, tsp))
@@ -1425,10 +1430,10 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
/* We can restart this syscall, usually */
- if (ret == -EINTR)
+ if (ret == -EINTR || (signal_detected && !ret))
ret = -ERESTARTNOHAND;
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
@@ -1444,7 +1449,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
{
sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
- int ret;
+ int ret, signal_detected;
if (tsp) {
if (get_timespec64(&ts, tsp))
@@ -1461,10 +1466,10 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved);
/* We can restart this syscall, usually */
- if (ret == -EINTR)
+ if (ret == -EINTR || (signal_detected && !ret))
ret = -ERESTARTNOHAND;
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9702016734b1..1d36e8629edf 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -275,7 +275,7 @@ extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struc
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
+extern int restore_user_sigmask(const void __user *usigmask,
sigset_t *sigsaved);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c86b78a7597..7cc33d23ee4b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2916,15 +2916,21 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
* usigmask: sigmask passed in from userland.
* sigsaved: saved sigmask when the syscall started and changed the sigmask to
* usigmask.
+ * returns 1 in case a pending signal is detected.
+ *
+ * Users of the api need to adjust their return values based on whether the
+ * signal was detected here. If a signal is detected, it is delivered to the
+ * userspace. So without an error like -ETINR, userspace might fail to
+ * adjust the flow of execution.
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed in from userland for the syscalls.
*/
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+int restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
{
if (!usigmask)
- return;
+ return 0;
/*
* When signals are pending, do not restore them here.
* Restoring sigmask here can lead to delivering signals that the above
@@ -2933,7 +2939,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
if (signal_pending(current)) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
- return;
+ return 1;
}
/*
@@ -2941,6 +2947,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
* saved_sigmask when signals are not pending.
*/
set_current_blocked(sigsaved);
+ return 0;
}
EXPORT_SYMBOL(restore_user_sigmask);
--
2.17.1
On 05/21, Deepa Dinamani wrote:
>
> Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
> etc) only when there is no other error. If there is a signal and an error
> like EINVAL, the syscalls return -EINVAL rather than the interrupted
> error codes.
Ugh. I need to re-check, but at first glance I really dislike this change.
I think we can fix the problem _and_ simplify the code. Something like below.
The patch is obviously incomplete, it changes only only one caller of
set_user_sigmask(), epoll_pwait() to explain what I mean.
restore_user_sigmask() should simply die. Although perhaps another helper
makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending).
Oleg.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d..85f56e4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2318,19 +2318,19 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ if (error != -EINTR)
+ restore_saved_sigmask();
return error;
}
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e412c09..1e82ae0 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
static inline void set_restore_sigmask(void)
{
set_thread_flag(TIF_RESTORE_SIGMASK);
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
@@ -447,7 +446,6 @@ static inline bool test_and_clear_restore_sigmask(void)
static inline void set_restore_sigmask(void)
{
current->restore_sigmask = true;
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
{
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9702016..887cea6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,8 +273,7 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize);
+extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
extern void restore_user_sigmask(const void __user *usigmask,
sigset_t *sigsaved);
extern void set_current_blocked(sigset_t *);
diff --git a/kernel/signal.c b/kernel/signal.c
index 227ba17..76f4f9a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2801,19 +2801,21 @@ EXPORT_SYMBOL(sigprocmask);
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
+ sigset_t *kmask;
+
+ if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(kmask);
return 0;
}
@@ -2840,39 +2842,6 @@ int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (signal_pending(current)) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
On Wed, May 22, 2019 at 9:14 AM Oleg Nesterov <[email protected]> wrote:
>
> On 05/22, Deepa Dinamani wrote:
> >
> > -Deepa
> >
> > > On May 22, 2019, at 8:05 AM, Oleg Nesterov <[email protected]> wrote:
> > >
> > >> On 05/21, Deepa Dinamani wrote:
> > >>
> > >> Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
> > >> etc) only when there is no other error. If there is a signal and an error
> > >> like EINVAL, the syscalls return -EINVAL rather than the interrupted
> > >> error codes.
> > >
> > > Ugh. I need to re-check, but at first glance I really dislike this change.
> > >
> > > I think we can fix the problem _and_ simplify the code. Something like below.
> > > The patch is obviously incomplete, it changes only only one caller of
> > > set_user_sigmask(), epoll_pwait() to explain what I mean.
> > > restore_user_sigmask() should simply die. Although perhaps another helper
> > > makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending).
> >
> > restore_user_sigmask() was added because of all the variants of these
> > syscalls we added because of y2038 as noted in commit message:
> >
> > signal: Add restore_user_sigmask()
> >
> > Refactor the logic to restore the sigmask before the syscall
> > returns into an api.
> > This is useful for versions of syscalls that pass in the
> > sigmask and expect the current->sigmask to be changed during
> > the execution and restored after the execution of the syscall.
> >
> > With the advent of new y2038 syscalls in the subsequent patches,
> > we add two more new versions of the syscalls (for pselect, ppoll
> > and io_pgetevents) in addition to the existing native and compat
> > versions. Adding such an api reduces the logic that would need to
> > be replicated otherwise.
>
> Again, I need to re-check, will continue tomorrow. But so far I am not sure
> this helper can actually help.
>
> > > --- a/fs/eventpoll.c
> > > +++ b/fs/eventpoll.c
> > > @@ -2318,19 +2318,19 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> > > size_t, sigsetsize)
> > > {
> > > int error;
> > > - sigset_t ksigmask, sigsaved;
> > >
> > > /*
> > > * If the caller wants a certain signal mask to be set during the wait,
> > > * we apply it here.
> > > */
> > > - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> > > + error = set_user_sigmask(sigmask, sigsetsize);
> > > if (error)
> > > return error;
> > >
> > > error = do_epoll_wait(epfd, events, maxevents, timeout);
> > >
> > > - restore_user_sigmask(sigmask, &sigsaved);
> > > + if (error != -EINTR)
> >
> > As you address all the other syscalls this condition becomes more and
> > more complicated.
>
> May be.
>
> > > --- a/include/linux/sched/signal.h
> > > +++ b/include/linux/sched/signal.h
> > > @@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
> > > static inline void set_restore_sigmask(void)
> > > {
> > > set_thread_flag(TIF_RESTORE_SIGMASK);
> > > - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> >
> > So you always want do_signal() to be called?
>
> Why do you think so? No. This is just to avoid the warning, because with the
> patch I sent set_restore_sigmask() is called "in advance".
>
> > You will have to check each architecture's implementation of
> > do_signal() to check if that has any side effects.
>
> I don't think so.
Why not?
> > Although this is not what the patch is solving.
>
> Sure. But you know, after I tried to read the changelog, I am not sure
> I understand what exactly you are trying to fix. Could you please explain
> this part
>
> The behavior
> before 854a6ed56839a was that the signals were dropped after the error
> code was decided. This resulted in lost signals but the userspace did not
> notice it
>
> ? I fail to understand it, sorry. It looks as if the code was already buggy before
> that commit and it could miss a signal or something like this, but I do not see how.
Did you read the explanation pointed to in the commit text? :
https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/
Let me know what part you don't understand and I can explain more.
It would be better to understand the isssue before we start discussing the fix.
-Deepa
-Deepa
> On May 22, 2019, at 8:05 AM, Oleg Nesterov <[email protected]> wrote:
>
>> On 05/21, Deepa Dinamani wrote:
>>
>> Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
>> etc) only when there is no other error. If there is a signal and an error
>> like EINVAL, the syscalls return -EINVAL rather than the interrupted
>> error codes.
>
> Ugh. I need to re-check, but at first glance I really dislike this change.
>
> I think we can fix the problem _and_ simplify the code. Something like below.
> The patch is obviously incomplete, it changes only only one caller of
> set_user_sigmask(), epoll_pwait() to explain what I mean.
> restore_user_sigmask() should simply die. Although perhaps another helper
> makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending).
restore_user_sigmask() was added because of all the variants of these
syscalls we added because of y2038 as noted in commit message:
signal: Add restore_user_sigmask()
Refactor the logic to restore the sigmask before the syscall
returns into an api.
This is useful for versions of syscalls that pass in the
sigmask and expect the current->sigmask to be changed during
the execution and restored after the execution of the syscall.
With the advent of new y2038 syscalls in the subsequent patches,
we add two more new versions of the syscalls (for pselect, ppoll
and io_pgetevents) in addition to the existing native and compat
versions. Adding such an api reduces the logic that would need to
be replicated otherwise.
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4a0e98d..85f56e4 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2318,19 +2318,19 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> size_t, sigsetsize)
> {
> int error;
> - sigset_t ksigmask, sigsaved;
>
> /*
> * If the caller wants a certain signal mask to be set during the wait,
> * we apply it here.
> */
> - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + error = set_user_sigmask(sigmask, sigsetsize);
> if (error)
> return error;
>
> error = do_epoll_wait(epfd, events, maxevents, timeout);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> + if (error != -EINTR)
As you address all the other syscalls this condition becomes more and
more complicated.
> + restore_saved_sigmask();
>
> return error;
> }
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index e412c09..1e82ae0 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
> static inline void set_restore_sigmask(void)
> {
> set_thread_flag(TIF_RESTORE_SIGMASK);
> - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
So you always want do_signal() to be called?
You will have to check each architecture's implementation of
do_signal() to check if that has any side effects.
Although this is not what the patch is solving. What we want is to
adjust return codes on all these syscalls to user and not drop
signals. Please check v2/v3 of the patch. I've updated the commit text
to provide more context into what is actually being fixed here.
If we really want to simplify, we should rewrite all the internal
logic of all the ppoll, epoll_pwait, io_pgetevent syscall internal
handling where we set the error code.
As new versions of syscalls were added, the internal logic got
reworked rather hapazardly. But, as the current issue points out,
these are delicate changes.
-Deepa
> }
>
> static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
> @@ -447,7 +446,6 @@ static inline bool test_and_clear_restore_sigmask(void)
> static inline void set_restore_sigmask(void)
> {
> current->restore_sigmask = true;
> - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> }
> static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
> {
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 9702016..887cea6 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -273,8 +273,7 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
> struct task_struct *p, enum pid_type type);
> extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> extern int sigprocmask(int, sigset_t *, sigset_t *);
> -extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
> - sigset_t *oldset, size_t sigsetsize);
> +extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
> extern void restore_user_sigmask(const void __user *usigmask,
> sigset_t *sigsaved);
> extern void set_current_blocked(sigset_t *);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 227ba17..76f4f9a 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2801,19 +2801,21 @@ EXPORT_SYMBOL(sigprocmask);
> * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
> * epoll_pwait where a new sigmask is passed from userland for the syscalls.
> */
> -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
> - sigset_t *oldset, size_t sigsetsize)
> +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
> {
> - if (!usigmask)
> + sigset_t *kmask;
> +
> + if (!umask)
> return 0;
>
> if (sigsetsize != sizeof(sigset_t))
> return -EINVAL;
> - if (copy_from_user(set, usigmask, sizeof(sigset_t)))
> + if (copy_from_user(kmask, umask, sizeof(sigset_t)))
> return -EFAULT;
>
> - *oldset = current->blocked;
> - set_current_blocked(set);
> + set_restore_sigmask();
> + current->saved_sigmask = current->blocked;
> + set_current_blocked(kmask);
>
> return 0;
> }
> @@ -2840,39 +2842,6 @@ int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
> EXPORT_SYMBOL(set_compat_user_sigmask);
> #endif
>
> -/*
> - * restore_user_sigmask:
> - * usigmask: sigmask passed in from userland.
> - * sigsaved: saved sigmask when the syscall started and changed the sigmask to
> - * usigmask.
> - *
> - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
> - * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
> - */
> -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> -{
> -
> - if (!usigmask)
> - return;
> - /*
> - * When signals are pending, do not restore them here.
> - * Restoring sigmask here can lead to delivering signals that the above
> - * syscalls are intended to block because of the sigmask passed in.
> - */
> - if (signal_pending(current)) {
> - current->saved_sigmask = *sigsaved;
> - set_restore_sigmask();
> - return;
> - }
> -
> - /*
> - * This is needed because the fast syscall return path does not restore
> - * saved_sigmask when signals are not pending.
> - */
> - set_current_blocked(sigsaved);
> -}
> -EXPORT_SYMBOL(restore_user_sigmask);
> -
> /**
> * sys_rt_sigprocmask - change the list of currently blocked signals
> * @how: whether to add, remove, or set signals
>
On 05/22, Deepa Dinamani wrote:
>
> -Deepa
>
> > On May 22, 2019, at 8:05 AM, Oleg Nesterov <[email protected]> wrote:
> >
> >> On 05/21, Deepa Dinamani wrote:
> >>
> >> Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
> >> etc) only when there is no other error. If there is a signal and an error
> >> like EINVAL, the syscalls return -EINVAL rather than the interrupted
> >> error codes.
> >
> > Ugh. I need to re-check, but at first glance I really dislike this change.
> >
> > I think we can fix the problem _and_ simplify the code. Something like below.
> > The patch is obviously incomplete, it changes only only one caller of
> > set_user_sigmask(), epoll_pwait() to explain what I mean.
> > restore_user_sigmask() should simply die. Although perhaps another helper
> > makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending).
>
> restore_user_sigmask() was added because of all the variants of these
> syscalls we added because of y2038 as noted in commit message:
>
> signal: Add restore_user_sigmask()
>
> Refactor the logic to restore the sigmask before the syscall
> returns into an api.
> This is useful for versions of syscalls that pass in the
> sigmask and expect the current->sigmask to be changed during
> the execution and restored after the execution of the syscall.
>
> With the advent of new y2038 syscalls in the subsequent patches,
> we add two more new versions of the syscalls (for pselect, ppoll
> and io_pgetevents) in addition to the existing native and compat
> versions. Adding such an api reduces the logic that would need to
> be replicated otherwise.
Again, I need to re-check, will continue tomorrow. But so far I am not sure
this helper can actually help.
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -2318,19 +2318,19 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> > size_t, sigsetsize)
> > {
> > int error;
> > - sigset_t ksigmask, sigsaved;
> >
> > /*
> > * If the caller wants a certain signal mask to be set during the wait,
> > * we apply it here.
> > */
> > - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> > + error = set_user_sigmask(sigmask, sigsetsize);
> > if (error)
> > return error;
> >
> > error = do_epoll_wait(epfd, events, maxevents, timeout);
> >
> > - restore_user_sigmask(sigmask, &sigsaved);
> > + if (error != -EINTR)
>
> As you address all the other syscalls this condition becomes more and
> more complicated.
May be.
> > --- a/include/linux/sched/signal.h
> > +++ b/include/linux/sched/signal.h
> > @@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
> > static inline void set_restore_sigmask(void)
> > {
> > set_thread_flag(TIF_RESTORE_SIGMASK);
> > - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
>
> So you always want do_signal() to be called?
Why do you think so? No. This is just to avoid the warning, because with the
patch I sent set_restore_sigmask() is called "in advance".
> You will have to check each architecture's implementation of
> do_signal() to check if that has any side effects.
I don't think so.
> Although this is not what the patch is solving.
Sure. But you know, after I tried to read the changelog, I am not sure
I understand what exactly you are trying to fix. Could you please explain
this part
The behavior
before 854a6ed56839a was that the signals were dropped after the error
code was decided. This resulted in lost signals but the userspace did not
notice it
? I fail to understand it, sorry. It looks as if the code was already buggy before
that commit and it could miss a signal or something like this, but I do not see how.
Oleg.
+Cc: linux-mm, since this broke mmots tree and has been applied there
This patch is missing a definition for signal_detected in io_cqring_wait, which
breaks the build.
diff --git fs/io_uring.c fs/io_uring.c
index b785c8d7efc4..b34311675d2d 100644
--- fs/io_uring.c
+++ fs/io_uring.c
@@ -2182,7 +2182,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_cq_ring *ring = ctx->cq_ring;
sigset_t ksigmask, sigsaved;
- int ret;
+ int ret, signal_detected;
if (io_cqring_events(ring) >= min_events)
return 0;
On Wed, May 22, 2019 at 3:18 PM Chris Down <[email protected]> wrote:
>
> +Cc: linux-mm, since this broke mmots tree and has been applied there
>
> This patch is missing a definition for signal_detected in io_cqring_wait, which
> breaks the build.
This patch does not break the build.
The patch the breaks the build was the v2 of this patch since there
was an accidental deletion.
That's what the v3 fixed. I think v3 got picked up today morning into
the mm tree
-Deepa
From: Deepa Dinamani
> Sent: 22 May 2019 17:34
> On Wed, May 22, 2019 at 9:14 AM Oleg Nesterov <[email protected]> wrote:
> >
> > On 05/22, Deepa Dinamani wrote:
> > >
> > > -Deepa
> > >
> > > > On May 22, 2019, at 8:05 AM, Oleg Nesterov <[email protected]> wrote:
> > > >
> > > >> On 05/21, Deepa Dinamani wrote:
> > > >>
> > > >> Note that this patch returns interrupted errors (EINTR, ERESTARTNOHAND,
> > > >> etc) only when there is no other error. If there is a signal and an error
> > > >> like EINVAL, the syscalls return -EINVAL rather than the interrupted
> > > >> error codes.
> > > >
> > > > Ugh. I need to re-check, but at first glance I really dislike this change.
> > > >
> > > > I think we can fix the problem _and_ simplify the code. Something like below.
> > > > The patch is obviously incomplete, it changes only only one caller of
> > > > set_user_sigmask(), epoll_pwait() to explain what I mean.
> > > > restore_user_sigmask() should simply die. Although perhaps another helper
> > > > makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending).
> > >
> > > restore_user_sigmask() was added because of all the variants of these
> > > syscalls we added because of y2038 as noted in commit message:
> > >
> > > signal: Add restore_user_sigmask()
> > >
> > > Refactor the logic to restore the sigmask before the syscall
> > > returns into an api.
> > > This is useful for versions of syscalls that pass in the
> > > sigmask and expect the current->sigmask to be changed during
> > > the execution and restored after the execution of the syscall.
> > >
> > > With the advent of new y2038 syscalls in the subsequent patches,
> > > we add two more new versions of the syscalls (for pselect, ppoll
> > > and io_pgetevents) in addition to the existing native and compat
> > > versions. Adding such an api reduces the logic that would need to
> > > be replicated otherwise.
> >
> > Again, I need to re-check, will continue tomorrow. But so far I am not sure
> > this helper can actually help.
> >
> > > > --- a/fs/eventpoll.c
> > > > +++ b/fs/eventpoll.c
> > > > @@ -2318,19 +2318,19 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *,
> events,
> > > > size_t, sigsetsize)
> > > > {
> > > > int error;
> > > > - sigset_t ksigmask, sigsaved;
> > > >
> > > > /*
> > > > * If the caller wants a certain signal mask to be set during the wait,
> > > > * we apply it here.
> > > > */
> > > > - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> > > > + error = set_user_sigmask(sigmask, sigsetsize);
> > > > if (error)
> > > > return error;
> > > >
> > > > error = do_epoll_wait(epfd, events, maxevents, timeout);
> > > >
> > > > - restore_user_sigmask(sigmask, &sigsaved);
> > > > + if (error != -EINTR)
> > >
> > > As you address all the other syscalls this condition becomes more and
> > > more complicated.
> >
> > May be.
> >
> > > > --- a/include/linux/sched/signal.h
> > > > +++ b/include/linux/sched/signal.h
> > > > @@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
> > > > static inline void set_restore_sigmask(void)
> > > > {
> > > > set_thread_flag(TIF_RESTORE_SIGMASK);
> > > > - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> > >
> > > So you always want do_signal() to be called?
> >
> > Why do you think so? No. This is just to avoid the warning, because with the
> > patch I sent set_restore_sigmask() is called "in advance".
> >
> > > You will have to check each architecture's implementation of
> > > do_signal() to check if that has any side effects.
> >
> > I don't think so.
>
> Why not?
>
> > > Although this is not what the patch is solving.
> >
> > Sure. But you know, after I tried to read the changelog, I am not sure
> > I understand what exactly you are trying to fix. Could you please explain
> > this part
> >
> > The behavior
> > before 854a6ed56839a was that the signals were dropped after the error
> > code was decided. This resulted in lost signals but the userspace did not
> > notice it
> >
> > ? I fail to understand it, sorry. It looks as if the code was already buggy before
> > that commit and it could miss a signal or something like this, but I do not see how.
>
> Did you read the explanation pointed to in the commit text? :
>
> https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/
>
> Let me know what part you don't understand and I can explain more.
>
> It would be better to understand the isssue before we start discussing the fix.
I'm confused...
I thought:
EINTR should only be returned if a blocking sleep (eg in do_epoll_wait() itself)
was interrupted by a signal that was enabled at the time of the sleep.
The handlers for all unblocked signals should be run on return to user.
This is after the mask has been restored and regardless of the error code.
So epoll() can return 'success' or 'timeout' (etc) and the handler for SIG_URG
should still be called.
This is exactly equivalent to the interrupt that generates the signal happening
just after the 'return to user' of the system call.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On 05/22, Deepa Dinamani wrote:
>
> > > > --- a/include/linux/sched/signal.h
> > > > +++ b/include/linux/sched/signal.h
> > > > @@ -416,7 +416,6 @@ void task_join_group_stop(struct task_struct *task);
> > > > static inline void set_restore_sigmask(void)
> > > > {
> > > > set_thread_flag(TIF_RESTORE_SIGMASK);
> > > > - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> > >
> > > So you always want do_signal() to be called?
> >
> > Why do you think so? No. This is just to avoid the warning, because with the
> > patch I sent set_restore_sigmask() is called "in advance".
> >
> > > You will have to check each architecture's implementation of
> > > do_signal() to check if that has any side effects.
> >
> > I don't think so.
>
> Why not?
Why yes?
it seems that we have some communication problems. OK, please look at the code
I proposed, I only added a couple of TODO comments
static inline void set_restore_sigmask(void)
{
// WARN_ON(!TIF_SIGPENDING) was removed by this patch
current->restore_sigmask = true;
}
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
sigset_t *kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(kmask);
return 0;
}
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
int error;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
// TODO. Add another helper to restore WARN_ON(!TIF_SIGPENDING)
// in case restore_saved_sigmask() is NOT called.
if (error != -EINTR)
restore_saved_sigmask();
return error;
}
Note that it looks much simpler. Now, could you please explain
- why do you think this code is not correct ?
- why do you think we need to audit do_signal() ???
> > > Although this is not what the patch is solving.
> >
> > Sure. But you know, after I tried to read the changelog, I am not sure
> > I understand what exactly you are trying to fix. Could you please explain
> > this part
> >
> > The behavior
> > before 854a6ed56839a was that the signals were dropped after the error
> > code was decided. This resulted in lost signals but the userspace did not
> > notice it
> >
> > ? I fail to understand it, sorry. It looks as if the code was already buggy before
> > that commit and it could miss a signal or something like this, but I do not see how.
>
> Did you read the explanation pointed to in the commit text? :
>
> https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/
this link points to the lengthy and confusing discussion... after a quick glance
I didn't find an answer to my question, so let me repeat it again: why do you think
the kernel was buggy even before 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal:
Add restore_user_sigmask()") ?
Just in case...
https://lore.kernel.org/linux-fsdevel/CABeXuvq7gCV2qPOo+Q8jvNyRaTvhkRLRbnL_oJ-AuK7Sp=P3QQ@mail.gmail.com/
doesn't look right to me... let me quite some parts of your email:
- /*
- * If we changed the signal mask, we need to restore the original one.
- * In case we've got a signal while waiting, we do not restore the
- * signal mask yet, and we allow do_signal() to deliver the signal on
- * the way back to userspace, before the signal mask is restored.
- */
- if (sigmask) {
- if (error == -EINTR) {
- memcpy(¤t->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- } else
**** Execution reaches this else statement and the sigmask is restored
directly, ignoring the newly generated signal.
I see nothing wrong. This is what we want.
The signal is never
handled.
Well, "never" is not right. It won't be handled now, because it is blocked, but
for example think of another pselect/whatever call with the same sigmask.
> It would be better to understand the isssue before we start discussing the fix.
Agreed. And that is why I am asking for your explanations, quite possibly I missed
something, but so far I fail to understand you.
Oleg.
On 05/23, David Laight wrote:
>
> I'm confused...
Me too. To clarify, the current code is obviously buggy, pselect/whatever
shouldn't return 0 (or anything else) if it was interrupted and we are going
to deliver the signal.
But it seems that Deepa has other concerns which I do not understand at all.
In any case, the signal_pending() check _inside_ restore_user_sigmask() can't
be right, with or without this patch. If nothing else, a signal can come right
after the check.
> So epoll() can return 'success' or 'timeout' (etc) and the handler for SIG_URG
> should still be called.
Not sure I understand... OK, suppose that you do
block-all-signals;
ret = pselect(..., sigmask(SIG_URG));
if it returns success/timeout then the handler for SIG_URG should not be called?
or I am totally confused...
Oleg.
From: Oleg Nesterov
> On 05/23, David Laight wrote:
> >
> > I'm confused...
>
> Me too. To clarify, the current code is obviously buggy, pselect/whatever
> shouldn't return 0 (or anything else) if it was interrupted and we are going
> to deliver the signal.
If it was interrupted the return value has to be EINTR.
Whether any signal handlers are called is a separate matter.
> But it seems that Deepa has other concerns which I do not understand at all.
>
> In any case, the signal_pending() check _inside_ restore_user_sigmask() can't
> be right, with or without this patch. If nothing else, a signal can come right
> after the check.
>
> > So epoll() can return 'success' or 'timeout' (etc) and the handler for SIG_URG
> > should still be called.
>
> Not sure I understand... OK, suppose that you do
>
> block-all-signals;
> ret = pselect(..., sigmask(SIG_URG));
>
> if it returns success/timeout then the handler for SIG_URG should not be called?
Ugg...
Posix probably allows the signal handler be called at the point the event
happens rather than being deferred until the system call completes.
Queueing up the signal handler to be run at a later time (syscall exit)
certainly makes sense.
Definitely safest to call the signal handler even if success/timeout
is returned.
pselect() exists to stop the entry race, not the exit one.
> or I am totally confused...
The pselect(2) man page says that the signal handler for a signal that is
enabled for the duration should run.
Clearly it is also valid to call the signal handlers for any signals that
are allowed on entry/exit (they could happen just after the return).
Also remember that pselect() can also be used to disable signals.
So ISTM that signal handlers allowed by either signal mask
should be called during syscall exit.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On 05/23, David Laight wrote:
>
> From: Oleg Nesterov
> > On 05/23, David Laight wrote:
> > >
> > > I'm confused...
> >
> > Me too. To clarify, the current code is obviously buggy, pselect/whatever
> > shouldn't return 0 (or anything else) if it was interrupted and we are going
> > to deliver the signal.
>
> If it was interrupted the return value has to be EINTR.
Yes, and this is what we need to fix.
> Whether any signal handlers are called is a separate matter.
Not really... because in this case we know that the signal will be delivered,
> > Not sure I understand... OK, suppose that you do
> >
> > block-all-signals;
> > ret = pselect(..., sigmask(SIG_URG));
> >
> > if it returns success/timeout then the handler for SIG_URG should not be called?
>
> Ugg...
> Posix probably allows the signal handler be called at the point the event
> happens rather than being deferred until the system call completes.
> Queueing up the signal handler to be run at a later time (syscall exit)
> certainly makes sense.
> Definitely safest to call the signal handler even if success/timeout
> is returned.
Why?
> pselect() exists to stop the entry race, not the exit one.
pselect() has to block SIG_URG again before it returns to user-mode, right?
Suppose pselect() finds a ready fd, and this races with SIG_URG.
Why do you think the handler should run?
What if SIG_URG comes right after pselect() blocks SIG_URG again? I mean,
how this differs the case when it comes before, but a ready fd was already
found?
Oleg.
From: Oleg Nesterov
> Sent: 23 May 2019 17:36
> On 05/23, David Laight wrote:
> >
> > From: Oleg Nesterov
> > > On 05/23, David Laight wrote:
...
> > > Not sure I understand... OK, suppose that you do
> > >
> > > block-all-signals;
> > > ret = pselect(..., sigmask(SIG_URG));
> > >
> > > if it returns success/timeout then the handler for SIG_URG should not be called?
> >
> > Ugg...
> > Posix probably allows the signal handler be called at the point the event
> > happens rather than being deferred until the system call completes.
> > Queueing up the signal handler to be run at a later time (syscall exit)
> > certainly makes sense.
> > Definitely safest to call the signal handler even if success/timeout
> > is returned.
>
> Why?
>
> > pselect() exists to stop the entry race, not the exit one.
>
> pselect() has to block SIG_URG again before it returns to user-mode, right?
Yep.
So the signal handler can't be called for a signal that happens after
pselect() returns.
> Suppose pselect() finds a ready fd, and this races with SIG_URG.
You mean if SIG_URG is raised after a ready fd is found (or even timeout)?
So the return value isn't EINTR.
(If an fd is readable on entry, the SIG_URG could have happened much earlier.)
> Why do you think the handler should run?
Think of the application code loop.
Consider what happens if the signal is SIG_INT - to request the program
stop.
After every pselect() call the application looks to see if the handler
has been called.
If one of the fds is always readable pselect() will never return EINTR
but you want the SIG_INT handler run so that the loop gets terminated.
If you only call the signal handler when EINTR is returned the process
will never stop.
So you need to call the handler even when pselect() succeeds/time out.
> What if SIG_URG comes right after pselect() blocks SIG_URG again? I mean,
> how this differs the case when it comes before, but a ready fd was already
> found?
I suspect you need to defer the re-instatement of the original mask
to the code that calls the signal handlers (which probably should
be called with the programs signal mask).
So that particular window doesn't exist.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Ok, since there has been quite a bit of argument here, I will
backtrack a little bit and maybe it will help us understand what's
happening here.
There are many scenarios being discussed on this thread:
a. State of code before 854a6ed56839a
b. State after 854a6ed56839a
c. Proposed fix as per the patchset in question.
Oleg, I will discuss these first and then we can discuss the
additional changes you suggested.
Some background on why we have these syscalls that take sigmask as an
argument. This is just for the sake of completeness of the argument.
These are particularly meant for a scenario(d) such as below:
1. block the signals you don't care about.
2. syscall()
3. unblock the signals blocked in 1.
The problem here is that if there is a signal that is not blocked by 1
and such a signal is delivered between 1 and 2(since they are not
atomic), the syscall in 2 might block forever as it never found out
about the signal.
As per [a] and let's consider the case of epoll_pwait only first for simplicity.
As I said before, ep_poll() is what checks for signal_pending() and is
responsible for setting errno to -EINTR when there is a signal.
So if a signal is received after ep_poll() and ep_poll() returns
success, it is never noticed by the syscall during execution.
So the question is does the userspace have to know about this signal
or not. From scenario [d] above, I would say it should, even if all
the fd's completed successfully.
This does not happen in [a]. So this is what I said was already broken.
What [b] does is to move the signal check closer to the restoration of
the signal. This way it is good. So, if there is a signal after
ep_poll() returns success, it is noticed and the signal is delivered
when the syscall exits. But, the syscall error status itself is 0.
So now [c] is adjusting the return values based on whether extra
signals were detected after ep_poll(). This part was needed even for
[a].
Let me know if this clarifies things a bit.
-Deepa
On Thu, May 23, 2019 at 11:06 AM Deepa Dinamani <[email protected]> wrote:
>
> Ok, since there has been quite a bit of argument here, I will
> backtrack a little bit and maybe it will help us understand what's
> happening here.
> There are many scenarios being discussed on this thread:
> a. State of code before 854a6ed56839a
> b. State after 854a6ed56839a
> c. Proposed fix as per the patchset in question.
>
> Oleg, I will discuss these first and then we can discuss the
> additional changes you suggested.
>
> Some background on why we have these syscalls that take sigmask as an
> argument. This is just for the sake of completeness of the argument.
>
> These are particularly meant for a scenario(d) such as below:
>
> 1. block the signals you don't care about.
> 2. syscall()
> 3. unblock the signals blocked in 1.
>
> The problem here is that if there is a signal that is not blocked by 1
> and such a signal is delivered between 1 and 2(since they are not
> atomic), the syscall in 2 might block forever as it never found out
> about the signal.
>
> As per [a] and let's consider the case of epoll_pwait only first for simplicity.
>
> As I said before, ep_poll() is what checks for signal_pending() and is
> responsible for setting errno to -EINTR when there is a signal.
>
> So if a signal is received after ep_poll() and ep_poll() returns
> success, it is never noticed by the syscall during execution.
> So the question is does the userspace have to know about this signal
> or not. From scenario [d] above, I would say it should, even if all
> the fd's completed successfully.
> This does not happen in [a]. So this is what I said was already broken.
>
> What [b] does is to move the signal check closer to the restoration of
> the signal. This way it is good. So, if there is a signal after
> ep_poll() returns success, it is noticed and the signal is delivered
> when the syscall exits. But, the syscall error status itself is 0.
>
> So now [c] is adjusting the return values based on whether extra
> signals were detected after ep_poll(). This part was needed even for
> [a].
>
> Let me know if this clarifies things a bit.
Just adding a little more clarification, there is an additional change
between [a] and [b].
As per [a] we would just restore the signal instead of changing the
saved_sigmask and the signal could get delivered right then. [b]
changes this to happen at syscall exit:
void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
{
<snip>
/*
* When signals are pending, do not restore them here.
* Restoring sigmask here can lead to delivering signals
that the above
* syscalls are intended to block because of the sigmask passed in.
*/
if (signal_pending(current)) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
}
-Deepa
> Just adding a little more clarification, there is an additional change
> between [a] and [b].
> As per [a] we would just restore the signal instead of changing the
> saved_sigmask and the signal could get delivered right then. [b]
> changes this to happen at syscall exit:
Rewording above, as there seems to be a few misrepresentations:
Just adding a little more clarification, there is an additional change
between [a] and [b].
As per [a] we would just restore the signal mask instead of changing
the saved_sigmask and the even the blocked signals could get delivered
right then. [b] changes the restoration to happen at syscall exit:
> void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> {
>
> <snip>
>
> /*
> * When signals are pending, do not restore them here.
> * Restoring sigmask here can lead to delivering signals
> that the above
> * syscalls are intended to block because of the sigmask passed in.
> */
> if (signal_pending(current)) {
> current->saved_sigmask = *sigsaved;
> set_restore_sigmask();
> return;
> }
-Deepa
From: Deepa Dinamani
> Sent: 23 May 2019 19:07
> Ok, since there has been quite a bit of argument here, I will
> backtrack a little bit and maybe it will help us understand what's
> happening here.
> There are many scenarios being discussed on this thread:
> a. State of code before 854a6ed56839a
> b. State after 854a6ed56839a
> c. Proposed fix as per the patchset in question.
>
> Oleg, I will discuss these first and then we can discuss the
> additional changes you suggested.
>
> Some background on why we have these syscalls that take sigmask as an
> argument. This is just for the sake of completeness of the argument.
>
> These are particularly meant for a scenario(d) such as below:
>
> 1. block the signals you don't care about.
> 2. syscall()
> 3. unblock the signals blocked in 1.
>
> The problem here is that if there is a signal that is not blocked by 1
> and such a signal is delivered between 1 and 2(since they are not
> atomic), the syscall in 2 might block forever as it never found out
> about the signal.
I think we all agree about the underlying problem these system calls solve.
> As per [a] and let's consider the case of epoll_pwait only first for simplicity.
For simplicity you ought to consider sigwaitinfo() :-)
> As I said before, ep_poll() is what checks for signal_pending() and is
> responsible for setting errno to -EINTR when there is a signal.
Ah, there in lies the problem (well one of them).
ep_poll() should only return -EINTR if its sleep (waiting for an fd to
be ready) is interrupted.
The signal handler(s) should still be called though.
If the timeout is 0 then any signal handler should be called, but the
return value is still 0 (if no fd are 'ready').
> So if a signal is received after ep_poll() and ep_poll() returns
> success, it is never noticed by the syscall during execution.
> So the question is does the userspace have to know about this signal
> or not. From scenario [d] above, I would say it should, even if all
> the fd's completed successfully.
What is scenario [d]? You've code versions a/b/c but no scenarios.
> This does not happen in [a]. So this is what I said was already broken.
>
> What [b] does is to move the signal check closer to the restoration of
> the signal. This way it is good. So, if there is a signal after
> ep_poll() returns success, it is noticed and the signal is delivered
> when the syscall exits. But, the syscall error status itself is 0.
By 0 you mean >= 0 ??
> So now [c] is adjusting the return values based on whether extra
> signals were detected after ep_poll(). This part was needed even for
> [a].
IMHO The return value should never be changed.
Much like write() can return a partial length if a signal happens.
ISTM that a user signal handler should be scheduled to be run
if the signal is pending and not masked.
On return to user all scheduled signal handlers are called
(regardless as to whether the signal is masked at that time).
This might mean getting the 'return to user' code to restore
the original signal mask saved for epoll_pwait() and pselect() etc.
If, for some perverted reason (compatibility with broken apps),
you need epoll_pwait() to return EINTR instead of 0 the you
probably need a special 'kernel internal' errno value that
is always converted by the syscall exit code to EINTR/0
(and a second one that is EINTR/EAGAIN etc).
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
It seems that we all are just trying to confuse each other. I got lost.
On 05/23, David Laight wrote:
>
> From: Oleg Nesterov
> > Sent: 23 May 2019 17:36
> > On 05/23, David Laight wrote:
> > >
> > > From: Oleg Nesterov
> > > > On 05/23, David Laight wrote:
> ...
> > > > Not sure I understand... OK, suppose that you do
> > > >
> > > > block-all-signals;
> > > > ret = pselect(..., sigmask(SIG_URG));
> > > >
> > > > if it returns success/timeout then the handler for SIG_URG should not be called?
> > >
> > > Ugg...
> > > Posix probably allows the signal handler be called at the point the event
> > > happens rather than being deferred until the system call completes.
> > > Queueing up the signal handler to be run at a later time (syscall exit)
> > > certainly makes sense.
> > > Definitely safest to call the signal handler even if success/timeout
> > > is returned.
> >
> > Why?
> >
> > > pselect() exists to stop the entry race, not the exit one.
> >
> > pselect() has to block SIG_URG again before it returns to user-mode, right?
>
> Yep.
> So the signal handler can't be called for a signal that happens after
> pselect() returns.
Yes. And "after pselect() returns" actually means "after pselect() restores
the old sigmask while it returns to user mode".
> > Suppose pselect() finds a ready fd, and this races with SIG_URG.
>
> You mean if SIG_URG is raised after a ready fd is found (or even timeout)?
> So the return value isn't EINTR.
Yes.
> (If an fd is readable on entry, the SIG_URG could have happened much earlier.)
Why not? See the pseudo code above. It was blocked before pselect() was called.
So SIG_URG can be already pending when pselect() is called but since an fd is
already ready on entry pselect() restores the old sigmask (and thus blocks SIG_URG
again) and returns success. The handler is not called.
However, if there is no a ready fd, pselect won't block. It will notice SIG_URG,
deliver this signal, and return -EINTR.
> > Why do you think the handler should run?
>
> Think of the application code loop.
> Consider what happens if the signal is SIG_INT - to request the program
> stop.
SIG_INT or SIG_URG ? Again, please look at the pseudo code above. SIG_INT is
blocked and never unblocked.
> After every pselect() call the application looks to see if the handler
> has been called.
> If one of the fds is always readable pselect() will never return EINTR
> but you want the SIG_INT handler run so that the loop gets terminated.
> If you only call the signal handler when EINTR is returned the process
> will never stop.
> So you need to call the handler even when pselect() succeeds/time out.
Then do not block SIG_INT ?
block-all-signals-except-SIG_INT;
ret = pselect(..., sigmask{SIG_URG, SIG_INT});
> > What if SIG_URG comes right after pselect() blocks SIG_URG again? I mean,
> > how this differs the case when it comes before, but a ready fd was already
> > found?
>
> I suspect you need to defer the re-instatement of the original mask
> to the code that calls the signal handlers (which probably should
> be called with the programs signal mask).
This is what the kernel does when the signal is delivered, the original mask
is restored after the signal handler runs.
> So that particular window doesn't exist.
Which window???
Oleg.
On 05/23, Deepa Dinamani wrote:
>
> Ok, since there has been quite a bit of argument here, I will
> backtrack a little bit and maybe it will help us understand what's
> happening here.
> There are many scenarios being discussed on this thread:
> a. State of code before 854a6ed56839a
I think everything was correct,
> b. State after 854a6ed56839a
obviously buggy,
> c. Proposed fix as per the patchset in question.
Nack, sorry. I'll try to finish my patch on Monday. It will restore the state
before 854a6ed56839a and (imo) cleanup/simplify this code.
At leat this is what I think right now. May be I will have to change my mind
after this discussion. But in any case I can't believe I will ever agree with
your fix ;)
> These are particularly meant for a scenario(d) such as below:
>
> 1. block the signals you don't care about.
> 2. syscall()
> 3. unblock the signals blocked in 1.
>
> The problem here is that if there is a signal that is not blocked by 1
> and such a signal is delivered between 1 and 2(since they are not
> atomic), the syscall in 2 might block forever as it never found out
> about the signal.
and that is why we have pselect/etc to make this sequence "atomic".
> As per [a] and let's consider the case of epoll_pwait only first for simplicity.
>
> As I said before, ep_poll() is what checks for signal_pending() and is
> responsible for setting errno to -EINTR when there is a signal.
To clarify, if do_epoll_wait() return -EINTR then signal_pending() is true,
right?
> So if a signal is received after ep_poll() and ep_poll() returns
> success, it is never noticed by the syscall during execution.
What you are saying looks very confusing to me, I will assume that you
meant something like
- a signal SIG_XXX was blocked before sys_epoll_pwait() was called
- sys_epoll_pwait(sigmask) unblocks SIG_XXX according to sigmask
- sys_epoll_pwait() calls do_epoll_wait() which returns success
- SIG_XXX comes after that and it is "never noticed"
Yes. Everything is correct. And see my reply to David, SIG_XXX can even
come _before_ sys_epoll_pwait() was called.
> So the question is does the userspace have to know about this signal
> or not.
If userspace needs to know about SIG_XXX it should not block it, that is all.
> What [b] does is to move the signal check closer to the restoration of
> the signal.
FOR NO REASON, afaics (to simplify, lets forget the problem with the wrong
return value you are trying to fix).
And even if there were ANY reason to do this, note that (with or without this
fix) the signal_pending() check inside restore_user_sigmask() can NOT help,
simply because SIG_XXX can come right after this check.
Oleg.
On 05/23, Deepa Dinamani wrote:
>
> 1. block the signals you don't care about.
> 2. syscall()
> 3. unblock the signals blocked in 1.
and even this part of your email is very confusing. because in this case
we can never miss a signal. I'd say
1. block the signals you don't care about
2. unblock the signals which should interrupt the syscall below
3. syscall()
4. block the signals unblocked in 2.
Oleg.
I think you are misunderstanding what I said. You are taking things
out of context. I was saying here what I did was inspired by why the
syscall was designed to begin with. The syscall below refers to
epoll_wait and not epoll_pwait.
-Deepa
On Fri, May 24, 2019 at 7:19 AM Oleg Nesterov <[email protected]> wrote:
>
> On 05/23, Deepa Dinamani wrote:
> >
> > 1. block the signals you don't care about.
> > 2. syscall()
> > 3. unblock the signals blocked in 1.
>
> and even this part of your email is very confusing. because in this case
> we can never miss a signal. I'd say
>
> 1. block the signals you don't care about
> 2. unblock the signals which should interrupt the syscall below
> 3. syscall()
> 4. block the signals unblocked in 2.
>
> Oleg.
>
On 05/24, Deepa Dinamani wrote:
>
> I think you are misunderstanding what I said.
probably. Everything was very confusing to me from the very beginning.
And yes, I can hardly understand your emails, sorry. This one too :/
> You are taking things
> out of context. I was saying here what I did was inspired by why the
> syscall was designed to begin with.
which syscall?
> The syscall below refers to
> epoll_wait and not epoll_pwait.
So you tried to explain why epoll_pwait() was designed? Or what?
Either way, everything I said below still looks right to me. This probably
means that I still can't understand you.
But this is irrelevant. My main point is that the kernel was correct before
854a6ed568 ("signal: Add restore_user_sigmask()"), the (incomplete) patch I sent
tries to a) restore the correct behaviour and b) simplify/cleanup the code.
> On Fri, May 24, 2019 at 7:19 AM Oleg Nesterov <[email protected]> wrote:
> >
> > On 05/23, Deepa Dinamani wrote:
> > >
> > > 1. block the signals you don't care about.
> > > 2. syscall()
> > > 3. unblock the signals blocked in 1.
> >
> > and even this part of your email is very confusing. because in this case
> > we can never miss a signal. I'd say
> >
> > 1. block the signals you don't care about
> > 2. unblock the signals which should interrupt the syscall below
> > 3. syscall()
> > 4. block the signals unblocked in 2.
> >
> > Oleg.
> >
From: Oleg Nesterov
> Sent: 24 May 2019 14:29
> It seems that we all are just trying to confuse each other. I got lost.
I'm always lost :-)
> On 05/23, David Laight wrote:
> >
> > From: Oleg Nesterov
> > > Sent: 23 May 2019 17:36
> > > On 05/23, David Laight wrote:
> > > >
> > > > From: Oleg Nesterov
> > > > > On 05/23, David Laight wrote:
> > ...
> > > > > Not sure I understand... OK, suppose that you do
> > > > >
> > > > > block-all-signals;
> > > > > ret = pselect(..., sigmask(SIG_URG));
> > > > >
> > > > > if it returns success/timeout then the handler for SIG_URG should not be called?
> > > >
> > > > Ugg...
> > > > Posix probably allows the signal handler be called at the point the event
> > > > happens rather than being deferred until the system call completes.
> > > > Queueing up the signal handler to be run at a later time (syscall exit)
> > > > certainly makes sense.
> > > > Definitely safest to call the signal handler even if success/timeout
> > > > is returned.
> > >
> > > Why?
> > >
> > > > pselect() exists to stop the entry race, not the exit one.
> > >
> > > pselect() has to block SIG_URG again before it returns to user-mode, right?
> >
> > Yep.
> > So the signal handler can't be called for a signal that happens after
> > pselect() returns.
>
> Yes. And "after pselect() returns" actually means "after pselect() restores
> the old sigmask while it returns to user mode".
>
> > > Suppose pselect() finds a ready fd, and this races with SIG_URG.
> >
> > You mean if SIG_URG is raised after a ready fd is found (or even timeout)?
> > So the return value isn't EINTR.
>
> Yes.
>
> > (If an fd is readable on entry, the SIG_URG could have happened much earlier.)
>
> Why not? See the pseudo code above. It was blocked before pselect() was called.
> So SIG_URG can be already pending when pselect() is called but since an fd is
> already ready on entry pselect() restores the old sigmask (and thus blocks SIG_URG
> again) and returns success. The handler is not called.
>
> However, if there is no a ready fd, pselect won't block. It will notice SIG_URG,
> deliver this signal, and return -EINTR.
To my mind changing the signal mask should be enough to get a masked
signal handler called - even if the mask is reset before the syscall exits.
There shouldn't be any need for an interruptible wait to be interrupted.
I suspect that if you send a signal to a process that is looping
in userspace (on a different) the signal handler is called on the next
exit to userspace regardless as to whether the kernel blocks.
epoll and pselect shouldn't be any different.
Having the signal unmasked at any time should be enough to get it called.
...
> > > What if SIG_URG comes right after pselect() blocks SIG_URG again? I mean,
> > > how this differs the case when it comes before, but a ready fd was already
> > > found?
> >
> > I suspect you need to defer the re-instatement of the original mask
> > to the code that calls the signal handlers (which probably should
> > be called with the programs signal mask).
>
> This is what the kernel does when the signal is delivered, the original mask
> is restored after the signal handler runs.
I'd have thought that the original signal mask (all blocked in the examples)
should be restored before the signal handler is called.
After all the signal handler is allowed to modify the processes signal mask.
I've had horrid thoughts about SIG_SUSPEND :-)
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
From: David Laight
> Sent: 24 May 2019 16:00
...
> I've had horrid thoughts about SIG_SUSPEND :-)
Not to mention exiting signal handlers with longjmp().
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Fri, May 24, 2019 at 7:11 AM Oleg Nesterov <[email protected]> wrote:
>
> On 05/23, Deepa Dinamani wrote:
> >
> > Ok, since there has been quite a bit of argument here, I will
> > backtrack a little bit and maybe it will help us understand what's
> > happening here.
> > There are many scenarios being discussed on this thread:
> > a. State of code before 854a6ed56839a
>
> I think everything was correct,
There were 2 things that were wrong:
1. If an unblocked signal was received, after the ep_poll(), then the
return status did not indicate that. This is expected behavior
according to man page. If this is indeed what is expected then the man
page should note that signal will be delivered in this case and return
code will still be 0.
"EINTR
The call was interrupted by a signal handler before either any of the
requested events occurred or the timeout expired; see signal(7)."
2. The restoring of the sigmask is done right in the syscall part and
not while exiting the syscall and if you get a blocked signal here,
you will deliver this to userspace.
> > b. State after 854a6ed56839a
>
> obviously buggy,
Ok, then can you point out what specifically was wrong with
854a6ed56839a? And, not how it could be more simple?
> > c. Proposed fix as per the patchset in question.
>
> > As per [a] and let's consider the case of epoll_pwait only first for simplicity.
> >
> > As I said before, ep_poll() is what checks for signal_pending() and is
> > responsible for setting errno to -EINTR when there is a signal.
>
> To clarify, if do_epoll_wait() return -EINTR then signal_pending() is true,
> right?
Yes, the case I'm talking about is when do_epoll_wait() returns 0 and
then you get a signal.
> > So if a signal is received after ep_poll() and ep_poll() returns
> > success, it is never noticed by the syscall during execution.
>
> What you are saying looks very confusing to me, I will assume that you
> meant something like
>
> - a signal SIG_XXX was blocked before sys_epoll_pwait() was called
>
> - sys_epoll_pwait(sigmask) unblocks SIG_XXX according to sigmask
>
> - sys_epoll_pwait() calls do_epoll_wait() which returns success
>
> - SIG_XXX comes after that and it is "never noticed"
>
> Yes. Everything is correct. And see my reply to David, SIG_XXX can even
> come _before_ sys_epoll_pwait() was called.
No, I'm talking about a signal that was not blocked.
> > So the question is does the userspace have to know about this signal
> > or not.
>
> If userspace needs to know about SIG_XXX it should not block it, that is all.
What should be the return value if a signal is detected after a fd completed?
> > What [b] does is to move the signal check closer to the restoration of
> > the signal.
>
> FOR NO REASON, afaics (to simplify, lets forget the problem with the wrong
> return value you are trying to fix).
As I already pointed out, the restoring of the sigmask is done during
the syscall and not while exiting the syscall and if you get a blocked
signal here, you will deliver this to userspace.
> And even if there were ANY reason to do this, note that (with or without this
> fix) the signal_pending() check inside restore_user_sigmask() can NOT help,
> simply because SIG_XXX can come right after this check.
This I pointed out already that we should probably make this sequence atomic.
-Deepa
On 05/24, David Laight wrote:
>
> From: Oleg Nesterov
> > Sent: 24 May 2019 14:29
> > It seems that we all are just trying to confuse each other. I got lost.
>
> I'm always lost :-)
same here ;)
> To my mind changing the signal mask should be enough to get a masked
> signal handler called - even if the mask is reset before the syscall exits.
well, the kernel doesn't do this, and on purpose.
> There shouldn't be any need for an interruptible wait to be interrupted.
can't parse ;)
> I suspect that if you send a signal to a process that is looping
> in userspace (on a different) the signal handler is called on the next
> exit to userspace regardless as to whether the kernel blocks.
>
> epoll and pselect shouldn't be any different.
They differ exactly because they manipulate the blocked mask,
> Having the signal unmasked at any time should be enough to get it called.
No. The sigmask passed to pselect() tells the kernel which signals should
interrupt the syscall if it blocks. The fact that pselect() actually unblocks
a signal is just the internal implementation detail.
> > > I suspect you need to defer the re-instatement of the original mask
> > > to the code that calls the signal handlers (which probably should
> > > be called with the programs signal mask).
> >
> > This is what the kernel does when the signal is delivered, the original mask
> > is restored after the signal handler runs.
>
> I'd have thought that the original signal mask (all blocked in the examples)
> should be restored before the signal handler is called.
No. And this means that if you have 2 pending signals, they both will be delivered.
Unless of course sigaction->sa_mask includes the 2nd one.
> After all the signal handler is allowed to modify the processes signal mask.
only untill the handler returns.
> I've had horrid thoughts about SIG_SUSPEND :-)
google knows nothing about SIG_SUSPEND, neither me ;)
Oleg.
On 05/24, David Laight wrote:
>
> From: David Laight
> > Sent: 24 May 2019 16:00
> ...
> > I've had horrid thoughts about SIG_SUSPEND :-)
>
> Not to mention exiting signal handlers with longjmp().
that is why we have siglongjmp().
Oleg.
On 05/24, Deepa Dinamani wrote:
>
> On Fri, May 24, 2019 at 7:11 AM Oleg Nesterov <[email protected]> wrote:
> >
> > On 05/23, Deepa Dinamani wrote:
> > >
> > > Ok, since there has been quite a bit of argument here, I will
> > > backtrack a little bit and maybe it will help us understand what's
> > > happening here.
> > > There are many scenarios being discussed on this thread:
> > > a. State of code before 854a6ed56839a
> >
> > I think everything was correct,
>
> There were 2 things that were wrong:
>
> 1. If an unblocked signal was received, after the ep_poll(), then the
> return status did not indicate that.
Yes,
> This is expected behavior
> according to man page. If this is indeed what is expected then the man
> page should note that signal will be delivered in this case and return
> code will still be 0.
>
> "EINTR
> The call was interrupted by a signal handler before either any of the
> requested events occurred or the timeout expired; see signal(7)."
and what do you think the man page could say?
This is obviously possible for any syscall, and we can't avoid this. A signal
can come right after syscall insn completes. The signal handler will be called
but this won't change $rax, user-space can see return code == 0 or anything else.
And this doesn't differ from the case when the signal comes before syscall returns.
> 2. The restoring of the sigmask is done right in the syscall part and
> not while exiting the syscall and if you get a blocked signal here,
> you will deliver this to userspace.
So I assume that this time you are talking about epoll_pwait() and not epoll_wait()...
And I simply can't understand you. But yes, if the original mask doesn't include
the pending signal it will be delivered while the syscall can return success/timout
or -EFAULT or anything.
This is correct, see above.
> > > b. State after 854a6ed56839a
> >
> > obviously buggy,
>
> Ok, then can you point out what specifically was wrong with
> 854a6ed56839a?
Cough. If nothing else the lost -EINTR?
> And, not how it could be more simple?
Well, I already sent the patch and after that I even showed you the code with the
patch applied. See https://lore.kernel.org/lkml/[email protected]/
> > What you are saying looks very confusing to me, I will assume that you
> > meant something like
> >
> > - a signal SIG_XXX was blocked before sys_epoll_pwait() was called
> >
> > - sys_epoll_pwait(sigmask) unblocks SIG_XXX according to sigmask
> >
> > - sys_epoll_pwait() calls do_epoll_wait() which returns success
> >
> > - SIG_XXX comes after that and it is "never noticed"
> >
> > Yes. Everything is correct. And see my reply to David, SIG_XXX can even
> > come _before_ sys_epoll_pwait() was called.
>
> No, I'm talking about a signal that was not blocked.
OK, see above.
> > > So the question is does the userspace have to know about this signal
> > > or not.
> >
> > If userspace needs to know about SIG_XXX it should not block it, that is all.
>
> What should be the return value if a signal is detected after a fd completed?
Did you mean "if a signal is detected after a ready fd was already found" ?
In this case the return value should report success. But I have already lost,
this all looks irrelevant wrt to fix we need.
> > > What [b] does is to move the signal check closer to the restoration of
> > > the signal.
> >
> > FOR NO REASON, afaics (to simplify, lets forget the problem with the wrong
> > return value you are trying to fix).
>
> As I already pointed out, the restoring of the sigmask is done during
> the syscall and not while exiting the syscall and if you get a blocked
> signal here, you will deliver this to userspace.
>
> > And even if there were ANY reason to do this, note that (with or without this
> > fix) the signal_pending() check inside restore_user_sigmask() can NOT help,
> > simply because SIG_XXX can come right after this check.
>
> This I pointed out already that we should probably make this sequence atomic.
See above.
Oleg.
From: Oleg Nesterov
> Sent: 24 May 2019 16:44
> > To my mind changing the signal mask should be enough to get a masked
> > signal handler called - even if the mask is reset before the syscall exits.
>
> well, the kernel doesn't do this, and on purpose.
>
> > There shouldn't be any need for an interruptible wait to be interrupted.
>
> can't parse ;)
>
> > I suspect that if you send a signal to a process that is looping
> > in userspace (on a different) the signal handler is called on the next
> > exit to userspace regardless as to whether the kernel blocks.
> >
> > epoll and pselect shouldn't be any different.
>
> They differ exactly because they manipulate the blocked mask,
>
> > Having the signal unmasked at any time should be enough to get it called.
>
> No. The sigmask passed to pselect() tells the kernel which signals should
> interrupt the syscall if it blocks. The fact that pselect() actually unblocks
> a signal is just the internal implementation detail.
If you take that line of reasoning the signal handler shouldn't be called
at all.
For pselect() (which ought to work the same way as epoll_pwait()) the
man page states that the current signal mask is replaced by the specified
one for the duration of the call - so you'd expect signal handlers to run
even if pselect() returns >= 0.
Consider a program that disables all signals at the top of main()
then has a processing loop with epoll_pwait() (or pselect()) at the
top) that enables a variety of signals.
It would be reasonable to expect that a signal handler would run
even if one of the fds was always 'ready'.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Fri, May 24, 2019 at 9:33 AM Oleg Nesterov <[email protected]> wrote:
>
> On 05/24, Deepa Dinamani wrote:
> >
> > On Fri, May 24, 2019 at 7:11 AM Oleg Nesterov <[email protected]> wrote:
> > >
> > > On 05/23, Deepa Dinamani wrote:
> > > >
> > > > Ok, since there has been quite a bit of argument here, I will
> > > > backtrack a little bit and maybe it will help us understand what's
> > > > happening here.
> > > > There are many scenarios being discussed on this thread:
> > > > a. State of code before 854a6ed56839a
> > >
> > > I think everything was correct,
> >
> > There were 2 things that were wrong:
> >
> > 1. If an unblocked signal was received, after the ep_poll(), then the
> > return status did not indicate that.
>
> Yes,
>
> > This is expected behavior
> > according to man page. If this is indeed what is expected then the man
> > page should note that signal will be delivered in this case and return
> > code will still be 0.
> >
> > "EINTR
> > The call was interrupted by a signal handler before either any of the
> > requested events occurred or the timeout expired; see signal(7)."
>
> and what do you think the man page could say?
Maybe clarify that a signal handler can be invoked even if the syscall
return indicates a success.
Maybe a crude userspace application could do something like this:
sig_handler()
{
set global abort = 1
}
poll_the_fds()
{
ret = epoll_pwait()
if (ret)
return ret
if (abort) # but this abort should be ignored
if ret was 0.
return try_again
}
> This is obviously possible for any syscall, and we can't avoid this. A signal
> can come right after syscall insn completes. The signal handler will be called
> but this won't change $rax, user-space can see return code == 0 or anything else.
>
> And this doesn't differ from the case when the signal comes before syscall returns.
But, these syscalls are depending on there signals. I would assume for
the purpose of these syscalls that the execution is done when we
updated the saved_sigmask. We can pick a different point per syscall
like ep_poll() also, but then we need to probably make it clear for
each such syscall.
> > 2. The restoring of the sigmask is done right in the syscall part and
> > not while exiting the syscall and if you get a blocked signal here,
> > you will deliver this to userspace.
>
> So I assume that this time you are talking about epoll_pwait() and not epoll_wait()...
Yes.
> And I simply can't understand you. But yes, if the original mask doesn't include
> the pending signal it will be delivered while the syscall can return success/timout
> or -EFAULT or anything.
>
> This is correct, see above.
Look at the code before 854a6ed56839a:
/*
* If we changed the signal mask, we need to restore the original one.
* In case we've got a signal while waiting, we do not restore the
* signal mask yet, and we allow do_signal() to deliver the signal on
* the way back to userspace, before the signal mask is restored.
*/
if (sigmask) {
####### This err has not been changed since ep_poll()
####### So if there is a signal before this point, but
err = 0, then we goto else.
if (err == -EINTR) {
memcpy(¤t->saved_sigmask, &sigsaved,
sizeof(sigsaved));
set_restore_sigmask();
} else
############ This is a problem if there is signal
pending that is sigmask should block.
########### This is the whole reason we have
current->saved_sigmask?
set_current_blocked(&sigsaved);
}
> > > > b. State after 854a6ed56839a
> > >
> > > obviously buggy,
> >
> > Ok, then can you point out what specifically was wrong with
> > 854a6ed56839a?
>
> Cough. If nothing else the lost -EINTR?
This was my theory. My basis behind the theory was [1](the issue with
return value not being updated) above. And, you are saying this is ok.
854a6ed56839a also has timing differences compared to the original
code. So unless we are sure what was uncovered because of
854a6ed56839a, we might just be masking a pre-existing problem by
reverting it. So I think we should code review 854a6ed56839a and
figure out what is wrong programatically before just reverting it.
> > And, not how it could be more simple?
Oh, I was not asking here. I was saying let's please discuss what's
wrong before simplifying the code.
-Deepa
Deepa,
it seems that we both are saying the same things again and again, and we
simply can't understand each other.
I'll try to write another email to restart this discussion. Tomorrow, somehow
I can't wake up today.
And let me repeat, of course I can be wrong. IOW, it is not that I am trying
to blame you for all this confusion.
On 05/24, Deepa Dinamani wrote:
>
> > > Ok, then can you point out what specifically was wrong with
> > > 854a6ed56839a?
> >
> > Cough. If nothing else the lost -EINTR?
>
> This was my theory. My basis behind the theory was [1](the issue with
> return value not being updated) above. And, you are saying this is ok.
I agree that "the lost -EINTR" above was not clear. I'll try to clarify
what I think is not OK.
Oleg.
From: Deepa Dinamani
> Sent: 24 May 2019 18:02
...
> Maybe a crude userspace application could do something like this:
>
> sig_handler()
> {
> set global abort = 1
> }
>
> poll_the_fds()
> {
> ret = epoll_pwait()
> if (ret)
> return ret
> if (abort)
> // but this abort should be ignored if ret was 0.
> return try_again
>
> }
As an application writer I'd want to see 'abort == 1' even
if epoll_pwait() returned that an fd was 'ready'.
So the code above should probably be:
wait_again:
ret = epoll_pwait();
if (abort)
process_abort();
if (ret <= 0) {
if (ret == 0)
process_timeout();
if (ret == 0 || errno == EINTR)
goto wait_again;
// Something went horribly wrong in epoll_pwait()
return ret;
}
// process the 'ready' fds
It would be non-unreasonable for the application to have
all signals blocked except during the epoll_pwait().
So the application needs the signal handler for SIG_INT (etc)
to be called even if one of the fd is always 'ready'.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
From: Deepa Dinamani
> Sent: 24 May 2019 18:02
...
> Look at the code before 854a6ed56839a:
>
> /*
> * If we changed the signal mask, we need to restore the original one.
> * In case we've got a signal while waiting, we do not restore the
> * signal mask yet, and we allow do_signal() to deliver the signal on
> * the way back to userspace, before the signal mask is restored.
> */
> if (sigmask) {
> ####### This err has not been changed since ep_poll()
> ####### So if there is a signal before this point, but
> err = 0, then we goto else.
> if (err == -EINTR) {
> memcpy(¤t->saved_sigmask, &sigsaved,
> sizeof(sigsaved));
> set_restore_sigmask();
> } else
> ############ This is a problem if there is signal
> pending that is sigmask should block.
> ########### This is the whole reason we have
> current->saved_sigmask?
> set_current_blocked(&sigsaved);
> }
What happens if all that crap is just deleted (I presume from the
bottom of ep_wait()) ?
I'm guessing that on the way back to userspace signal handlers for
signals enabled in the process's current mask (the one specified
to epoll_pwait) get called.
Then the signal mask is loaded from current->saved_sigmask and
and enabled signal handlers are called again.
No special code there that depends on the syscall result, errno
of the syscall number.
That seems exactly correct!
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
> On May 28, 2019, at 2:12 AM, David Laight <[email protected]> wrote:
>
> From: Deepa Dinamani
>> Sent: 24 May 2019 18:02
> ...
>> Look at the code before 854a6ed56839a:
>>
>> /*
>> * If we changed the signal mask, we need to restore the original one.
>> * In case we've got a signal while waiting, we do not restore the
>> * signal mask yet, and we allow do_signal() to deliver the signal on
>> * the way back to userspace, before the signal mask is restored.
>> */
>> if (sigmask) {
>> ####### This err has not been changed since ep_poll()
>> ####### So if there is a signal before this point, but
>> err = 0, then we goto else.
>> if (err == -EINTR) {
>> memcpy(¤t->saved_sigmask, &sigsaved,
>> sizeof(sigsaved));
>> set_restore_sigmask();
>> } else
>> ############ This is a problem if there is signal
>> pending that is sigmask should block.
>> ########### This is the whole reason we have
>> current->saved_sigmask?
>> set_current_blocked(&sigsaved);
>> }
>
> What happens if all that crap is just deleted (I presume from the
> bottom of ep_wait()) ?
Hmm, you have to update the saved_sigmask or the sigmask.
> I'm guessing that on the way back to userspace signal handlers for
> signals enabled in the process's current mask (the one specified
> to epoll_pwait) get called.
> Then the signal mask is loaded from current->saved_sigmask and
> and enabled signal handlers are called again.
Who is saving this saved_sigmask that is being restored on the way back?
> No special code there that depends on the syscall result, errno
> of the syscall number.
I didn’t say this has anything to do with errno.
-Deepa
From: Deepa Dinamani
> Sent: 28 May 2019 12:38
> > On May 28, 2019, at 2:12 AM, David Laight <[email protected]> wrote:
> >
> > From: Deepa Dinamani
> >> Sent: 24 May 2019 18:02
> > ...
> >> Look at the code before 854a6ed56839a:
> >>
> >> /*
> >> * If we changed the signal mask, we need to restore the original one.
> >> * In case we've got a signal while waiting, we do not restore the
> >> * signal mask yet, and we allow do_signal() to deliver the signal on
> >> * the way back to userspace, before the signal mask is restored.
> >> */
> >> if (sigmask) {
> >> ####### This err has not been changed since ep_poll()
> >> ####### So if there is a signal before this point, but
> >> err = 0, then we goto else.
> >> if (err == -EINTR) {
> >> memcpy(¤t->saved_sigmask, &sigsaved,
> >> sizeof(sigsaved));
> >> set_restore_sigmask();
> >> } else
> >> ############ This is a problem if there is signal
> >> pending that is sigmask should block.
> >> ########### This is the whole reason we have
> >> current->saved_sigmask?
> >> set_current_blocked(&sigsaved);
> >> }
> >
> > What happens if all that crap is just deleted (I presume from the
> > bottom of ep_wait()) ?
>
> Hmm, you have to update the saved_sigmask or the sigmask.
Doesn't the top of ep_poll() do all that?
Something like copying the current sigmask to the saved_sigmask
and the user-supplied sigmask to the current sigmask.
Otherwise the sleeps won't be interruptible.
It is worth noting that both pselect() and epoll_pwait() differ
from sigwait() (and friends) which were (IIRC) the first system
calls to try to remove the timing windows associated with waiting
for signals.
sigwait() returns the signal number and removes it from the pending
mask - so the signal handler won't be called for the returned signal.
(Unless it wasn't actually masked!)
So the sigwait() code does have to restore the signal mask itself.
> > I'm guessing that on the way back to userspace signal handlers for
> > signals enabled in the process's current mask (the one specified
> > to epoll_pwait) get called.
> > Then the signal mask is loaded from current->saved_sigmask and
> > and enabled signal handlers are called again.
>
> Who is saving this saved_sigmask that is being restored on the way back?
It has to be saved when the user-supplied one in installed.
> > No special code there that depends on the syscall result, errno
> > of the syscall number.
>
> I didn’t say this has anything to do with errno.
The code snippet above checks it ...
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Mon, May 27, 2019 at 8:04 AM Oleg Nesterov <[email protected]> wrote:
>
> Deepa,
>
> it seems that we both are saying the same things again and again, and we
> simply can't understand each other.
Oleg, I'm sorry for the confusion. Maybe I should point out what I
agree with also.
I agree that signal handller being called and return value not being
altered is an issue with other syscalls also. I was just wondering if
some userspace code assumption would be assuming this. This is not a
kernel bug.
But, I do not think we have an understanding of what was wrong in
854a6ed56839a anymore since you pointed out that my assumption was not
correct that the signal handler being called without errno being set
is wrong.
One open question: this part of epoll_pwait was already broken before
854a6ed56839a. Do you agree?
if (err == -EINTR) {
memcpy(¤t->saved_sigmask, &sigsaved,
sizeof(sigsaved));
set_restore_sigmask();
} else
set_current_blocked(&sigsaved);
What to do next?
We could just see if your optimization patch resolves Eric's issue.
Or, I could revert the signal_pending() check and provide a fix
something like below(not a complete patch) since mainline has this
regression. Eric had tested something like this works also. And, I can
continue to look at what was wrong with 854a6ed56839a in the first
place. Let me know what you prefer:
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+int restore_user_sigmask(const void __user *usigmask, sigset_t
*sigsaved, int sig_pending)
{
if (!usigmask)
return;
/*
* When signals are pending, do not restore them here.
* Restoring sigmask here can lead to delivering signals that the above
* syscalls are intended to block because of the sigmask passed in.
*/
+ if (sig_pending) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
}
@@ -2330,7 +2330,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
epoll_event __user *, events,
error = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ signal_detected = restore_user_sigmask(sigmask, &sigsaved,
error == -EINTR);
-Deepa
Al, Linus, Eric, please help.
The previous discussion was very confusing, we simply can not understand each
other.
To me everything looks very simple and clear, but perhaps I missed something
obvious? Please correct me.
I think that the following code is correct
int interrupted = 0;
void sigint_handler(int sig)
{
interrupted = 1;
}
int main(void)
{
sigset_t sigint, empty;
sigemptyset(&sigint);
sigaddset(&sigint, SIGINT);
sigprocmask(SIG_BLOCK, &sigint, NULL);
signal(SIGINT, sigint_handler);
sigemptyset(&empty); // so pselect() unblocks SIGINT
ret = pselect(..., &empty);
if (ret >= 0) // sucess or timeout
assert(!interrupted);
if (interrupted)
assert(ret == -EINTR);
}
IOW, if pselect(sigmask) temporary unblocks SIGINT according to sigmask, this
signal should not be delivered if a ready fd was found or timeout. The signal
handle should only run if ret == -EINTR.
(pselect() can be interrupted by any other signal which has a handler. In this
case the handler can be called even if ret >= 0. This is correct, I fail to
understand why some people think this is wrong, and in any case we simply can't
avoid this).
This was true until 854a6ed56839a ("signal: Add restore_user_sigmask()"),
now this is broken by the signal_pending() check in restore_user_sigmask().
This patch https://lore.kernel.org/lkml/[email protected]/
turns 0 into -EINTR if signal_pending(), but I think we should simply restore
the old behaviour and simplify the code.
See the compile-tested patch at the end. Of course, the new _xxx() helpers
should be renamed somehow. fs/aio.c doesn't look right with or without this
patch, but iiuc this is what it did before 854a6ed56839a.
Let me show the code with the patch applied. I am using epoll_pwait() as an
example because it looks very simple.
static inline void set_restore_sigmask(void)
{
// WARN_ON(!TIF_SIGPENDING) was removed by this patch
current->restore_sigmask = true;
}
int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
{
sigset_t *kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
// we can safely modify ->saved_sigmask/restore_sigmask, they has no meaning
// until the syscall returns.
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(kmask);
return 0;
}
void update_xxx(bool interrupted)
{
// the main reason for this helper is WARN_ON(!TIF_SIGPENDING) which was "moved"
// from set_restore_sigmask() above.
if (interrupted)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
else
restore_saved_sigmask();
}
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
int error;
error = set_xxx(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
update_xxx(error == -EINTR);
return error;
}
Oleg.
---
fs/aio.c | 40 ++++++++++++++---------
fs/eventpoll.c | 12 +++----
fs/io_uring.c | 12 +++----
fs/select.c | 40 +++++++----------------
include/linux/compat.h | 4 +--
include/linux/sched/signal.h | 2 --
include/linux/signal.h | 6 ++--
kernel/signal.c | 77 ++++++++++++++++----------------------------
8 files changed, 74 insertions(+), 119 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1f..8315bd2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2093,8 +2093,8 @@ SYSCALL_DEFINE6(io_pgetevents,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_timespec64(&ts, timeout)))
@@ -2103,13 +2103,15 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ update_xxx(interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2126,8 +2128,8 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
@@ -2137,13 +2139,15 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ update_xxx(interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2191,8 +2195,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_old_timespec32(&t, timeout))
@@ -2201,13 +2205,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ update_xxx(interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2224,8 +2230,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_timespec64(&t, timeout))
@@ -2234,13 +2240,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ update_xxx(interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d..0b1a337 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2318,19 +2318,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ error = set_xxx(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved);
+ update_xxx(error == -EINTR);
return error;
}
@@ -2343,19 +2341,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ err = set_compat_xxx(sigmask, sigsetsize);
if (err)
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved);
+ update_xxx(err == -EINTR);
return err;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 310f8d1..b5b99e2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2180,7 +2180,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
struct io_cq_ring *ring = ctx->cq_ring;
- sigset_t ksigmask, sigsaved;
int ret;
if (io_cqring_events(ring) >= min_events)
@@ -2189,24 +2188,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- &ksigmask, &sigsaved, sigsz);
+ ret = set_compat_xxx((const compat_sigset_t __user *)sig,
+ sigsz);
else
#endif
- ret = set_user_sigmask(sig, &ksigmask,
- &sigsaved, sigsz);
+ ret = set_xxx(sig, sigsz);
if (ret)
return ret;
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
+ update_xxx(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
- if (sig)
- restore_user_sigmask(sig, &sigsaved);
-
return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
}
diff --git a/fs/select.c b/fs/select.c
index 6cbc9ff..7eab132 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize,
enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -753,15 +752,14 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
+ update_xxx(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1087,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1100,18 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ update_xxx(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
@@ -1123,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1136,18 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ update_xxx(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1322,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
void __user *tsp, compat_sigset_t __user *sigmask,
compat_size_t sigsetsize, enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1345,15 +1336,14 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
+ update_xxx(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1406,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1419,18 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ update_xxx(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1442,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
unsigned int, nfds, struct __kernel_timespec __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1455,18 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_xxx(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ update_xxx(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ebddcb6..b20b001 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -138,9 +138,7 @@ typedef struct {
compat_sigset_word sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
- size_t sigsetsize);
+int set_compat_xxx(const compat_sigset_t __user *umask, size_t sigsetsize);
struct compat_sigaction {
#ifndef __ARCH_HAS_IRIX_SIGACTION
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 38a0f07..8b5b537 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -417,7 +417,6 @@ void task_join_group_stop(struct task_struct *task);
static inline void set_restore_sigmask(void)
{
set_thread_flag(TIF_RESTORE_SIGMASK);
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
@@ -448,7 +447,6 @@ static inline bool test_and_clear_restore_sigmask(void)
static inline void set_restore_sigmask(void)
{
current->restore_sigmask = true;
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9702016..65f84ac 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,10 +273,8 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
- sigset_t *sigsaved);
+extern int set_xxx(const sigset_t __user *umask, size_t sigsetsize);
+extern void update_xxx(bool interupted);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index d7b9d14..0a1ec68 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2861,79 +2861,56 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with update_xxx() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t *kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
+
+void update_xxx(bool interrupted)
+{
+ if (interrupted)
+ WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+ else
+ restore_saved_sigmask();
+}
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
- size_t sigsetsize)
+int set_compat_xxx(const compat_sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t *kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (signal_pending(current)) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
From: Oleg Nesterov
> Sent: 29 May 2019 17:12
> Al, Linus, Eric, please help.
>
> The previous discussion was very confusing, we simply can not understand each
> other.
>
> To me everything looks very simple and clear, but perhaps I missed something
> obvious? Please correct me.
>
> I think that the following code is correct
>
> int interrupted = 0;
>
> void sigint_handler(int sig)
> {
> interrupted = 1;
> }
>
> int main(void)
> {
> sigset_t sigint, empty;
>
> sigemptyset(&sigint);
> sigaddset(&sigint, SIGINT);
> sigprocmask(SIG_BLOCK, &sigint, NULL);
>
> signal(SIGINT, sigint_handler);
>
> sigemptyset(&empty); // so pselect() unblocks SIGINT
>
> ret = pselect(..., &empty);
^^^^^ sigint
>
> if (ret >= 0) // sucess or timeout
> assert(!interrupted);
>
> if (interrupted)
> assert(ret == -EINTR);
> }
>
> IOW, if pselect(sigmask) temporary unblocks SIGINT according to sigmask, this
> signal should not be delivered if a ready fd was found or timeout. The signal
> handle should only run if ret == -EINTR.
Personally I think that is wrong.
Given code like the above that has:
while (!interrupted) {
pselect(..., &sigint);
// process available data
}
You want the signal handler to be executed even if one of the fds
always has available data.
Otherwise you can't interrupt a process that is always busy.
One option is to return -EINTR if a signal is pending when the mask
is updated - before even looking at anything else.
Signals that happen later on (eg after a timeout) need not be reported
(until the next time around the loop).
> (pselect() can be interrupted by any other signal which has a handler. In this
> case the handler can be called even if ret >= 0. This is correct, I fail to
> understand why some people think this is wrong, and in any case we simply can't
> avoid this).
You mean any signal that isn't blocked when pselect() is called....
> This was true until 854a6ed56839a ("signal: Add restore_user_sigmask()"),
> now this is broken by the signal_pending() check in restore_user_sigmask().
>
> This patch https://lore.kernel.org/lkml/[email protected]/
> turns 0 into -EINTR if signal_pending(), but I think we should simply restore
> the old behaviour and simplify the code.
>
> See the compile-tested patch at the end. Of course, the new _xxx() helpers
> should be renamed somehow. fs/aio.c doesn't look right with or without this
> patch, but iiuc this is what it did before 854a6ed56839a.
>
> Let me show the code with the patch applied. I am using epoll_pwait() as an
> example because it looks very simple.
>
>
> static inline void set_restore_sigmask(void)
> {
> // WARN_ON(!TIF_SIGPENDING) was removed by this patch
> current->restore_sigmask = true;
> }
>
> int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
> {
> sigset_t *kmask;
^ no '*' here, add & before uses.
>
> if (!umask)
> return 0;
> if (sigsetsize != sizeof(sigset_t))
> return -EINVAL;
> if (copy_from_user(kmask, umask, sizeof(sigset_t)))
> return -EFAULT;
>
> // we can safely modify ->saved_sigmask/restore_sigmask, they has no meaning
> // until the syscall returns.
> set_restore_sigmask();
> current->saved_sigmask = current->blocked;
> set_current_blocked(kmask);
>
> return 0;
> }
>
>
> void update_xxx(bool interrupted)
> {
> // the main reason for this helper is WARN_ON(!TIF_SIGPENDING) which was "moved"
> // from set_restore_sigmask() above.
> if (interrupted)
> WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> else
> restore_saved_sigmask();
> }
I looked at the code earlier, but failed to find the code that actually
delivers the signals.
It may be 'racy' with update_xxx() regardless of whether that is
looking for -EINTR or just a pending signal.
I assume that TIF_SIGPENGING is used to (not) short-circuit the
system call return path so that signals get delivered.
So that it is important that update_xxx() calls restore_saved_sigmask()
if there is no signal pending.
(Although a signal can happen after the test - which can/will be ignored
until the signal is enabled again.)
restore_saved_sigmask() must itself be able to set TIF_SIGPENDING
(the inner sigmask could be more restrictive!).
If restore_saved_sigmask() isn't called here, the syscall return
path must do it after calling all the handlers and after clearing
TIF_SIGPENDING, and then call unmasked handlers again.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Resending due to inadvertent conversion of prior message to html.
On Wed, May 29, 2019 at 9:12 AM Oleg Nesterov <[email protected]> wrote:
> Al, Linus, Eric, please help.
>
> The previous discussion was very confusing, we simply can not understand each
> other.
>
> To me everything looks very simple and clear, but perhaps I missed something
> obvious? Please correct me.
>
> I think that the following code is correct
>
> int interrupted = 0;
>
> void sigint_handler(int sig)
> {
> interrupted = 1;
> }
>
> int main(void)
> {
> sigset_t sigint, empty;
>
> sigemptyset(&sigint);
> sigaddset(&sigint, SIGINT);
> sigprocmask(SIG_BLOCK, &sigint, NULL);
>
> signal(SIGINT, sigint_handler);
>
> sigemptyset(&empty); // so pselect() unblocks SIGINT
>
> ret = pselect(..., &empty);
>
> if (ret >= 0) // sucess or timeout
> assert(!interrupted);
>
> if (interrupted)
> assert(ret == -EINTR);
> }
>
> IOW, if pselect(sigmask) temporary unblocks SIGINT according to sigmask, this
> signal should not be delivered if a ready fd was found or timeout. The signal
> handle should only run if ret == -EINTR.
>
> (pselect() can be interrupted by any other signal which has a handler. In this
> case the handler can be called even if ret >= 0. This is correct, I fail to
> understand why some people think this is wrong, and in any case we simply can't
> avoid this).
>
> This was true until 854a6ed56839a ("signal: Add restore_user_sigmask()"),
> now this is broken by the signal_pending() check in restore_user_sigmask().
>
> This patch https://lore.kernel.org/lkml/[email protected]/
> turns 0 into -EINTR if signal_pending(), but I think we should simply restore
> the old behaviour and simplify the code.
>
> See the compile-tested patch at the end. Of course, the new _xxx() helpers
> should be renamed somehow. fs/aio.c doesn't look right with or without this
> patch, but iiuc this is what it did before 854a6ed56839a.
>
> Let me show the code with the patch applied. I am using epoll_pwait() as an
> example because it looks very simple.
>
>
> static inline void set_restore_sigmask(void)
> {
> // WARN_ON(!TIF_SIGPENDING) was removed by this patch
> current->restore_sigmask = true;
> }
>
> int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
> {
> sigset_t *kmask;
>
> if (!umask)
> return 0;
> if (sigsetsize != sizeof(sigset_t))
> return -EINVAL;
> if (copy_from_user(kmask, umask, sizeof(sigset_t)))
> return -EFAULT;
>
> // we can safely modify ->saved_sigmask/restore_sigmask, they has no meaning
> // until the syscall returns.
> set_restore_sigmask();
> current->saved_sigmask = current->blocked;
> set_current_blocked(kmask);
>
> return 0;
> }
>
>
> void update_xxx(bool interrupted)
> {
> // the main reason for this helper is WARN_ON(!TIF_SIGPENDING) which was "moved"
> // from set_restore_sigmask() above.
> if (interrupted)
> WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> else
> restore_saved_sigmask();
> }
>
> SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> int, maxevents, int, timeout, const sigset_t __user *, sigmask,
> size_t, sigsetsize)
> {
> int error;
>
> error = set_xxx(sigmask, sigsetsize);
> if (error)
> return error;
>
> error = do_epoll_wait(epfd, events, maxevents, timeout);
> update_xxx(error == -EINTR);
>
> return error;
> }
>
> Oleg.
> ---
>
> fs/aio.c | 40 ++++++++++++++---------
> fs/eventpoll.c | 12 +++----
> fs/io_uring.c | 12 +++----
> fs/select.c | 40 +++++++----------------
> include/linux/compat.h | 4 +--
> include/linux/sched/signal.h | 2 --
> include/linux/signal.h | 6 ++--
> kernel/signal.c | 77 ++++++++++++++++----------------------------
> 8 files changed, 74 insertions(+), 119 deletions(-)
>
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 3490d1f..8315bd2 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -2093,8 +2093,8 @@ SYSCALL_DEFINE6(io_pgetevents,
> const struct __aio_sigset __user *, usig)
> {
> struct __aio_sigset ksig = { NULL, };
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts;
> + bool interrupted;
> int ret;
>
> if (timeout && unlikely(get_timespec64(&ts, timeout)))
> @@ -2103,13 +2103,15 @@ SYSCALL_DEFINE6(io_pgetevents,
I already admitted I was wrong about this. And the behavior you point out
is consistent with man page. I'm not sure why we are not understanding that
I'm agreeing with you:
https://lore.kernel.org/lkml/CABeXuvouBzZuNarmNcd9JgZgvonL1N_p21gat=O_x0-1hMx=6A@mail.gmail.com/
This was true until 854a6ed56839a("signal: Add restore_user_sigmask()"),
now this is broken by the signal_pending() check in restore_user_sigmask().
This patch is wrong because I did not know about the above behavior before.
I also admitted to this. And proposed another way to revert the patch.
This patch
https://lore.kernel.org/lkml/[email protected]/
turns 0 into -EINTR if signal_pending(), but I think we should simply
restore the old behaviour and simplify the code.
I agree that the patch that turns 0 to -EINTR should not be applied.
It was my misunderstanding.
The part I'm not sure is that I cannot find anything that is wrong with
854a6ed56839a("signal: Add restore_user_sigmask()") that was not already before.
But, that is ok, we can do the revert and I will investigate with the
test application if I can find something.
-Deepa
> if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
> return -EFAULT;
>
> - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + update_xxx(interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2126,8 +2128,8 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
> const struct __aio_sigset __user *, usig)
> {
> struct __aio_sigset ksig = { NULL, };
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts;
> + bool interrupted;
> int ret;
>
> if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
> @@ -2137,13 +2139,15 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
> return -EFAULT;
>
>
> - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + update_xxx(interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2191,8 +2195,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
> const struct __compat_aio_sigset __user *, usig)
> {
> struct __compat_aio_sigset ksig = { NULL, };
> - sigset_t ksigmask, sigsaved;
> struct timespec64 t;
> + bool interrupted;
> int ret;
>
> if (timeout && get_old_timespec32(&t, timeout))
> @@ -2201,13 +2205,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
> if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
> return -EFAULT;
>
> - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + update_xxx(interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2224,8 +2230,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
> const struct __compat_aio_sigset __user *, usig)
> {
> struct __compat_aio_sigset ksig = { NULL, };
> - sigset_t ksigmask, sigsaved;
> struct timespec64 t;
> + bool interrupted;
> int ret;
>
> if (timeout && get_timespec64(&t, timeout))
> @@ -2234,13 +2240,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
> if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
> return -EFAULT;
>
> - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + update_xxx(interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4a0e98d..0b1a337 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2318,19 +2318,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> size_t, sigsetsize)
> {
> int error;
> - sigset_t ksigmask, sigsaved;
>
> /*
> * If the caller wants a certain signal mask to be set during the wait,
> * we apply it here.
> */
> - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + error = set_xxx(sigmask, sigsetsize);
> if (error)
> return error;
>
> error = do_epoll_wait(epfd, events, maxevents, timeout);
> -
> - restore_user_sigmask(sigmask, &sigsaved);
> + update_xxx(error == -EINTR);
>
> return error;
> }
> @@ -2343,19 +2341,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
> compat_size_t, sigsetsize)
> {
> long err;
> - sigset_t ksigmask, sigsaved;
>
> /*
> * If the caller wants a certain signal mask to be set during the wait,
> * we apply it here.
> */
> - err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + err = set_compat_xxx(sigmask, sigsetsize);
> if (err)
> return err;
>
> err = do_epoll_wait(epfd, events, maxevents, timeout);
> -
> - restore_user_sigmask(sigmask, &sigsaved);
> + update_xxx(err == -EINTR);
>
> return err;
> }
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 310f8d1..b5b99e2 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2180,7 +2180,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> const sigset_t __user *sig, size_t sigsz)
> {
> struct io_cq_ring *ring = ctx->cq_ring;
> - sigset_t ksigmask, sigsaved;
> int ret;
>
> if (io_cqring_events(ring) >= min_events)
> @@ -2189,24 +2188,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> if (sig) {
> #ifdef CONFIG_COMPAT
> if (in_compat_syscall())
> - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
> - &ksigmask, &sigsaved, sigsz);
> + ret = set_compat_xxx((const compat_sigset_t __user *)sig,
> + sigsz);
> else
> #endif
> - ret = set_user_sigmask(sig, &ksigmask,
> - &sigsaved, sigsz);
> + ret = set_xxx(sig, sigsz);
>
> if (ret)
> return ret;
> }
>
> ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
> + update_xxx(ret == -ERESTARTSYS);
> if (ret == -ERESTARTSYS)
> ret = -EINTR;
>
> - if (sig)
> - restore_user_sigmask(sig, &sigsaved);
> -
> return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
> }
>
> diff --git a/fs/select.c b/fs/select.c
> index 6cbc9ff..7eab132 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
> const sigset_t __user *sigmask, size_t sigsetsize,
> enum poll_time_type type)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -753,15 +752,14 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
> return -EINVAL;
> }
>
> - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = core_sys_select(n, inp, outp, exp, to);
> + update_xxx(ret == -ERESTARTNOHAND);
> ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> return ret;
> }
>
> @@ -1087,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
> struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
> size_t, sigsetsize)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -1100,18 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
> return -EINVAL;
> }
>
> - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + update_xxx(ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> -
> ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
>
> return ret;
> @@ -1123,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
> struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
> size_t, sigsetsize)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -1136,18 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
> return -EINVAL;
> }
>
> - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + update_xxx(ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> -
> ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
>
> return ret;
> @@ -1322,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
> void __user *tsp, compat_sigset_t __user *sigmask,
> compat_size_t sigsetsize, enum poll_time_type type)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -1345,15 +1336,14 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
> return -EINVAL;
> }
>
> - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_compat_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = compat_core_sys_select(n, inp, outp, exp, to);
> + update_xxx(ret == -ERESTARTNOHAND);
> ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> return ret;
> }
>
> @@ -1406,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
> unsigned int, nfds, struct old_timespec32 __user *, tsp,
> const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -1419,18 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
> return -EINVAL;
> }
>
> - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_compat_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + update_xxx(ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> -
> ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
>
> return ret;
> @@ -1442,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
> unsigned int, nfds, struct __kernel_timespec __user *, tsp,
> const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
> {
> - sigset_t ksigmask, sigsaved;
> struct timespec64 ts, end_time, *to = NULL;
> int ret;
>
> @@ -1455,18 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
> return -EINVAL;
> }
>
> - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
> + ret = set_compat_xxx(sigmask, sigsetsize);
> if (ret)
> return ret;
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + update_xxx(ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> -
> ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
>
> return ret;
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index ebddcb6..b20b001 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -138,9 +138,7 @@ typedef struct {
> compat_sigset_word sig[_COMPAT_NSIG_WORDS];
> } compat_sigset_t;
>
> -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
> - sigset_t *set, sigset_t *oldset,
> - size_t sigsetsize);
> +int set_compat_xxx(const compat_sigset_t __user *umask, size_t sigsetsize);
>
> struct compat_sigaction {
> #ifndef __ARCH_HAS_IRIX_SIGACTION
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 38a0f07..8b5b537 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -417,7 +417,6 @@ void task_join_group_stop(struct task_struct *task);
> static inline void set_restore_sigmask(void)
> {
> set_thread_flag(TIF_RESTORE_SIGMASK);
> - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> }
>
> static inline void clear_tsk_restore_sigmask(struct task_struct *task)
> @@ -448,7 +447,6 @@ static inline bool test_and_clear_restore_sigmask(void)
> static inline void set_restore_sigmask(void)
> {
> current->restore_sigmask = true;
> - WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> }
> static inline void clear_tsk_restore_sigmask(struct task_struct *task)
> {
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 9702016..65f84ac 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -273,10 +273,8 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
> struct task_struct *p, enum pid_type type);
> extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> extern int sigprocmask(int, sigset_t *, sigset_t *);
> -extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
> - sigset_t *oldset, size_t sigsetsize);
> -extern void restore_user_sigmask(const void __user *usigmask,
> - sigset_t *sigsaved);
> +extern int set_xxx(const sigset_t __user *umask, size_t sigsetsize);
> +extern void update_xxx(bool interupted);
> extern void set_current_blocked(sigset_t *);
> extern void __set_current_blocked(const sigset_t *);
> extern int show_unhandled_signals;
> diff --git a/kernel/signal.c b/kernel/signal.c
> index d7b9d14..0a1ec68 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2861,79 +2861,56 @@ EXPORT_SYMBOL(sigprocmask);
> *
> * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
> * epoll_pwait where a new sigmask is passed from userland for the syscalls.
> + *
> + * Note that it does set_restore_sigmask() in advance, so it must be always
> + * paired with update_xxx() before return from syscall.
> */
> -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
> - sigset_t *oldset, size_t sigsetsize)
> +int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
> {
> - if (!usigmask)
> - return 0;
> + sigset_t *kmask;
>
> + if (!umask)
> + return 0;
> if (sigsetsize != sizeof(sigset_t))
> return -EINVAL;
> - if (copy_from_user(set, usigmask, sizeof(sigset_t)))
> + if (copy_from_user(kmask, umask, sizeof(sigset_t)))
> return -EFAULT;
>
> - *oldset = current->blocked;
> - set_current_blocked(set);
> + set_restore_sigmask();
> + current->saved_sigmask = current->blocked;
> + set_current_blocked(kmask);
>
> return 0;
> }
> -EXPORT_SYMBOL(set_user_sigmask);
> +
> +void update_xxx(bool interrupted)
> +{
> + if (interrupted)
> + WARN_ON(!test_thread_flag(TIF_SIGPENDING));
> + else
> + restore_saved_sigmask();
> +}
>
> #ifdef CONFIG_COMPAT
> -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
> - sigset_t *set, sigset_t *oldset,
> - size_t sigsetsize)
> +int set_compat_xxx(const compat_sigset_t __user *umask, size_t sigsetsize)
> {
> - if (!usigmask)
> - return 0;
> + sigset_t *kmask;
>
> + if (!umask)
> + return 0;
> if (sigsetsize != sizeof(compat_sigset_t))
> return -EINVAL;
> - if (get_compat_sigset(set, usigmask))
> + if (get_compat_sigset(kmask, umask))
> return -EFAULT;
>
> - *oldset = current->blocked;
> - set_current_blocked(set);
> + set_restore_sigmask();
> + current->saved_sigmask = current->blocked;
> + set_current_blocked(kmask);
>
> return 0;
> }
> -EXPORT_SYMBOL(set_compat_user_sigmask);
> #endif
>
> -/*
> - * restore_user_sigmask:
> - * usigmask: sigmask passed in from userland.
> - * sigsaved: saved sigmask when the syscall started and changed the sigmask to
> - * usigmask.
> - *
> - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
> - * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
> - */
> -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> -{
> -
> - if (!usigmask)
> - return;
> - /*
> - * When signals are pending, do not restore them here.
> - * Restoring sigmask here can lead to delivering signals that the above
> - * syscalls are intended to block because of the sigmask passed in.
> - */
> - if (signal_pending(current)) {
> - current->saved_sigmask = *sigsaved;
> - set_restore_sigmask();
> - return;
> - }
> -
> - /*
> - * This is needed because the fast syscall return path does not restore
> - * saved_sigmask when signals are not pending.
> - */
> - set_current_blocked(sigsaved);
> -}
> -EXPORT_SYMBOL(restore_user_sigmask);
> -
> /**
> * sys_rt_sigprocmask - change the list of currently blocked signals
> * @how: whether to add, remove, or set signals
>
On 05/28, Deepa Dinamani wrote:
>
> I agree that signal handller being called and return value not being
> altered is an issue with other syscalls also. I was just wondering if
> some userspace code assumption would be assuming this. This is not a
> kernel bug.
>
> But, I do not think we have an understanding of what was wrong in
> 854a6ed56839a anymore since you pointed out that my assumption was not
> correct that the signal handler being called without errno being set
> is wrong.
Deepa, sorry, I simply can't parse the above... most probably because of
my bad English.
> One open question: this part of epoll_pwait was already broken before
> 854a6ed56839a. Do you agree?
>
> if (err == -EINTR) {
> memcpy(¤t->saved_sigmask, &sigsaved,
> sizeof(sigsaved));
> set_restore_sigmask();
> } else
> set_current_blocked(&sigsaved);
I do not understand why do you think this part was broken :/
> Or, I could revert the signal_pending() check and provide a fix
> something like below(not a complete patch)
...
> -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> +int restore_user_sigmask(const void __user *usigmask, sigset_t
> *sigsaved, int sig_pending)
> {
>
> if (!usigmask)
> return;
>
> /*
> * When signals are pending, do not restore them here.
> * Restoring sigmask here can lead to delivering signals that the above
> * syscalls are intended to block because of the sigmask passed in.
> */
> + if (sig_pending) {
> current->saved_sigmask = *sigsaved;
> set_restore_sigmask();
> return;
> }
>
> @@ -2330,7 +2330,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> epoll_event __user *, events,
>
> error = do_epoll_wait(epfd, events, maxevents, timeout);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> + signal_detected = restore_user_sigmask(sigmask, &sigsaved,
> error == -EINTR);
I fail to understand this pseudo-code, sorry. In particular, do not understand
why restore_user_sigmask() needs to return a boolean.
The only thing I _seem to_ understand is the "sig_pending" flag passed by the
caller which replaces the signal_pending() check. Yes, this is what I think we
should do, and this is what I tried to propose from the very beginning in my
1st email in this thread.
Oleg.
On Wed, May 29, 2019 at 9:12 AM Oleg Nesterov <[email protected]> wrote:
>
> Al, Linus, Eric, please help.
>
> The previous discussion was very confusing, we simply can not understand each
> other.
>
> To me everything looks very simple and clear, but perhaps I missed something
> obvious? Please correct me.
>
> I think that the following code is correct
>
> int interrupted = 0;
>
> void sigint_handler(int sig)
> {
> interrupted = 1;
> }
>
> int main(void)
> {
> sigset_t sigint, empty;
>
> sigemptyset(&sigint);
> sigaddset(&sigint, SIGINT);
> sigprocmask(SIG_BLOCK, &sigint, NULL);
>
> signal(SIGINT, sigint_handler);
>
> sigemptyset(&empty); // so pselect() unblocks SIGINT
>
> ret = pselect(..., &empty);
>
> if (ret >= 0) // sucess or timeout
> assert(!interrupted);
>
> if (interrupted)
> assert(ret == -EINTR);
> }
>
> IOW, if pselect(sigmask) temporary unblocks SIGINT according to sigmask, this
> signal should not be delivered if a ready fd was found or timeout. The signal
> handle should only run if ret == -EINTR.
I do not think we discussed this part earlier. But, if this is true
then this is what is wrong as part of 854a6ed56839a. I missed that
before.
> (pselect() can be interrupted by any other signal which has a handler. In this
> case the handler can be called even if ret >= 0. This is correct, I fail to
> understand why some people think this is wrong, and in any case we simply can't
> avoid this).
This patch is wrong because I did not know that it was ok to deliver a
signal and not set the errno before. I also admitted to this. And
proposed another way to revert the patch.:
https://lore.kernel.org/lkml/CABeXuvouBzZuNarmNcd9JgZgvonL1N_p21gat=O_x0-1hMx=6A@mail.gmail.com/
-Deepa
On Wed, May 29, 2019 at 9:57 AM Oleg Nesterov <[email protected]> wrote:
>
> On 05/28, Deepa Dinamani wrote:
> >
> > I agree that signal handller being called and return value not being
> > altered is an issue with other syscalls also. I was just wondering if
> > some userspace code assumption would be assuming this. This is not a
> > kernel bug.
> >
> > But, I do not think we have an understanding of what was wrong in
> > 854a6ed56839a anymore since you pointed out that my assumption was not
> > correct that the signal handler being called without errno being set
> > is wrong.
>
> Deepa, sorry, I simply can't parse the above... most probably because of
> my bad English.
Ok, All I meant was that I had thought a signal handler being invoked
without the error value reflecting it was wrong. That is what I had
thought was wrong with 854a6ed56839a. Now, that we agree that signal
handler can be invoked without the errno returning success, I thought
I did not know what is wrong with 854a6ed56839a anymore.
But, you now pointed out that the signals we care about should not be
delivered after an event has been ready. This points out to what was
wrong with 854a6ed56839a. Thanks.
> > One open question: this part of epoll_pwait was already broken before
> > 854a6ed56839a. Do you agree?
> >
> > if (err == -EINTR) {
> > memcpy(¤t->saved_sigmask, &sigsaved,
> > sizeof(sigsaved));
> > set_restore_sigmask();
> > } else
> > set_current_blocked(&sigsaved);
>
> I do not understand why do you think this part was broken :/
Ok, because of your other statement that the signals the application
cares about do not want to know about signals they care about after an
event is ready this is also not a problem.
> > Or, I could revert the signal_pending() check and provide a fix
> > something like below(not a complete patch)
>
> ...
>
> > -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> > +int restore_user_sigmask(const void __user *usigmask, sigset_t
> > *sigsaved, int sig_pending)
> > {
> >
> > if (!usigmask)
> > return;
> >
> > /*
> > * When signals are pending, do not restore them here.
> > * Restoring sigmask here can lead to delivering signals that the above
> > * syscalls are intended to block because of the sigmask passed in.
> > */
> > + if (sig_pending) {
> > current->saved_sigmask = *sigsaved;
> > set_restore_sigmask();
> > return;
> > }
> >
> > @@ -2330,7 +2330,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> > epoll_event __user *, events,
> >
> > error = do_epoll_wait(epfd, events, maxevents, timeout);
> >
> > - restore_user_sigmask(sigmask, &sigsaved);
> > + signal_detected = restore_user_sigmask(sigmask, &sigsaved,
> > error == -EINTR);
>
> I fail to understand this pseudo-code, sorry. In particular, do not understand
> why restore_user_sigmask() needs to return a boolean.
That was a remnant from the other patch. Return type needs to be void.
> The only thing I _seem to_ understand is the "sig_pending" flag passed by the
> caller which replaces the signal_pending() check.
Correct. This is what is the main change I was proposing.
> Yes, this is what I think we
> should do, and this is what I tried to propose from the very beginning in my
> 1st email in this thread.
This was not clear to me in your first response that you did not want
the signal_pending() check in restore_user_sigmask(). :
https://lore.kernel.org/lkml/[email protected]/
"Ugh. I need to re-check, but at first glance I really dislike this change.
I think we can fix the problem _and_ simplify the code. Something like below.
The patch is obviously incomplete, it changes only only one caller of
set_user_sigmask(), epoll_pwait() to explain what I mean.
restore_user_sigmask() should simply die. Although perhaps another helper
makes sense to add WARN_ON(test_tsk_restore_sigmask() && !signal_pending)."
-Deepa
David Laight <[email protected]> wrote:
> From: Oleg Nesterov
> > Sent: 29 May 2019 17:12
> > Al, Linus, Eric, please help.
> >
> > The previous discussion was very confusing, we simply can not understand each
> > other.
> >
> > To me everything looks very simple and clear, but perhaps I missed something
> > obvious? Please correct me.
> >
> > I think that the following code is correct
> >
> > int interrupted = 0;
> >
> > void sigint_handler(int sig)
> > {
> > interrupted = 1;
> > }
> >
> > int main(void)
> > {
> > sigset_t sigint, empty;
> >
> > sigemptyset(&sigint);
> > sigaddset(&sigint, SIGINT);
> > sigprocmask(SIG_BLOCK, &sigint, NULL);
> >
> > signal(SIGINT, sigint_handler);
> >
> > sigemptyset(&empty); // so pselect() unblocks SIGINT
> >
> > ret = pselect(..., &empty);
> ^^^^^ sigint
> >
> > if (ret >= 0) // sucess or timeout
> > assert(!interrupted);
> >
> > if (interrupted)
> > assert(ret == -EINTR);
> > }
> >
> > IOW, if pselect(sigmask) temporary unblocks SIGINT according to sigmask, this
> > signal should not be delivered if a ready fd was found or timeout. The signal
> > handle should only run if ret == -EINTR.
>
> Personally I think that is wrong.
> Given code like the above that has:
> while (!interrupted) {
> pselect(..., &sigint);
> // process available data
> }
>
> You want the signal handler to be executed even if one of the fds
> always has available data.
> Otherwise you can't interrupt a process that is always busy.
Agreed... I believe cmogstored has always had a bug in the way
it uses epoll_pwait because it failed to check interrupts if:
a) an FD is ready + interrupt
b) epoll_pwait returns 0 on interrupt
The bug remains in userspace for a), which I will fix by adding
an interrupt check when an FD is ready. The window is very
small for a) and difficult to trigger, and also in a rare code
path.
The b) case is the kernel bug introduced in 854a6ed56839a40f
("signal: Add restore_user_sigmask()").
I don't think there's any disagreement that b) is a kernel bug.
So the confusion is for a), and POSIX is not clear w.r.t. how
pselect/poll works when there's both FD readiness and an
interrupt.
Thus I'm inclined to believe *select/*poll/epoll_*wait should
follow POSIX read() semantics:
If a read() is interrupted by a signal before it reads any data, it shall
return −1 with errno set to [EINTR].
If a read() is interrupted by a signal after it has successfully read
some data, it shall return the number of bytes read.
> One option is to return -EINTR if a signal is pending when the mask
> is updated - before even looking at anything else.
>
> Signals that happen later on (eg after a timeout) need not be reported
> (until the next time around the loop).
I'm not sure that's necessary and it would cause delays in
signal handling.
On Wed, May 29, 2019 at 6:12 PM Oleg Nesterov <[email protected]> wrote:
>
> Al, Linus, Eric, please help.
>
> The previous discussion was very confusing, we simply can not understand each
> other.
>
> To me everything looks very simple and clear, but perhaps I missed something
> obvious? Please correct me.
Thanks for the elaborate explanation in this patch, it all starts making sense
to me now. I also looked at your patch in detail and thought I had found
a few mistakes at first but those all turned out to be mistakes in my reading.
> See the compile-tested patch at the end. Of course, the new _xxx() helpers
> should be renamed somehow. fs/aio.c doesn't look right with or without this
> patch, but iiuc this is what it did before 854a6ed56839a.
I think this is a nice simplification, but it would help not to mix up the
minimal regression fix with the rewrite of those functions. For the stable
kernels, I think we want just the addition of the 'bool interrupted' argument
to restore_user_sigmask() to close the race that was introduced
854a6ed56839a. Following up on that for future kernels, your patch
improves the readability, but we can probably take it even further.
> - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + update_xxx(interrupted);
Maybe name this
restore_saved_sigmask_if(!interrupted);
and make restore_saved_sigmask_if() an inline function
next to restore_saved_sigmask()?
> @@ -2201,13 +2205,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
> if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
> return -EFAULT;
>
> - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> + ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
> if (ret)
> return ret;
With some of the recent discussions about compat syscall handling,
I now think that we want to just fold set_compat_user_sigmask()
into set_user_sigmask() (whatever they get called in the end)
with an in_compat_syscall() conditional inside it, and completely get
rid of the COMPAT_SYSCALL_DEFINEx() definitions for those
system calls for which this is the only difference.
Unfortunately we still need the time32/time64 distinction, but removing
syscall handlers is a significant cleanup here already, and we can
move most of the function body of sys_io_pgetevents() into
do_io_getevents() in the process. Same for some of the other calls.
Not sure about the order of the cleanups, but probably something like
this would work:
1. fix the race (to be backported)
2. unify set_compat_user_sigmask/set_user_sigmask
3. remove unneeded compat handlers
4. replace restore_user_sigmask with restore_saved_sigmask_if()
5. also unify compat_get_fd_set()/get_fd_set() and kill off
compat select() variants.
Arnd
Arnd Bergmann <[email protected]> writes:
> On Wed, May 29, 2019 at 6:12 PM Oleg Nesterov <[email protected]> wrote:
>>
>> Al, Linus, Eric, please help.
>>
>> The previous discussion was very confusing, we simply can not understand each
>> other.
>>
>> To me everything looks very simple and clear, but perhaps I missed something
>> obvious? Please correct me.
>
> Thanks for the elaborate explanation in this patch, it all starts making sense
> to me now. I also looked at your patch in detail and thought I had found
> a few mistakes at first but those all turned out to be mistakes in my reading.
>
>> See the compile-tested patch at the end. Of course, the new _xxx() helpers
>> should be renamed somehow. fs/aio.c doesn't look right with or without this
>> patch, but iiuc this is what it did before 854a6ed56839a.
>
> I think this is a nice simplification, but it would help not to mix up the
> minimal regression fix with the rewrite of those functions. For the stable
> kernels, I think we want just the addition of the 'bool interrupted' argument
> to restore_user_sigmask() to close the race that was introduced
> 854a6ed56839a. Following up on that for future kernels, your patch
> improves the readability, but we can probably take it even further.
>
>> - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
>> + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
>> if (ret)
>> return ret;
>>
>> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
>> - restore_user_sigmask(ksig.sigmask, &sigsaved);
>> - if (signal_pending(current) && !ret)
>> +
>> + interrupted = signal_pending(current);
>> + update_xxx(interrupted);
>
> Maybe name this
>
> restore_saved_sigmask_if(!interrupted);
>
> and make restore_saved_sigmask_if() an inline function
> next to restore_saved_sigmask()?
>
>> @@ -2201,13 +2205,15 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
>> if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
>> return -EFAULT;
>>
>> - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
>> + ret = set_compat_xxx(ksig.sigmask, ksig.sigsetsize);
>> if (ret)
>> return ret;
>
> With some of the recent discussions about compat syscall handling,
> I now think that we want to just fold set_compat_user_sigmask()
> into set_user_sigmask() (whatever they get called in the end)
> with an in_compat_syscall() conditional inside it, and completely get
> rid of the COMPAT_SYSCALL_DEFINEx() definitions for those
> system calls for which this is the only difference.
>
> Unfortunately we still need the time32/time64 distinction, but removing
> syscall handlers is a significant cleanup here already, and we can
> move most of the function body of sys_io_pgetevents() into
> do_io_getevents() in the process. Same for some of the other calls.
>
> Not sure about the order of the cleanups, but probably something like
> this would work:
>
> 1. fix the race (to be backported)
> 2. unify set_compat_user_sigmask/set_user_sigmask
> 3. remove unneeded compat handlers
> 4. replace restore_user_sigmask with restore_saved_sigmask_if()
> 5. also unify compat_get_fd_set()/get_fd_set() and kill off
> compat select() variants.
Are new system calls added preventing a revert of the patch in question
for stable kernels?
Eric
From: Eric Wong
> Sent: 29 May 2019 19:50
...
> > Personally I think that is wrong.
> > Given code like the above that has:
> > while (!interrupted) {
> > pselect(..., &sigint);
> > // process available data
> > }
> >
> > You want the signal handler to be executed even if one of the fds
> > always has available data.
> > Otherwise you can't interrupt a process that is always busy.
FWIW in the past I've seen a process that loops select-accept-fork-exec
get its priority reduced to the point where it never blocked
in select. The client side retries made it go badly wrong!
If it had limited when SIG_INT was detected it would have been
a little more difficult to kill.
> Agreed... I believe cmogstored has always had a bug in the way
> it uses epoll_pwait because it failed to check interrupts if:
>
> a) an FD is ready + interrupt
> b) epoll_pwait returns 0 on interrupt
But the kernel code seems to only call the signal handler
(for signals that are enabled during pselect() (etc)) if
the underlying wait is interrupted.
> The bug remains in userspace for a), which I will fix by adding
> an interrupt check when an FD is ready. The window is very
> small for a) and difficult to trigger, and also in a rare code
> path.
>
> The b) case is the kernel bug introduced in 854a6ed56839a40f
> ("signal: Add restore_user_sigmask()").
>
> I don't think there's any disagreement that b) is a kernel bug.
If the signal is raised after the wait has timed out but before
the signal mask is restored.
> So the confusion is for a), and POSIX is not clear w.r.t. how
> pselect/poll works when there's both FD readiness and an
> interrupt.
>
> Thus I'm inclined to believe *select/*poll/epoll_*wait should
> follow POSIX read() semantics:
>
> If a read() is interrupted by a signal before it reads any data, it shall
> return −1 with errno set to [EINTR].
>
> If a read() is interrupted by a signal after it has successfully read
> some data, it shall return the number of bytes read.
Except that above you want different semantics :-)
For read() any signal handler is always called.
And the return value of a non-blocking read that returns no data
is not affected by any pending signals.
For pselect() that would mean that if the wait timed out (result 0)
and then a signal was raised (before the mask got changed back) then
the return value would be zero and the signal handler would be called.
So your (b) above is not a bug.
Even select() can return 'timeout' and have a signal handler called.
There are really two separate issues:
1) Signals that are pending when the new mask is applied.
2) Signals that are raised after the wait terminates (success or timeout).
If the signal handlers for (2) are not called then they become (1)
next time around the application loop.
Maybe the 'restore sigmask' function should be passed an indication
of whether it is allowed to let signal handler be called and return
whether they would be called (ie whether it restored the signal mask
or left it for the syscall exit code to do after calling the signal
handlers).
That would allow epoll() to convert timeout+pending signal to EINTR,
or to allow all handlers be called regardless of the return value.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Oleg Nesterov <[email protected]> writes:
> Al, Linus, Eric, please help.
>
> The previous discussion was very confusing, we simply can not understand each
> other.
>
> To me everything looks very simple and clear, but perhaps I missed something
> obvious? Please correct me.
If I have read this thread correctly the core issue is that ther is a
program that used to work and that fails now. The question is why.
There are two ways the semantics for a sigmask changing system call
can be defined. The naive way and by reading posix. I will pick
on pselect.
The naive way:
int pselect(int nfds, fd_set *readfds, fd_set *writefds,
fd_set *exceptfds, const struct timespec *timeout,
const sigset_t *sigmask)
{
sigset_t oldmask;
int ret;
if (sigmask)
sigprocmask(SIG_SETMASK, sigmask, &oldmask);
ret = select(nfds, readfds, writefds, exceptfds, timeout);
if (sigmask)
sigprocmask(SIG_SETMASK, &oldmask, NULL);
return ret;
}
The standard for pselect behavior at
https://pubs.opengroup.org/onlinepubs/009695399/functions/pselect.html
says:
> If sigmask is not a null pointer, then the pselect() function shall
> replace the signal mask of the caller by the set of signals pointed to
> by sigmask before examining the descriptors, and shall restore the
> signal mask of the calling thread before returning.
...
> On failure, the objects pointed to by the readfds, writefds, and
> errorfds arguments shall not be modified. If the timeout interval
> expires without the specified condition being true for any of the
> specified file descriptors, the objects pointed to by the readfds,
> writefds, and errorfds arguments shall have all bits set to 0.
So the standard specified behavior is if the return value is -EINTR you
know you were interrupted by a signal, for any other return value you
don't know.
An error value just indicates that the file descriptor sets have not
been updated.
That is what the standard and reasonable people will expect from the
system call. Looking at set_user_sigmask and restore_user_sigmask
I believe we are don't violate those semantics today.
Which means I believe we have a semantically valid change in behavior
that is causing a regression.
From my inspection of the code the change in behavior is from these
two pieces of code:
From the v4.18 epoll_pwait.
/*
* If we changed the signal mask, we need to restore the original one.
* In case we've got a signal while waiting, we do not restore the
* signal mask yet, and we allow do_signal() to deliver the signal on
* the way back to userspace, before the signal mask is restored.
*/
if (sigmask) {
if (error == -EINTR) {
memcpy(¤t->saved_sigmask, &sigsaved,
sizeof(sigsaved));
set_restore_sigmask();
} else
set_current_blocked(&sigsaved);
}
/*
* restore_user_sigmask:
* usigmask: sigmask passed in from userland.
* sigsaved: saved sigmask when the syscall started and changed the sigmask to
* usigmask.
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed in from userland for the syscalls.
*/
void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
{
if (!usigmask)
return;
/*
* When signals are pending, do not restore them here.
* Restoring sigmask here can lead to delivering signals that the above
* syscalls are intended to block because of the sigmask passed in.
*/
if (signal_pending(current)) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
}
/*
* This is needed because the fast syscall return path does not restore
* saved_sigmask when signals are not pending.
*/
set_current_blocked(sigsaved);
}
Which has been reported results in a return value of 0, and a signal
delivered, where previously that did not happen.
They only way I can see that happening is that set_current_blocked in
__set_task_blocked clears pending signals (that will be blocked) and
calls retarget_shared_pending.
Frankly the only reason this appears to be worth touching is that we
have a userspace regression. Otherwise I would call the current
behavior more correct and desirable than ignoring the signal longer.
If I am reading sitaution properly I suggest we go back to resoring the
sigmask by hand in epoll_pwait, and put in a big fat comment about a
silly userspace program depending on that behavior.
That will be easier to backport and it will just fix the regression and
not overfix the problem for reasonable applications.
Oleg your cleanup also seems reasonable. But I would like to make
certain we understand and fix the regression first. We seem to be
talking very small windows for both cases.
Eric
Eric Wong <[email protected]> writes:
> Agreed... I believe cmogstored has always had a bug in the way
> it uses epoll_pwait because it failed to check interrupts if:
>
> a) an FD is ready + interrupt
> b) epoll_pwait returns 0 on interrupt
>
> The bug remains in userspace for a), which I will fix by adding
> an interrupt check when an FD is ready. The window is very
> small for a) and difficult to trigger, and also in a rare code
> path.
>
> The b) case is the kernel bug introduced in 854a6ed56839a40f
> ("signal: Add restore_user_sigmask()").
>
> I don't think there's any disagreement that b) is a kernel bug.
See my reply to Oleg. I think (b) is a regression that needs to be
fixed. I do not think that (b) is a kernel bug. Both versions of the
of what sigmask means posix and naive will allow (b).
Because fundamentally the sigmask is restored after the rest of the
system call happens.
Eric
On 05/30, Arnd Bergmann wrote:
>
> I think this is a nice simplification, but it would help not to mix up the
> minimal regression fix with the rewrite of those functions.
Yes, yes, agreed.
Plus every file touched by this patch asks for more cleanups. Say, do_poll()
should return -ERESTARTNOHAND, not -EINTR, after that we can remove the ugly
EINTR->ERESTARTNOHAND in its callers. And more.
> For the stable
> kernels, I think we want just the addition of the 'bool interrupted' argument
> to restore_user_sigmask()
or simply revert this patch. I will check if this is possible today... At first
glance 854a6ed56839a40f6 fixed another bug by accident, do_pselect() did
"ret == -ERESTARTNOHAND" after "ret = poll_select_copy_remaining()" which can
turn ERESTARTNOHAND into EINTR, but this is simple. I'll check tomorrow.
> > - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> > + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
> > if (ret)
> > return ret;
> >
> > ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> > - restore_user_sigmask(ksig.sigmask, &sigsaved);
> > - if (signal_pending(current) && !ret)
> > +
> > + interrupted = signal_pending(current);
> > + update_xxx(interrupted);
>
> Maybe name this
>
> restore_saved_sigmask_if(!interrupted);
Yes, I thought about restore_if(), but to me
restore_saved_sigmask_if(ret != -EINTR);
doesn't look readable... May be
restore_saved_sigmask_unless(ret == -EINTR);
? but actually I agree with any naming.
> and make restore_saved_sigmask_if() an inline function
> next to restore_saved_sigmask()?
agreed,
> With some of the recent discussions about compat syscall handling,
> I now think that we want to just fold set_compat_user_sigmask()
> into set_user_sigmask()
agreed, and I thought about this too. But again, I'd prefer to do this
and other cleanups later, on top of this patch.
Oleg.
From: Eric W. Biederman
> Sent: 30 May 2019 14:01
> Oleg Nesterov <[email protected]> writes:
>
> > Al, Linus, Eric, please help.
> >
> > The previous discussion was very confusing, we simply can not understand each
> > other.
> >
> > To me everything looks very simple and clear, but perhaps I missed something
> > obvious? Please correct me.
>
> If I have read this thread correctly the core issue is that ther is a
> program that used to work and that fails now. The question is why.
>
> There are two ways the semantics for a sigmask changing system call
> can be defined. The naive way and by reading posix. I will pick
> on pselect.
>
> The naive way:
> int pselect(int nfds, fd_set *readfds, fd_set *writefds,
> fd_set *exceptfds, const struct timespec *timeout,
> const sigset_t *sigmask)
> {
> sigset_t oldmask;
> int ret;
>
> if (sigmask)
> sigprocmask(SIG_SETMASK, sigmask, &oldmask);
>
> ret = select(nfds, readfds, writefds, exceptfds, timeout);
>
> if (sigmask)
> sigprocmask(SIG_SETMASK, &oldmask, NULL);
>
> return ret;
> }
>
> The standard for pselect behavior at
> https://pubs.opengroup.org/onlinepubs/009695399/functions/pselect.html
> says:
> > If sigmask is not a null pointer, then the pselect() function shall
> > replace the signal mask of the caller by the set of signals pointed to
> > by sigmask before examining the descriptors, and shall restore the
> > signal mask of the calling thread before returning.
Right, but that bit says nothing about when signals are 'delivered'.
Section 2.4 of http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html
isn't very enlightening either - well, pretty impenetrable really.
But since the ToG specs don't require a user-kernel boundary I believe
that the signal handler is expected to be called as soon as it is unmasked.
Now, for async signals, the kernel can always pretend that they happened
slightly later than they actually did.
So signal delivery is delayed until 'return to user'.
In some cases system calls are restarted to make the whole thing transparent.
For pselect() etc I think this means:
1) Signal handlers can only be called if EINTR is returned.
2) If a signal is deliverable after the mask is changed
then the signal hander must be called.
This means EINTR must be returned.
ie this must be detected before checking anything else.
3) A signal that is raised after the wait completes can be
'deemed' to have happened after the mask is restored
and left masked until the applications next pselect() call.
4) As an optimisation a signal that arrives after the timer
expires, but before the mask is restored can be 'deemed'
to have happened before the timeout expired and EINTR
returned.
Now not all system calls that use a temporary mask may want to
work that way.
In particular they may want to return 'success' and have the
signal handlers called.
So maybe the set_xxx() function that installs to user's mask
should return:
-ve: errno value
0: no signal pending
1: signal pending.
And the update_xxx() function that restores the saved mask
should take a parameter to indicate whether signal handlers
should be called maybe 'bool call_handlers' and return 0/1
depending on whether signals were pending.
pselect() then contains:
rval = set_xxx();
if (rval) {
if (rval < 0)
return rval;
update_xxx(true);
return -EINTR:
}
rval = wait....();
#if 0 // don't report late signals
update_xxx(rval == -EINTR);
#else // report signals after timeout
if (update_xxx(!rval || rval == -EINTR))
rval == -EINTR;
#endif
return rval;
}
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
[email protected] (Eric W. Biederman) writes:
> Which means I believe we have a semantically valid change in behavior
> that is causing a regression.
I haven't made a survey of all of the functions yet but
fucntions return -ENORESTARTNOHAND will never return -EINTR and are
immune from this problem.
AKA pselect is fine. While epoll_pwait can be affected.
Has anyone contacted Omar Kilani to see if that is his issue?
https://lore.kernel.org/lkml/CA+8F9hicnF=kvjXPZFQy=Pa2HJUS3JS+G9VswFHNQQynPMHGVQ@mail.gmail.com/
So far the only regression report I am seeing is from Eric Wong.
AKA https://lore.kernel.org/lkml/20190501021405.hfvd7ps623liu25i@dcvr/
Are there any others? How did we get to be talking about more
than just epoll_pwait?
Eric
> On May 30, 2019, at 8:38 AM, Eric W. Biederman <[email protected]> wrote:
>
> [email protected] (Eric W. Biederman) writes:
>
>> Which means I believe we have a semantically valid change in behavior
>> that is causing a regression.
>
> I haven't made a survey of all of the functions yet but
> fucntions return -ENORESTARTNOHAND will never return -EINTR and are
> immune from this problem.
>
> AKA pselect is fine. While epoll_pwait can be affected.
This was my understanding as well.
> Has anyone contacted Omar Kilani to see if that is his issue?
> https://lore.kernel.org/lkml/CA+8F9hicnF=kvjXPZFQy=Pa2HJUS3JS+G9VswFHNQQynPMHGVQ@mail.gmail.com/
Omar was cc-ed when this regression was reported. I did cc him on fix
and asked if he could try it. We have not heard from him.
> So far the only regression report I am seeing is from Eric Wong.
> AKA https://lore.kernel.org/lkml/20190501021405.hfvd7ps623liu25i@dcvr/
> Are there any others? How did we get to be talking about more
> than just epoll_pwait?
This is the only report that I know of. I’m not sure why people
started talking about pselect. I was also confused why instead of
reviewing the patch and discussing the fix, we ended up talking about
how to simplify the code. We have deviated much from what should have
been a code review.
-Deepa
On 05/30, Eric W. Biederman wrote:
>
> Oleg Nesterov <[email protected]> writes:
>
> > Al, Linus, Eric, please help.
> >
> > The previous discussion was very confusing, we simply can not understand each
> > other.
> >
> > To me everything looks very simple and clear, but perhaps I missed something
> > obvious? Please correct me.
>
> If I have read this thread correctly the core issue is that ther is a
> program that used to work and that fails now. The question is why.
I didn't even try to investigate, I wasn't cc'ed initially and I then I had
enough confusion when I started to look at the patch.
However, 854a6ed56839a40f6 obviously introduced the user-visible change so
I am not surprised it breaks something. And yes, personally I think this
change is not right.
> Which means I believe we have a semantically valid change in behavior
> that is causing a regression.
See below,
> void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> {
>
> if (!usigmask)
> return;
> /*
> * When signals are pending, do not restore them here.
> * Restoring sigmask here can lead to delivering signals that the above
> * syscalls are intended to block because of the sigmask passed in.
> */
> if (signal_pending(current)) {
> current->saved_sigmask = *sigsaved;
> set_restore_sigmask();
> return;
> }
>
> /*
> * This is needed because the fast syscall return path does not restore
> * saved_sigmask when signals are not pending.
> */
> set_current_blocked(sigsaved);
> }
>
> Which has been reported results in a return value of 0,
0 or success
> and a signal
> delivered, where previously that did not happen.
Yes.
And to me this doesn't look right. OK, OK, probably this is because I never
read the docs, only the source code in fs/select.c. But to me pselect() should
either return success/timeout or deliver a signal. Not both. Even if the signal
was already pending before pselect() was called.
To clarify, "a signal" means a signal which was blocked before pselect(sigmask)
and temporary unblocked in this syscall.
And even if this doesn't violate posix, I see no reason to change the historic
behaviour. And this regression probably means we can't ;)
Oleg.
On 05/30, Eric W. Biederman wrote:
>
> [email protected] (Eric W. Biederman) writes:
>
> > Which means I believe we have a semantically valid change in behavior
> > that is causing a regression.
>
> I haven't made a survey of all of the functions yet but
> fucntions return -ENORESTARTNOHAND will never return -EINTR and are
> immune from this problem.
Hmm. handle_signal:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
break;
but I am not sure I understand which problem do you mean..
Oleg.
On 05/30, David Laight wrote:
>
> 4) As an optimisation a signal that arrives after the timer
> expires, but before the mask is restored can be 'deemed'
> to have happened before the timeout expired and EINTR
> returned.
This is what pselect/ppoll already does.
Oleg.
From: Eric W. Biederman
> Sent: 30 May 2019 16:38
> [email protected] (Eric W. Biederman) writes:
>
> > Which means I believe we have a semantically valid change in behavior
> > that is causing a regression.
>
> I haven't made a survey of all of the functions yet but
> fucntions return -ENORESTARTNOHAND will never return -EINTR and are
> immune from this problem.
Eh?
ERESTARTNOHAND just makes the system call restart if there is no
signal handler, EINTR should still be returned if there is a handler.
All the functions that have a temporary signal mask are likely
to be expected to work the same way and thus have the same bugs.
http://pubs.opengroup.org/onlinepubs/9699919799/functions/pselect.html#
isn't overly helpful.
But I think it should return EINTR even if there is no handler unless
SA_RESTART is set:
[EINTR]
The function was interrupted while blocked waiting for any of the selected descriptors to become ready and before the timeout interval expired.
If SA_RESTART has been set for the interrupting signal, it is implementation-defined whether the function restarts or returns with [EINTR].
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Thu, May 30, 2019 at 8:48 AM Deepa Dinamani <[email protected]> wrote:
>
> > On May 30, 2019, at 8:38 AM, Eric W. Biederman <[email protected]> wrote:
> >
> > [email protected] (Eric W. Biederman) writes:
> >
> >> Which means I believe we have a semantically valid change in behavior
> >> that is causing a regression.
> >
> > I haven't made a survey of all of the functions yet but
> > fucntions return -ENORESTARTNOHAND will never return -EINTR and are
> > immune from this problem.
> >
> > AKA pselect is fine. While epoll_pwait can be affected.
>
> This was my understanding as well.
I think I was misremembered here. I had noted this before:
https://lore.kernel.org/linux-fsdevel/CABeXuvq7gCV2qPOo+Q8jvNyRaTvhkRLRbnL_oJ-AuK7Sp=P3QQ@mail.gmail.com/
"sys_io_pgetevents() does not seem to have this problem as we are still
checking signal_pending() here.
sys_pselect6() seems to have a similar problem. The changes to
sys_pselect6() also impact sys_select() as the changes are in the
common code path."
This was the code replaced for io_pgetevents by 854a6ed56839a40f6b is as below.
No matter what events completed, there was signal_pending() check
after the return from do_io_getevents().
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2110,18 +2110,9 @@ SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- if (signal_pending(current)) {
- if (ksig.sigmask) {
- current->saved_sigmask = sigsaved;
- set_restore_sigmask();
- }
-
- if (!ret)
- ret = -ERESTARTNOHAND;
- } else {
- if (ksig.sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- }
+ restore_user_sigmask(ksig.sigmask, &sigsaved);
+ if (signal_pending(current) && !ret)
+ ret = -ERESTARTNOHAND;
Can I ask a simple question for my understanding?
man page for epoll_pwait says
EINTR
The call was interrupted by a signal handler before either any of the
requested events occurred or the timeout expired; see signal(7).
But it is not clear to me if we can figure out(without race) the
chronological order if one of the requested events are completed or a
signal came first.
Is this a correct exectation?
Also like pointed out above, this behavior is not consistent for all
such syscalls(io_pgetevents). Was this also by design?
-Deepa
Oleg Nesterov <[email protected]> writes:
> On 05/30, Eric W. Biederman wrote:
>>
>> [email protected] (Eric W. Biederman) writes:
>>
>> > Which means I believe we have a semantically valid change in behavior
>> > that is causing a regression.
>>
>> I haven't made a survey of all of the functions yet but
>> fucntions return -ENORESTARTNOHAND will never return -EINTR and are
>> immune from this problem.
>
> Hmm. handle_signal:
>
> case -ERESTARTNOHAND:
> regs->ax = -EINTR;
> break;
>
> but I am not sure I understand which problem do you mean..
Yes. My mistake. I looked at the transparent restart case for when a
signal is not pending and failed to look at what happens when a signal
is delivered.
So yes. Everything changed does appear to have a behavioral difference
where they can now succeed and not return -EINTR.
Eric
On Thu, May 30, 2019 at 3:54 AM Eric W. Biederman <[email protected]> wrote:
> Arnd Bergmann <[email protected]> writes:
> > On Wed, May 29, 2019 at 6:12 PM Oleg Nesterov <[email protected]> wrote:
> >
> > Not sure about the order of the cleanups, but probably something like
> > this would work:
> >
> > 1. fix the race (to be backported)
> > 2. unify set_compat_user_sigmask/set_user_sigmask
> > 3. remove unneeded compat handlers
> > 4. replace restore_user_sigmask with restore_saved_sigmask_if()
> > 5. also unify compat_get_fd_set()/get_fd_set() and kill off
> > compat select() variants.
>
> Are new system calls added preventing a revert of the patch in question
> for stable kernels?
Yes, a straight revert would not work, as it was done as a cleanup in
order to simplify the following conversion. I suppose one could undo
the cleanup in both the time32 and time64 versions of each syscall,
but I would consider that a more risky change than just fixing the
bug that was reported.
Arnd
On Thu, May 30, 2019 at 4:41 PM Oleg Nesterov <[email protected]> wrote:
> On 05/30, Arnd Bergmann wrote:
> Plus every file touched by this patch asks for more cleanups. Say, do_poll()
> should return -ERESTARTNOHAND, not -EINTR, after that we can remove the ugly
> EINTR->ERESTARTNOHAND in its callers. And more.
>
> > For the stable
> > kernels, I think we want just the addition of the 'bool interrupted' argument
> > to restore_user_sigmask()
>
> or simply revert this patch. I will check if this is possible today... At first
> glance 854a6ed56839a40f6 fixed another bug by accident, do_pselect() did
> "ret == -ERESTARTNOHAND" after "ret = poll_select_copy_remaining()" which can
> turn ERESTARTNOHAND into EINTR, but this is simple. I'll check tomorrow.
Right, there were several differences between the system calls
that Deepa's original change got rid of. I don't know if any ones besides
the do_pselect() return code can be observed in practice.
> > > - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
> > > + ret = set_xxx(ksig.sigmask, ksig.sigsetsize);
> > > if (ret)
> > > return ret;
> > >
> > > ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> > > - restore_user_sigmask(ksig.sigmask, &sigsaved);
> > > - if (signal_pending(current) && !ret)
> > > +
> > > + interrupted = signal_pending(current);
> > > + update_xxx(interrupted);
> >
> > Maybe name this
> >
> > restore_saved_sigmask_if(!interrupted);
>
> Yes, I thought about restore_if(), but to me
>
> restore_saved_sigmask_if(ret != -EINTR);
>
> doesn't look readable... May be
>
> restore_saved_sigmask_unless(ret == -EINTR);
>
> ? but actually I agree with any naming.
Yes, restore_saved_sigmask_unless() probably better.
> > With some of the recent discussions about compat syscall handling,
> > I now think that we want to just fold set_compat_user_sigmask()
> > into set_user_sigmask()
>
> agreed, and I thought about this too. But again, I'd prefer to do this
> and other cleanups later, on top of this patch.
Ok, fair enough. I don't care much about the order as long as the
regression fix comes first.
Arnd
"Eric W. Biederman" <[email protected]> wrote:
> Frankly the only reason this appears to be worth touching is that we
> have a userspace regression. Otherwise I would call the current
> behavior more correct and desirable than ignoring the signal longer.
>
> If I am reading sitaution properly I suggest we go back to resoring the
> sigmask by hand in epoll_pwait, and put in a big fat comment about a
> silly userspace program depending on that behavior.
>
> That will be easier to backport and it will just fix the regression and
> not overfix the problem for reasonable applications.
Fwiw, the cmogstored (userspace program which regressed) only
hit this regression in an obscure code path for tuning; that
code path probably sees no use outside of the test suite.
Add to that, cmogstored is an unofficial and optional component
to the obscure, largely-forgotten MogileFS.
Finally, the main users of cmogstored are on FreeBSD. They
hit the kqueue+self-pipe code path instead of epoll_pwait.
I only used epoll_pwait on Linux since I figured I could save a
few bytes of memory by skipping eventfd/self-pipe...
Anyways, I updated cmogstored a few weeks back to call `note_run'
(the signal dispatcher) if epoll_pwait (wrapped by `mog_idleq_wait_intr')
returns 0 instead of -1 (EINTR) to workaround this kernel change:
https://bogomips.org/cmogstored-public/[email protected]/
I could easily make a change to call `note_run' unconditionally
when `mog_idleq_wait_intr' returns, too.
But that said, there could be other users hitting the same
problem I did. So maybe cmogstored's primary use on Linux these
days is finding regressions :>
This is the minimal fix for stable, I'll send cleanups later.
The commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
restore_user_sigmask()") introduced the visible change which breaks
user-space: a signal temporary unblocked by set_user_sigmask() can
be delivered even if the caller returns success or timeout.
Change restore_user_sigmask() to accept the additional "interrupted"
argument which should be used instead of signal_pending() check, and
update the callers.
Reported-by: Eric Wong <[email protected]>
Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
cc: [email protected] (v5.0+)
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/aio.c | 28 ++++++++++++++++++++--------
fs/eventpoll.c | 4 ++--
fs/io_uring.c | 7 ++++---
fs/select.c | 18 ++++++------------
include/linux/signal.h | 2 +-
kernel/signal.c | 5 +++--
6 files changed, 36 insertions(+), 28 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 3490d1f..c1e581d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2095,6 +2095,7 @@ SYSCALL_DEFINE6(io_pgetevents,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_timespec64(&ts, timeout)))
@@ -2108,8 +2109,10 @@ SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2128,6 +2131,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
struct __aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 ts;
+ bool interrupted;
int ret;
if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
@@ -2142,8 +2146,10 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2193,6 +2199,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_old_timespec32(&t, timeout))
@@ -2206,8 +2213,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
@@ -2226,6 +2235,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __compat_aio_sigset ksig = { NULL, };
sigset_t ksigmask, sigsaved;
struct timespec64 t;
+ bool interrupted;
int ret;
if (timeout && get_timespec64(&t, timeout))
@@ -2239,8 +2249,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
- restore_user_sigmask(ksig.sigmask, &sigsaved);
- if (signal_pending(current) && !ret)
+
+ interrupted = signal_pending(current);
+ restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ if (interrupted && !ret)
ret = -ERESTARTNOHAND;
return ret;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c6f5131..4c74c76 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2325,7 +2325,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
error = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
return error;
}
@@ -2350,7 +2350,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
err = do_epoll_wait(epfd, events, maxevents, timeout);
- restore_user_sigmask(sigmask, &sigsaved);
+ restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
return err;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0fbb486..1147c5d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2201,11 +2201,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
if (sig)
- restore_user_sigmask(sig, &sigsaved);
+ restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
+
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
}
diff --git a/fs/select.c b/fs/select.c
index 6cbc9ff..a4d8f6e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -758,10 +758,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
+ restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1106,8 +1105,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1142,8 +1140,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1350,10 +1347,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
+ restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
- restore_user_sigmask(sigmask, &sigsaved);
-
return ret;
}
@@ -1425,8 +1421,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
@@ -1461,8 +1456,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved);
-
+ restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 9702016..78c2bb3 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -276,7 +276,7 @@ extern int sigprocmask(int, sigset_t *, sigset_t *);
extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
sigset_t *oldset, size_t sigsetsize);
extern void restore_user_sigmask(const void __user *usigmask,
- sigset_t *sigsaved);
+ sigset_t *sigsaved, bool interrupted);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index 328a01e..aa6a6f1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed in from userland for the syscalls.
*/
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
+ bool interrupted)
{
if (!usigmask)
@@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
* Restoring sigmask here can lead to delivering signals that the above
* syscalls are intended to block because of the sigmask passed in.
*/
- if (signal_pending(current)) {
+ if (interrupted) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
--
2.5.0
Oleg Nesterov <[email protected]> writes:
> This is the minimal fix for stable, I'll send cleanups later.
>
> The commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
> restore_user_sigmask()") introduced the visible change which breaks
> user-space: a signal temporary unblocked by set_user_sigmask() can
> be delivered even if the caller returns success or timeout.
>
> Change restore_user_sigmask() to accept the additional "interrupted"
> argument which should be used instead of signal_pending() check, and
> update the callers.
>
> Reported-by: Eric Wong <[email protected]>
> Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
> cc: [email protected] (v5.0+)
> Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: "Eric W. Biederman" <[email protected]>
I have read through the patch and it looks good.
For clarity. I don't think this is required by posix, or fundamentally
to remove the races in select. It is what linux has always done and
we have applications who care so I agree this fix is needed.
Further in any case where the semantic change that this patch rolls back
(aka where allowing a signal to be delivered and the select like call to
complete) would be advantage we can do as well if not better by using
signalfd.
Michael is there any chance we can get this guarantee of the linux
implementation of pselect and friends clearly documented. The guarantee
that if the system call completes successfully we are guaranteed that
no signal that is unblocked by using sigmask will be delivered?
Eric
> ---
> fs/aio.c | 28 ++++++++++++++++++++--------
> fs/eventpoll.c | 4 ++--
> fs/io_uring.c | 7 ++++---
> fs/select.c | 18 ++++++------------
> include/linux/signal.h | 2 +-
> kernel/signal.c | 5 +++--
> 6 files changed, 36 insertions(+), 28 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 3490d1f..c1e581d 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -2095,6 +2095,7 @@ SYSCALL_DEFINE6(io_pgetevents,
> struct __aio_sigset ksig = { NULL, };
> sigset_t ksigmask, sigsaved;
> struct timespec64 ts;
> + bool interrupted;
> int ret;
>
> if (timeout && unlikely(get_timespec64(&ts, timeout)))
> @@ -2108,8 +2109,10 @@ SYSCALL_DEFINE6(io_pgetevents,
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2128,6 +2131,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
> struct __aio_sigset ksig = { NULL, };
> sigset_t ksigmask, sigsaved;
> struct timespec64 ts;
> + bool interrupted;
> int ret;
>
> if (timeout && unlikely(get_old_timespec32(&ts, timeout)))
> @@ -2142,8 +2146,10 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2193,6 +2199,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
> struct __compat_aio_sigset ksig = { NULL, };
> sigset_t ksigmask, sigsaved;
> struct timespec64 t;
> + bool interrupted;
> int ret;
>
> if (timeout && get_old_timespec32(&t, timeout))
> @@ -2206,8 +2213,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> @@ -2226,6 +2235,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
> struct __compat_aio_sigset ksig = { NULL, };
> sigset_t ksigmask, sigsaved;
> struct timespec64 t;
> + bool interrupted;
> int ret;
>
> if (timeout && get_timespec64(&t, timeout))
> @@ -2239,8 +2249,10 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
> return ret;
>
> ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
>
> return ret;
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index c6f5131..4c74c76 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2325,7 +2325,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
>
> error = do_epoll_wait(epfd, events, maxevents, timeout);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> + restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
>
> return error;
> }
> @@ -2350,7 +2350,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
>
> err = do_epoll_wait(epfd, events, maxevents, timeout);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> + restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
>
> return err;
> }
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 0fbb486..1147c5d 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2201,11 +2201,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> }
>
> ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
> - if (ret == -ERESTARTSYS)
> - ret = -EINTR;
>
> if (sig)
> - restore_user_sigmask(sig, &sigsaved);
> + restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
> +
> + if (ret == -ERESTARTSYS)
> + ret = -EINTR;
>
> return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
> }
> diff --git a/fs/select.c b/fs/select.c
> index 6cbc9ff..a4d8f6e 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -758,10 +758,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
> return ret;
>
> ret = core_sys_select(n, inp, outp, exp, to);
> + restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
> ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> return ret;
> }
>
> @@ -1106,8 +1105,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> @@ -1142,8 +1140,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> @@ -1350,10 +1347,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
> return ret;
>
> ret = compat_core_sys_select(n, inp, outp, exp, to);
> + restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
> ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> return ret;
> }
>
> @@ -1425,8 +1421,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> @@ -1461,8 +1456,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
>
> ret = do_sys_poll(ufds, nfds, to);
>
> - restore_user_sigmask(sigmask, &sigsaved);
> -
> + restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
> /* We can restart this syscall, usually */
> if (ret == -EINTR)
> ret = -ERESTARTNOHAND;
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 9702016..78c2bb3 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -276,7 +276,7 @@ extern int sigprocmask(int, sigset_t *, sigset_t *);
> extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
> sigset_t *oldset, size_t sigsetsize);
> extern void restore_user_sigmask(const void __user *usigmask,
> - sigset_t *sigsaved);
> + sigset_t *sigsaved, bool interrupted);
> extern void set_current_blocked(sigset_t *);
> extern void __set_current_blocked(const sigset_t *);
> extern int show_unhandled_signals;
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 328a01e..aa6a6f1 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
> * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
> * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
> */
> -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
> + bool interrupted)
> {
>
> if (!usigmask)
> @@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
> * Restoring sigmask here can lead to delivering signals that the above
> * syscalls are intended to block because of the sigmask passed in.
> */
> - if (signal_pending(current)) {
> + if (interrupted) {
> current->saved_sigmask = *sigsaved;
> set_restore_sigmask();
> return;
From: Eric W. Biederman
> Sent: 04 June 2019 16:32
...
> Michael is there any chance we can get this guarantee of the linux
> implementation of pselect and friends clearly documented. The guarantee
> that if the system call completes successfully we are guaranteed that
> no signal that is unblocked by using sigmask will be delivered?
The behaviour certainly needs documenting - the ToG docs are unclear.
I think you need stronger statement that the one above.
Maybe "signals will only be delivered (ie the handler called) if
the system call has to wait and the wait is interrupted by a signal.
(Even then pselect might find ready fd and return success.)
There is also the 'issue' of ERESTARTNOHAND.
Some of the system calls will return EINTR if the wait is interrupted
by a signal that doesn't have a handler, others get restarted.
I'm not at all sure about why there is a difference.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Tue, Jun 4, 2019 at 3:41 PM Oleg Nesterov <[email protected]> wrote:
>
> This is the minimal fix for stable, I'll send cleanups later.
>
> The commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
> restore_user_sigmask()") introduced the visible change which breaks
> user-space: a signal temporary unblocked by set_user_sigmask() can
> be delivered even if the caller returns success or timeout.
>
> Change restore_user_sigmask() to accept the additional "interrupted"
> argument which should be used instead of signal_pending() check, and
> update the callers.
>
> Reported-by: Eric Wong <[email protected]>
> Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
> cc: [email protected] (v5.0+)
> Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: Arnd Bergmann <[email protected]>
I hope Eric can test this with the original reproducer, or maybe someone
could create a test case that can be added into LTP.
Arnd
> On Tue, Jun 4, 2019 at 3:41 PM Oleg Nesterov <[email protected]> wrote:
> >
> > This is the minimal fix for stable, I'll send cleanups later.
> >
> > The commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
> > restore_user_sigmask()") introduced the visible change which breaks
> > user-space: a signal temporary unblocked by set_user_sigmask() can
> > be delivered even if the caller returns success or timeout.
> >
> > Change restore_user_sigmask() to accept the additional "interrupted"
> > argument which should be used instead of signal_pending() check, and
> > update the callers.
> >
> > Reported-by: Eric Wong <[email protected]>
> > Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
> > cc: [email protected] (v5.0+)
> > Signed-off-by: Oleg Nesterov <[email protected]>
>
Acked-by: Deepa Dinamani <[email protected]>
The original fix posted:
https://lore.kernel.org/patchwork/patch/1077355/ would also have been
a correct fix for this problem. But, given the cleanups that are in
the pipeline, this is a better fix.
-Deepa
Oleg Nesterov <[email protected]> wrote:
> This is the minimal fix for stable, I'll send cleanups later.
>
> The commit 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
> restore_user_sigmask()") introduced the visible change which breaks
> user-space: a signal temporary unblocked by set_user_sigmask() can
> be delivered even if the caller returns success or timeout.
>
> Change restore_user_sigmask() to accept the additional "interrupted"
> argument which should be used instead of signal_pending() check, and
> update the callers.
>
> Reported-by: Eric Wong <[email protected]>
> Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add restore_user_sigmask()")
> cc: [email protected] (v5.0+)
> Signed-off-by: Oleg Nesterov <[email protected]>
Thanks, for epoll_pwait on top of Linux v5.1.7 and cmogstored v1.7.0:
Tested-by: Eric Wong <[email protected]>
(cmogstored v1.7.1 already works around this when it sees a 0
return value (but not >0, yet...))
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 0fbb486..1147c5d 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2201,11 +2201,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
> }
>
> ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
> - if (ret == -ERESTARTSYS)
> - ret = -EINTR;
>
> if (sig)
> - restore_user_sigmask(sig, &sigsaved);
> + restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
> +
> + if (ret == -ERESTARTSYS)
> + ret = -EINTR;
>
> return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
> }
That io_uring bit didn't apply cleanly to stable,
since stable is missing fdb288a679cdf6a71f3c1ae6f348ba4dae742681
("io_uring: use wait_event_interruptible for cq_wait conditional wait")
and related commits.
In any case, I'm not using io_uring anywhere, yet (and probably
won't, since I'll still need threads to deal with open/unlink/rename
on slow JBOD HDDs).
On Tue, Jun 4, 2019 at 6:41 AM Oleg Nesterov <[email protected]> wrote:
>
> This is the minimal fix for stable, I'll send cleanups later.
Ugh. I htink this is correct, but I wish we had a better and more
intuitive interface.
In particular, since restore_user_sigmask() basically wants to check
for "signal_pending()" anyway (to decide if the mask should be
restored by signal handling or by that function), I really get the
feeling that a lot of these patterns like
> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> - if (signal_pending(current) && !ret)
> +
> + interrupted = signal_pending(current);
> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> + if (interrupted && !ret)
> ret = -ERESTARTNOHAND;
are wrong to begin with, and we really should aim for an interface
which says "tell me whether you completed the system call, and I'll
give you an error return if not".
How about we make restore_user_sigmask() take two return codes: the
'ret' we already have, and the return we would get if there is a
signal pending and w're currently returning zero.
IOW, I think the above could become
ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret, -ERESTARTHAND);
instead if we just made the right interface decision.
Hmm?
Linus
Linus Torvalds <[email protected]> wrote:
> On Tue, Jun 4, 2019 at 6:41 AM Oleg Nesterov <[email protected]> wrote:
> >
> > This is the minimal fix for stable, I'll send cleanups later.
>
> Ugh. I htink this is correct, but I wish we had a better and more
> intuitive interface.
I had the same thoughts, but am not a regular kernel hacker,
so I didn't say anything earlier.
> In particular, since restore_user_sigmask() basically wants to check
> for "signal_pending()" anyway (to decide if the mask should be
> restored by signal handling or by that function), I really get the
> feeling that a lot of these patterns like
>
> > - restore_user_sigmask(ksig.sigmask, &sigsaved);
> > - if (signal_pending(current) && !ret)
> > +
> > + interrupted = signal_pending(current);
> > + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> > + if (interrupted && !ret)
> > ret = -ERESTARTNOHAND;
>
> are wrong to begin with, and we really should aim for an interface
> which says "tell me whether you completed the system call, and I'll
> give you an error return if not".
>
> How about we make restore_user_sigmask() take two return codes: the
> 'ret' we already have, and the return we would get if there is a
> signal pending and w're currently returning zero.
>
> IOW, I think the above could become
>
> ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret, -ERESTARTHAND);
>
> instead if we just made the right interface decision.
But that falls down if ret were ever expected to match several
similar error codes (not sure if it happens)
When I was considering fixing this on my own a few weeks ago, I
was looking for an inline that could quickly tell if `ret' was
any of the EINTR-like error codes; but couldn't find one...
It'd probably end up being switch/case statement so I'm not sure
if it'd be too big and slow or not...
The caller would just do:
ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret);
And restore_user_sigmask would call some "was_interrupted(ret)"
inline which could return true if `ret' matched any of the
too-many-to-keep-track-of EINTR-like codes. But I figured
there's probably a good reason it did not exist, already *shrug*
/me goes back to the wonderful world of userspace...
Linus Torvalds <[email protected]> writes:
> On Tue, Jun 4, 2019 at 6:41 AM Oleg Nesterov <[email protected]> wrote:
>>
>> This is the minimal fix for stable, I'll send cleanups later.
>
> Ugh. I htink this is correct, but I wish we had a better and more
> intuitive interface.
>
> In particular, since restore_user_sigmask() basically wants to check
> for "signal_pending()" anyway (to decide if the mask should be
> restored by signal handling or by that function), I really get the
> feeling that a lot of these patterns like
Linus that checking for signal_pending() in restore_user_sigmask is the
bug that caused the regression.
>> - restore_user_sigmask(ksig.sigmask, &sigsaved);
>> - if (signal_pending(current) && !ret)
>> +
>> + interrupted = signal_pending(current);
>> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
>> + if (interrupted && !ret)
>> ret = -ERESTARTNOHAND;
>
> are wrong to begin with, and we really should aim for an interface
> which says "tell me whether you completed the system call, and I'll
> give you an error return if not".
The pattern you are pointing out is specific to io_pgetevents and it's
variations. It does look buggy to me but not for the reason you point
out, but instead because it does not appear to let a pending signal
cause io_pgetevents to return early.
I suspect we should fix that and have do_io_getevents return
-EINTR or -ERESTARTNOHAND like everyone else.
The concept of interrupted (aka return -EINTR to userspace) is truly
fundamental to the current semantics. We effectively put a normally
blocked signal that was triggered back if we won't be returning -EINTR
to userspace.
> How about we make restore_user_sigmask() take two return codes: the
> 'ret' we already have, and the return we would get if there is a
> signal pending and w're currently returning zero.
>
> IOW, I think the above could become
>
> ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret, -ERESTARTHAND);
>
> instead if we just made the right interface decision.
>
> Hmm?
At best I think that is a cleanup that will complicate creating a simple
straight forward regression fix.
Unless I am misreading things that is optimizing the interface for
dealing with broken code.
So can we please get this fix in and then look at cleaning up and
simplifying this code.
Eric
p.s. A rather compelling cleanup is to:
- Leave the signal mask alone.
- Register with signalfd_wqh for wake ups.
- Have a helper
int signal_pending_sigmask(sigset_t *blocked)
{
struct task_struct *tsk = current;
int ret = 0;
spin_lock_irq(&tsk->sighand->siglock);
if (next_signal(&tsk->pending, blocked) ||
next_signal(&tsk->signal->pending, blocked)) {
ret = -ERESTARTHAND;
if (!sigequalsets(&tsk->blocked, blocked)) {
tsk->saved_sigmask = tsk->blocked;
__set_task_blocked(tsk, blocked);
set_restore_sigmask();
}
}
spin_unlock_irq(&tsk->sighand->siglock);
return ret;
}
- Use that helper instead of signal_pending() in the various
sleep functions.
- Possibly get the signal mask from tsk instead of passing it into
all of the helpers.
Eric
On 06/04, Linus Torvalds wrote:
>
> On Tue, Jun 4, 2019 at 6:41 AM Oleg Nesterov <[email protected]> wrote:
> >
> > This is the minimal fix for stable, I'll send cleanups later.
>
> Ugh. I htink this is correct, but I wish we had a better and more
> intuitive interface.
Yes,
> In particular, since restore_user_sigmask() basically wants to check
> for "signal_pending()" anyway
No, the caller should check signal_pending() anyway and this is enough.
> > - restore_user_sigmask(ksig.sigmask, &sigsaved);
> > - if (signal_pending(current) && !ret)
> > +
> > + interrupted = signal_pending(current);
> > + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> > + if (interrupted && !ret)
> > ret = -ERESTARTNOHAND;
>
> are wrong to begin with,
This is fs/aio.c and I have already mentioned that this code doesn't look
right anyway.
> IOW, I think the above could become
>
> ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret, -ERESTARTHAND);
>
> instead if we just made the right interface decision.
I think this particular code should simply do
ret = do_io_getevents(...);
if (ret == -ERESTARTSYS)
ret = -EINTR;
restore_user_sigmask(ret == -EINTR);
However I agree that another helper(s) which takes/returns the error code makes
sense and I was going to do this. Lets do this step by step, I think we should
kill sigmask/sigsaved first.
Oleg.
From: Linus Torvalds
> Sent: 04 June 2019 22:27
> Ugh. I htink this is correct, but I wish we had a better and more
> intuitive interface.
>
> In particular, since restore_user_sigmask() basically wants to check
> for "signal_pending()" anyway (to decide if the mask should be
> restored by signal handling or by that function), I really get the
> feeling that a lot of these patterns like
>
> > - restore_user_sigmask(ksig.sigmask, &sigsaved);
> > - if (signal_pending(current) && !ret)
> > +
> > + interrupted = signal_pending(current);
> > + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> > + if (interrupted && !ret)
> > ret = -ERESTARTNOHAND;
>
> are wrong to begin with, and we really should aim for an interface
> which says "tell me whether you completed the system call, and I'll
> give you an error return if not".
>
> How about we make restore_user_sigmask() take two return codes: the
> 'ret' we already have, and the return we would get if there is a
> signal pending and w're currently returning zero.
>
> IOW, I think the above could become
>
> ret = restore_user_sigmask(ksig.sigmask, &sigsaved, ret, -ERESTARTHAND);
>
> instead if we just made the right interface decision.
I think we should tell restore_user_sigmask() whether it should
cause any signal handles to be run (using the current mask)
and it should tell us whether it would run any (ie if it deferred
restoring the mask to syscall exit).
So the above would (probably) be:
if (restore_user_sigmask(ksig.sigmask, &sigsaved, true) && !ret)
ret = -ERESTARTNOHAND;
epoll() would have:
if (restore_user_sigmask(xxx.sigmask, &sigsaved, !ret || ret == -EINTR))
ret = -EINTR;
I also think it could be simplified if code that loaded the 'user sigmask'
saved the old one in 'current->saved_sigmask' (and saved that it had done it).
You'd not need 'sigsaved' nor pass the user sigmask address into
the restore function.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On 06/04, Eric W. Biederman wrote:
>
> >> - restore_user_sigmask(ksig.sigmask, &sigsaved);
> >> - if (signal_pending(current) && !ret)
> >> +
> >> + interrupted = signal_pending(current);
> >> + restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
> >> + if (interrupted && !ret)
> >> ret = -ERESTARTNOHAND;
> >
> > are wrong to begin with, and we really should aim for an interface
> > which says "tell me whether you completed the system call, and I'll
> > give you an error return if not".
>
> The pattern you are pointing out is specific to io_pgetevents and it's
> variations. It does look buggy to me but not for the reason you point
> out, but instead because it does not appear to let a pending signal
> cause io_pgetevents to return early.
>
> I suspect we should fix that and have do_io_getevents return
> -EINTR or -ERESTARTNOHAND like everyone else.
Exactly. It should not even check signal_pending(). It can rely on
wait_event_interruptible_hrtimeout().
> So can we please get this fix in and then look at cleaning up and
> simplifying this code.
Yes ;)
Oleg.
On 06/05, David Laight wrote:
>
> epoll() would have:
> if (restore_user_sigmask(xxx.sigmask, &sigsaved, !ret || ret == -EINTR))
> ret = -EINTR;
I don't think so but lets discuss this later.
> I also think it could be simplified if code that loaded the 'user sigmask'
> saved the old one in 'current->saved_sigmask' (and saved that it had done it).
> You'd not need 'sigsaved' nor pass the user sigmask address into
> the restore function.
Heh. apparently you do not read my emails ;)
This is what I proposed in my very 1st email, and I even showed the patch
and the code with the patch applied twice. Let me do this again.
Let me show the code with the patch applied. I am using epoll_pwait() as an
example because it looks very simple.
static inline void set_restore_sigmask(void)
{
// WARN_ON(!TIF_SIGPENDING) was removed by this patch
current->restore_sigmask = true;
}
int set_xxx(const sigset_t __user *umask, size_t sigsetsize)
{
sigset_t *kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
// we can safely modify ->saved_sigmask/restore_sigmask, they has no meaning
// until the syscall returns.
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(kmask);
return 0;
}
void update_xxx(bool interrupted)
{
// the main reason for this helper is WARN_ON(!TIF_SIGPENDING) which was "moved"
// from set_restore_sigmask() above.
if (interrupted)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
else
restore_saved_sigmask();
}
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
int error;
error = set_xxx(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
update_xxx(error == -EINTR);
return error;
}
Oleg.
From: Oleg Nesterov [mailto:[email protected]]
> Sent: 05 June 2019 10:25
> On 06/05, David Laight wrote:
> >
> > epoll() would have:
> > if (restore_user_sigmask(xxx.sigmask, &sigsaved, !ret || ret == -EINTR))
> > ret = -EINTR;
>
> I don't think so but lets discuss this later.
I certainly think there should be some comments at least
about when/whether signal handlers get called and that
being separate from the return value.
The system call restart stuff does seem strange.
ISTR that was originally added for SIG_SUSPEND (^Z) so that those
signals wouldn't be seen by the appication.
But that makes it a property of the signal, not the system call.
> > I also think it could be simplified if code that loaded the 'user sigmask'
> > saved the old one in 'current->saved_sigmask' (and saved that it had done it).
> > You'd not need 'sigsaved' nor pass the user sigmask address into
> > the restore function.
>
> Heh. apparently you do not read my emails ;)
>
> This is what I proposed in my very 1st email, and I even showed the patch
> and the code with the patch applied twice. Let me do this again.
I did read that one, I've even quoted it in the past :-)
It's just not been mentioned recently.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On top of
signal-remove-the-wrong-signal_pending-check-in-restore_user_sigmask.patch
Let me repeat, every file touched by this patch needs more cleanups,
fs/aio.c looks wrong with or without the recent changes. Lets discuss
this later.
To simplify the review, please see the code with this patch applied.
I am using epoll_pwait() as an example because it looks very simple.
Note that this patch moves WARN_ON(!TIF_SIGPENDING) from set_restore_sigmask()
to restore_unless().
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
sigset_t *kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(kmask);
return 0;
}
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
WARN_ON(!test_thread_flag(TIF_SIGPENDING));
else
restore_saved_sigmask();
}
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
int error;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
restore_saved_sigmask_unless(error == -EINTR);
return error;
}
Oleg.
task->saved_sigmask and ->restore_sigmask are only used in the ret-from-
syscall paths. This means that set_user_sigmask() can save ->blocked in
->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked
was modified.
This way the callers do not need 2 sigset_t's passed to set/restore and
restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns
into the trivial helper which just calls restore_saved_sigmask().
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/aio.c | 20 +++++--------
fs/eventpoll.c | 12 +++-----
fs/io_uring.c | 11 ++-----
fs/select.c | 34 ++++++++--------------
include/linux/compat.h | 3 +-
include/linux/sched/signal.h | 12 ++++++--
include/linux/signal.h | 4 ---
kernel/signal.c | 69 ++++++++++++--------------------------------
8 files changed, 57 insertions(+), 108 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index c1e581d..8200f97 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2093,7 +2093,6 @@ SYSCALL_DEFINE6(io_pgetevents,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2104,14 +2103,14 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2129,7 +2128,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2141,14 +2139,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2197,7 +2195,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
bool interrupted;
int ret;
@@ -2208,14 +2205,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2233,7 +2230,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
bool interrupted;
int ret;
@@ -2244,14 +2240,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4c74c76..0f9c073 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
+ restore_saved_sigmask_unless(error == -EINTR);
return error;
}
@@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ err = set_compat_user_sigmask(sigmask, sigsetsize);
if (err)
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
+ restore_saved_sigmask_unless(err == -EINTR);
return err;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1147c5d..ad7b9bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2180,7 +2180,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
struct io_cq_ring *ring = ctx->cq_ring;
- sigset_t ksigmask, sigsaved;
int ret;
if (io_cqring_events(ring) >= min_events)
@@ -2190,21 +2189,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- &ksigmask, &sigsaved, sigsz);
+ sigsz);
else
#endif
- ret = set_user_sigmask(sig, &ksigmask,
- &sigsaved, sigsz);
+ ret = set_user_sigmask(sig, sigsz);
if (ret)
return ret;
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
- if (sig)
- restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+ restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
diff --git a/fs/select.c b/fs/select.c
index a4d8f6e..1fc1b24 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize,
enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
@@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
void __user *tsp, compat_sigset_t __user *sigmask,
compat_size_t sigsetsize, enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
unsigned int, nfds, struct __kernel_timespec __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ebddcb6..16dafd9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -138,8 +138,7 @@ typedef struct {
compat_sigset_word sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize);
struct compat_sigaction {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 38a0f07..4d9c71d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -417,7 +417,6 @@ void task_join_group_stop(struct task_struct *task);
static inline void set_restore_sigmask(void)
{
set_thread_flag(TIF_RESTORE_SIGMASK);
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
@@ -448,7 +447,6 @@ static inline bool test_and_clear_restore_sigmask(void)
static inline void set_restore_sigmask(void)
{
current->restore_sigmask = true;
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
@@ -481,6 +479,16 @@ static inline void restore_saved_sigmask(void)
__set_current_blocked(¤t->saved_sigmask);
}
+extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
+
+static inline void restore_saved_sigmask_unless(bool interrupted)
+{
+ if (interrupted)
+ WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+ else
+ restore_saved_sigmask();
+}
+
static inline sigset_t *sigmask_to_save(void)
{
sigset_t *res = ¤t->blocked;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 78c2bb3..b5d99482 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,10 +273,6 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
- sigset_t *sigsaved, bool interrupted);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index aa6a6f1..51772da 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2863,80 +2863,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t *kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t *kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
--
2.5.0
On Wed, Jun 5, 2019 at 8:58 AM Oleg Nesterov <[email protected]> wrote:
>
> To simplify the review, please see the code with this patch applied.
> I am using epoll_pwait() as an example because it looks very simple.
I like it.
However.
I think I'd like it even more if we just said "we don't need
restore_saved_sigmask AT ALL".
Which would be fairly easy to do with something like the attached...
(Yes, this only does x86, which is a problem, but I'm bringing this up
as a RFC..)
Is it worth another TIF flag? This sure would simplify things, and it
really fits the concept too: this really is a do_signal() issue, and
fundamentally goes together with TIF_SIGPENDING.
Linus
Hi Oleg,
I love your patch! Perhaps something to improve:
[auto build test WARNING on mmotm/master]
url: https://github.com/0day-ci/linux/commits/Oleg-Nesterov/signal-simplify-set_user_sigmask-restore_user_sigmask/20190606-062512
base: git://git.cmpxchg.org/linux-mmotm.git master
config: m68k-allyesconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=m68k
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
In file included from arch/m68k/include/asm/uaccess.h:5:0,
from include/linux/uaccess.h:11,
from include/asm-generic/termios.h:6,
from ./arch/m68k/include/generated/uapi/asm/termios.h:1,
from include/uapi/linux/termios.h:6,
from include/linux/tty.h:7,
from kernel/signal.c:26:
kernel/signal.c: In function 'set_user_sigmask':
>> arch/m68k/include/asm/uaccess_mm.h:191:2: warning: 'kmask' may be used uninitialized in this function [-Wmaybe-uninitialized]
asm volatile ("\n" \
^~~
kernel/signal.c:2960:12: note: 'kmask' was declared here
sigset_t *kmask;
^~~~~
vim +/kmask +191 arch/m68k/include/asm/uaccess_mm.h
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 189
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 190 #define ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 @191 asm volatile ("\n" \
e08d703c arch/m68k/include/asm/uaccess_mm.h Greg Ungerer 2011-10-14 192 "1: "MOVES"."#s1" (%2)+,%3\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 193 " move."#s1" %3,(%1)+\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 194 " .ifnc \""#s2"\",\"\"\n" \
e08d703c arch/m68k/include/asm/uaccess_mm.h Greg Ungerer 2011-10-14 195 "2: "MOVES"."#s2" (%2)+,%3\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 196 " move."#s2" %3,(%1)+\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 197 " .ifnc \""#s3"\",\"\"\n" \
e08d703c arch/m68k/include/asm/uaccess_mm.h Greg Ungerer 2011-10-14 198 "3: "MOVES"."#s3" (%2)+,%3\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 199 " move."#s3" %3,(%1)+\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 200 " .endif\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 201 " .endif\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 202 "4:\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 203 " .section __ex_table,\"a\"\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 204 " .align 4\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 205 " .long 1b,10f\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 206 " .ifnc \""#s2"\",\"\"\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 207 " .long 2b,20f\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 208 " .ifnc \""#s3"\",\"\"\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 209 " .long 3b,30f\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 210 " .endif\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 211 " .endif\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 212 " .previous\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 213 "\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 214 " .section .fixup,\"ax\"\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 215 " .even\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 216 "10: addq.l #"#n1",%0\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 217 " .ifnc \""#s2"\",\"\"\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 218 "20: addq.l #"#n2",%0\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 219 " .ifnc \""#s3"\",\"\"\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 220 "30: addq.l #"#n3",%0\n" \
7cefa5a0 arch/m68k/include/asm/uaccess_mm.h Al Viro 2017-03-20 221 " .endif\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 222 " .endif\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 223 " jra 4b\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 224 " .previous\n" \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 225 : "+d" (res), "+&a" (to), "+a" (from), "=&d" (tmp) \
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 226 : : "memory")
53617825 include/asm-m68k/uaccess.h Roman Zippel 2006-06-25 227
:::::: The code at line 191 was first introduced by commit
:::::: 53617825ccf3ff8a71e6efcf3dcf58885ed6f3e5 [PATCH] m68k: fix uaccess.h for gcc-3.x
:::::: TO: Roman Zippel <[email protected]>
:::::: CC: Linus Torvalds <[email protected]>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Oleg,
I love your patch! Perhaps something to improve:
[auto build test WARNING on mmotm/master]
url: https://github.com/0day-ci/linux/commits/Oleg-Nesterov/signal-simplify-set_user_sigmask-restore_user_sigmask/20190606-062512
base: git://git.cmpxchg.org/linux-mmotm.git master
config: sparc64-allyesconfig (attached as .config)
compiler: sparc64-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=sparc64
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
In file included from include/linux/poll.h:12:0,
from include/linux/ring_buffer.h:7,
from include/linux/trace_events.h:6,
from include/trace/syscall.h:7,
from include/linux/syscalls.h:86,
from kernel/signal.c:30:
kernel/signal.c: In function 'set_user_sigmask':
include/linux/uaccess.h:113:7: warning: 'kmask' may be used uninitialized in this function [-Wmaybe-uninitialized]
res = raw_copy_from_user(to, from, n);
~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/signal.c:2960:12: note: 'kmask' was declared here
sigset_t *kmask;
^~~~~
kernel/signal.c: In function 'set_compat_user_sigmask':
>> kernel/signal.c:2986:6: warning: 'kmask' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (get_compat_sigset(kmask, umask))
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
vim +/kmask +2986 kernel/signal.c
2975
2976 #ifdef CONFIG_COMPAT
2977 int set_compat_user_sigmask(const compat_sigset_t __user *umask,
2978 size_t sigsetsize)
2979 {
2980 sigset_t *kmask;
2981
2982 if (!umask)
2983 return 0;
2984 if (sigsetsize != sizeof(compat_sigset_t))
2985 return -EINVAL;
> 2986 if (get_compat_sigset(kmask, umask))
2987 return -EFAULT;
2988
2989 set_restore_sigmask();
2990 current->saved_sigmask = current->blocked;
2991 set_current_blocked(kmask);
2992
2993 return 0;
2994 }
2995 #endif
2996
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On 06/05, Oleg Nesterov wrote:
>
> +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
> {
> - if (!usigmask)
> - return 0;
> + sigset_t *kmask;
Typo, this obviously should be
sigset_t kmask;
I'll send v2.
Dear Kbuild Test Robot, thank you very much,
Oleg.
On Thu, Jun 6, 2019 at 9:28 AM Oleg Nesterov <[email protected]> wrote:
>
> On 06/05, Oleg Nesterov wrote:
> >
> > +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
> > {
> > - if (!usigmask)
> > - return 0;
> > + sigset_t *kmask;
>
> Typo, this obviously should be
>
> sigset_t kmask;
>
> I'll send v2.
>
>
> Dear Kbuild Test Robot, thank you very much,
>
Machines have emotions, too.
- sed@ -
From: Linus Torvalds
> Sent: 05 June 2019 18:25
> On Wed, Jun 5, 2019 at 8:58 AM Oleg Nesterov <[email protected]> wrote:
> >
> > To simplify the review, please see the code with this patch applied.
> > I am using epoll_pwait() as an example because it looks very simple.
>
> I like it.
>
> However.
>
> I think I'd like it even more if we just said "we don't need
> restore_saved_sigmask AT ALL".
>
> Which would be fairly easy to do with something like the attached...
That would always call the signal handlers even when EINTR wasn't
being returned (which I think ought to happen ...).
The real purpose of restore_saved_sigmask() is to stop signal
handlers that are enabled by the temporary mask being called.
If a signal handler is called, I presume that the trampoline
calls back into the kernel to get further handlers called
and to finally restore the original signal mask?
What happens if a signal handler calls something that
would normally write to current->saved_sigmask?
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On 06/05, Linus Torvalds wrote:
>
> On Wed, Jun 5, 2019 at 8:58 AM Oleg Nesterov <[email protected]> wrote:
> >
> > To simplify the review, please see the code with this patch applied.
> > I am using epoll_pwait() as an example because it looks very simple.
>
> I like it.
>
> However.
>
> I think I'd like it even more if we just said "we don't need
> restore_saved_sigmask AT ALL".
^^^^^^^^^^^^^^^^^^^^^
Did you mean restore_saved_sigmask_unless() introduced by this patch?
If yes:
> Which would be fairly easy to do with something like the attached...
I don't think so,
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -160,7 +160,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
> klp_update_patch_state(current);
>
> /* deal with pending signal delivery */
> - if (cached_flags & _TIF_SIGPENDING)
> + if (cached_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
> do_signal(regs);
...
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2877,6 +2877,7 @@ int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
>
> *oldset = current->blocked;
> set_current_blocked(set);
> + set_thread_flag(TIF_RESTORE_SIGMASK);
This will re-introduce the problem fixed by the previous patch.
Yes, do_signal() does restore_saved_sigmask() at the end, but only if
get_signal() returns false.
This means that restore_saved_sigmask()->set_current_blocked(saved_mask) should
restore ->blocked (and may be clear TIF_SIGPENDING) before ret-from-syscall.
Or I misunderstood?
Oleg.
On 06/06, David Laight wrote:
>
> Some of this code is hard to grep through :-)
I'd suggest to simply read the kernel code once and memorise it, after
that you will not need to use grep.
> > When signal handler returns it does sys_rt_sigreturn() which restores
> > the original mask saved in uc_sigmask.
>
> Does that mean that if 2 signals interrupt epoll_wait() only
> one of the signal handlers is run?
I'll assume that both signals were blocked before syscall and temporary
unblocked by pselect.
Quite contrary, they both will be delivered exactly because original mask
won't be restored until the 1st handler returns.
Unless, of course, the sigaction->sa_mask of the 1st signal blocks another
one.
Didn't I say you do not read my emails? I have already explained this to
you in this thread ;)
Oleg.
From: Oleg Nesterov
> Sent: 06 June 2019 13:41
> On 06/06, David Laight wrote:
> >
> > Some of this code is hard to grep through :-)
>
> I'd suggest to simply read the kernel code once and memorise it, after
> that you will not need to use grep.
Unfortunately all the available buffer space is full of the SYSV and NetBSD
kernels, there isn't any room for the Linux one :-)
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
task->saved_sigmask and ->restore_sigmask are only used in the ret-from-
syscall paths. This means that set_user_sigmask() can save ->blocked in
->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked
was modified.
This way the callers do not need 2 sigset_t's passed to set/restore and
restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns
into the trivial helper which just calls restore_saved_sigmask().
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/aio.c | 20 +++++--------
fs/eventpoll.c | 12 +++-----
fs/io_uring.c | 11 ++-----
fs/select.c | 34 ++++++++--------------
include/linux/compat.h | 3 +-
include/linux/sched/signal.h | 12 ++++++--
include/linux/signal.h | 4 ---
kernel/signal.c | 69 ++++++++++++--------------------------------
8 files changed, 57 insertions(+), 108 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index c1e581d..8200f97 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2093,7 +2093,6 @@ SYSCALL_DEFINE6(io_pgetevents,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2104,14 +2103,14 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2129,7 +2128,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2141,14 +2139,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2197,7 +2195,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
bool interrupted;
int ret;
@@ -2208,14 +2205,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2233,7 +2230,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 t;
bool interrupted;
int ret;
@@ -2244,14 +2240,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4c74c76..0f9c073 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
+ restore_saved_sigmask_unless(error == -EINTR);
return error;
}
@@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ err = set_compat_user_sigmask(sigmask, sigsetsize);
if (err)
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
+ restore_saved_sigmask_unless(err == -EINTR);
return err;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1147c5d..ad7b9bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2180,7 +2180,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
struct io_cq_ring *ring = ctx->cq_ring;
- sigset_t ksigmask, sigsaved;
int ret;
if (io_cqring_events(ring) >= min_events)
@@ -2190,21 +2189,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- &ksigmask, &sigsaved, sigsz);
+ sigsz);
else
#endif
- ret = set_user_sigmask(sig, &ksigmask,
- &sigsaved, sigsz);
+ ret = set_user_sigmask(sig, sigsz);
if (ret)
return ret;
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
- if (sig)
- restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+ restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
diff --git a/fs/select.c b/fs/select.c
index a4d8f6e..1fc1b24 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize,
enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
@@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
void __user *tsp, compat_sigset_t __user *sigmask,
compat_size_t sigsetsize, enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
unsigned int, nfds, struct __kernel_timespec __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ebddcb6..16dafd9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -138,8 +138,7 @@ typedef struct {
compat_sigset_word sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize);
struct compat_sigaction {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 38a0f07..4d9c71d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -417,7 +417,6 @@ void task_join_group_stop(struct task_struct *task);
static inline void set_restore_sigmask(void)
{
set_thread_flag(TIF_RESTORE_SIGMASK);
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
@@ -448,7 +447,6 @@ static inline bool test_and_clear_restore_sigmask(void)
static inline void set_restore_sigmask(void)
{
current->restore_sigmask = true;
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
@@ -481,6 +479,16 @@ static inline void restore_saved_sigmask(void)
__set_current_blocked(¤t->saved_sigmask);
}
+extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
+
+static inline void restore_saved_sigmask_unless(bool interrupted)
+{
+ if (interrupted)
+ WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+ else
+ restore_saved_sigmask();
+}
+
static inline sigset_t *sigmask_to_save(void)
{
sigset_t *res = ¤t->blocked;
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 78c2bb3..b5d99482 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -273,10 +273,6 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
-extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize);
-extern void restore_user_sigmask(const void __user *usigmask,
- sigset_t *sigsaved, bool interrupted);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
diff --git a/kernel/signal.c b/kernel/signal.c
index aa6a6f1..1a1013e1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2863,80 +2863,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(&kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
--
2.5.0
On 06/06, David Laight wrote:
>
> If a signal handler is called, I presume that the trampoline
> calls back into the kernel to get further handlers called
> and to finally restore the original signal mask?
See sigmask_to_save(), this is what the kernel records in uc.uc_sigmask
before the signal handler runs, after that current->saved_sigmask has no
meaning.
When signal handler returns it does sys_rt_sigreturn() which restores
the original mask saved in uc_sigmask.
> What happens if a signal handler calls something that
> would normally write to current->saved_sigmask?
See above.
Oleg.
From: Oleg Nesterov
> Sent: 06 June 2019 12:05
> On 06/06, David Laight wrote:
> >
> > If a signal handler is called, I presume that the trampoline
> > calls back into the kernel to get further handlers called
> > and to finally restore the original signal mask?
>
> See sigmask_to_save(), this is what the kernel records in uc.uc_sigmask
> before the signal handler runs, after that current->saved_sigmask has no
> meaning.
Some of this code is hard to grep through :-)
> When signal handler returns it does sys_rt_sigreturn() which restores
> the original mask saved in uc_sigmask.
Does that mean that if 2 signals interrupt epoll_wait() only
one of the signal handlers is run?
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Lets start from fs/select.c
I have no idead how to test these changes, please review.
On top of "[PATCH -mm V2 1/1] signal: simplify set_user_sigmask/restore_user_sigmask"
it seems that we can do more unrelated cleanups in this code, say,
poll_select_prepare(enum poll_time_type pt_type).
Oleg.
fs/select.c | 68 +++++++++++++++----------------------------------------------
1 file changed, 16 insertions(+), 52 deletions(-)
do_poll() returns -EINTR if interrupted and after that all its callers
have to translate it into -ERESTARTNOHAND. Change do_poll() to return
-ERESTARTNOHAND and update (simplify) the callers.
Note that this also unifies all users of restore_saved_sigmask_unless(),
see the next patch.
Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/select.c | 30 +++++++-----------------------
1 file changed, 7 insertions(+), 23 deletions(-)
diff --git a/fs/select.c b/fs/select.c
index 1fc1b24..57712c3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -925,7 +925,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
if (!count) {
count = wait->error;
if (signal_pending(current))
- count = -EINTR;
+ count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
@@ -1040,7 +1040,7 @@ static long do_restart_poll(struct restart_block *restart_block)
ret = do_sys_poll(ufds, nfds, to);
- if (ret == -EINTR) {
+ if (ret == -ERESTARTNOHAND) {
restart_block->fn = do_restart_poll;
ret = -ERESTART_RESTARTBLOCK;
}
@@ -1061,7 +1061,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
ret = do_sys_poll(ufds, nfds, to);
- if (ret == -EINTR) {
+ if (ret == -ERESTARTNOHAND) {
struct restart_block *restart_block;
restart_block = ¤t->restart_block;
@@ -1102,11 +1102,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return ret;
ret = do_sys_poll(ufds, nfds, to);
-
- restore_saved_sigmask_unless(ret == -EINTR);
- /* We can restart this syscall, usually */
- if (ret == -EINTR)
- ret = -ERESTARTNOHAND;
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
@@ -1135,11 +1131,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
return ret;
ret = do_sys_poll(ufds, nfds, to);
-
- restore_saved_sigmask_unless(ret == -EINTR);
- /* We can restart this syscall, usually */
- if (ret == -EINTR)
- ret = -ERESTARTNOHAND;
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1413,11 +1405,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
return ret;
ret = do_sys_poll(ufds, nfds, to);
-
- restore_saved_sigmask_unless(ret == -EINTR);
- /* We can restart this syscall, usually */
- if (ret == -EINTR)
- ret = -ERESTARTNOHAND;
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1446,11 +1434,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
return ret;
ret = do_sys_poll(ufds, nfds, to);
-
- restore_saved_sigmask_unless(ret == -EINTR);
- /* We can restart this syscall, usually */
- if (ret == -EINTR)
- ret = -ERESTARTNOHAND;
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
--
2.5.0
On Thu, Jun 6, 2019 at 7:09 AM Oleg Nesterov <[email protected]> wrote:
>
> do_poll() returns -EINTR if interrupted and after that all its callers
> have to translate it into -ERESTARTNOHAND. Change do_poll() to return
> -ERESTARTNOHAND and update (simplify) the callers.
Ack.
The *right* return value will actually be then chosen by
poll_select_copy_remaining(), which will turn ERESTARTNOHAND to EINTR
when it can't update the timeout.
Except for the cases that use restart_block and do that instead and
don't have the whole timeout restart issue as a result.
Linus
While reviewing Oleg's patches I realized a bunch of the logic around
saved_sigmask was redundant. So I dug just to see what I could see.
I turns out that real_blocked and saved_sigmask were different
implementations of the same idea for slightly different purposes.
Which means we only need either real_blocked or saved_sigmask.
I chose real_blocked as it has just a little bit of code associated
with it to disable optimizations on the signal sending path that do
not apply to blocked signals.
I did a little bit of cleanup of the users. Modified the core to keep
real_blocked in sync with blocked except while a clever system call that
like pselect or sigtimedwait is running.
After the dust cleared this allowed restore_sigmask and all of the logic
to keep it valid to be removed entirely.
I have only done the most cursory of testing at this point. Does anyone
have any thoughts in cleaning up the code in this direction?
Eric W. Biederman (5):
signal: Teach sigsuspend to use set_user_sigmask
signal/kvm: Stop using sigprocmask in kvm_sigset_(activate|deactivate)
signal: Always keep real_blocked in sync with blocked
signal: Remove saved_sigmask
signal: Remove the unnecessary restore_sigmask flag
arch/arc/include/asm/thread_info.h | 1 -
arch/arm/include/asm/thread_info.h | 1 -
arch/arm64/include/asm/thread_info.h | 1 -
arch/c6x/include/asm/thread_info.h | 1 -
arch/csky/include/asm/thread_info.h | 2 -
arch/h8300/include/asm/thread_info.h | 1 -
arch/hexagon/include/asm/thread_info.h | 1 -
arch/m68k/include/asm/thread_info.h | 1 -
arch/mips/include/asm/thread_info.h | 1 -
arch/nds32/include/asm/thread_info.h | 2 -
arch/nios2/include/asm/thread_info.h | 2 -
arch/riscv/include/asm/thread_info.h | 1 -
arch/s390/include/asm/thread_info.h | 1 -
arch/sparc/include/asm/thread_info_32.h | 1 -
arch/um/include/asm/thread_info.h | 1 -
arch/unicore32/include/asm/thread_info.h | 1 -
arch/xtensa/include/asm/thread_info.h | 1 -
include/linux/sched.h | 5 --
include/linux/sched/signal.h | 84 +-------------------------------
kernel/ptrace.c | 15 ++----
kernel/signal.c | 56 +++++++++------------
virt/kvm/kvm_main.c | 11 +----
22 files changed, 31 insertions(+), 160 deletions(-)
Eric
On 06/07, Eric W. Biederman wrote:
>
> Eric W. Biederman (5):
> signal: Teach sigsuspend to use set_user_sigmask
> signal/kvm: Stop using sigprocmask in kvm_sigset_(activate|deactivate)
> signal: Always keep real_blocked in sync with blocked
> signal: Remove saved_sigmask
> signal: Remove the unnecessary restore_sigmask flag
Reviewed-by: Oleg Nesterov <[email protected]>
I guess this should be routed via -mm tree? This depends on
signal-simplify-set_user_sigmask-restore_user_sigmask.patch
Oleg.