The kill() syscall operates on process identifiers. After a process has
exited its pid can be reused by another process. If a caller sends a signal
to a reused pid it will end up signaling the wrong process. This issue has
often surfaced and there has been a push [1] to address this problem.
This patch uses file descriptors from proc/<pid> as stable handles on
struct pid. Even if a pid is recycled the handle will not change. The file
descriptor can be used to send signals to the referenced process.
Thus, the new syscall procfd_signal() is introduced to solve this problem.
It operates on a process file descriptor.
The syscall takes an additional siginfo_t and flags argument. If siginfo_t
is NULL then procfd_signal() behaves like kill() if it is not NULL it
behaves like rt_sigqueueinfo.
The flags argument is added to allow for future extensions of this syscall.
It currently needs to be passed as 0.
With this patch a process can be killed via:
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
int main(int argc, char *argv[])
{
int ret;
char buf[1000];
if (argc < 2)
exit(EXIT_FAILURE);
ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
if (ret < 0)
exit(EXIT_FAILURE);
int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
if (fd < 0) {
printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
exit(EXIT_FAILURE);
}
ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
if (ret < 0) {
printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
close(fd);
exit(EXIT_FAILURE);
}
close(fd);
exit(EXIT_SUCCESS);
}
[1]: https://lkml.org/lkml/2018/11/18/130
Cc: "Eric W. Biederman" <[email protected]>
Cc: Serge Hallyn <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Andy Lutomirsky <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Aleksa Sarai <[email protected]>
Cc: Al Viro <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
---
Changelog:
v2:
- define __NR_procfd_signal in unistd.h
- wire up compat syscall
- s/proc_is_procfd/proc_is_tgid_procfd/g
- provide stubs when CONFIG_PROC_FS=n
- move proc_pid() to linux/proc_fs.h header
- use proc_pid() to grab struct pid from /proc/<pid> fd
v1:
- patch introduced
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 2 +
fs/proc/base.c | 11 ++-
fs/proc/internal.h | 5 -
include/linux/proc_fs.h | 12 +++
include/linux/syscalls.h | 2 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/signal.c | 127 +++++++++++++++++++++++--
8 files changed, 151 insertions(+), 13 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..3f27ffd8ae87 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq
+387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..8a30cde82450 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq
+335 64 procfd_signal __x64_sys_procfd_signal
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -386,3 +387,4 @@
545 x32 execveat __x32_compat_sys_execveat/ptregs
546 x32 preadv2 __x32_compat_sys_preadv64v2
547 x32 pwritev2 __x32_compat_sys_pwritev64v2
+548 x32 procfd_signal __x32_compat_sys_procfd_signal
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce3465479447..771c6bd1cac6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
-
+struct pid *proc_pid(const struct inode *inode)
+{
+ return PROC_I(inode)->pid;
+}
static const struct inode_operations proc_def_inode_operations = {
.setattr = proc_setattr,
@@ -3038,6 +3041,12 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek,
};
+bool proc_is_tgid_procfd(const struct file *file)
+{
+ return d_is_dir(file->f_path.dentry) &&
+ (file->f_op == &proc_tgid_base_operations);
+}
+
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return proc_pident_lookup(dir, dentry,
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..eb69afba83f3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -113,11 +113,6 @@ static inline void *__PDE_DATA(const struct inode *inode)
return PDE(inode)->data;
}
-static inline struct pid *proc_pid(const struct inode *inode)
-{
- return PROC_I(inode)->pid;
-}
-
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
return get_pid_task(proc_pid(inode), PIDTYPE_PID);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d0e1f1522a78..96df2fe6311d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -73,6 +73,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
int (*show)(struct seq_file *, void *),
proc_write_t write,
void *data);
+extern bool proc_is_tgid_procfd(const struct file *file);
+extern struct pid *proc_pid(const struct inode *inode);
#else /* CONFIG_PROC_FS */
@@ -114,6 +116,16 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
#define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
#define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
+static inline bool proc_is_tgid_procfd(const struct file *file)
+{
+ return false;
+}
+
+static inline struct pid *proc_pid(const struct inode *inode)
+{
+ return NULL;
+}
+
#endif /* CONFIG_PROC_FS */
struct net;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ac3d13a915b..a5ca8cb84566 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -907,6 +907,8 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
+asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
+ int flags);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 538546edbfbd..4dc81a994ad1 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -738,9 +738,11 @@ __SYSCALL(__NR_statx, sys_statx)
__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
#define __NR_rseq 293
__SYSCALL(__NR_rseq, sys_rseq)
+#define __NR_procfd_signal 294
+__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
#undef __NR_syscalls
-#define __NR_syscalls 294
+#define __NR_syscalls 295
/*
* 32 bit systems traditionally used different
diff --git a/kernel/signal.c b/kernel/signal.c
index 9a32bc2088c9..13695342f150 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,7 +19,9 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
@@ -3286,6 +3288,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
}
#endif
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+ clear_siginfo(info);
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = task_tgid_vnr(current);
+ info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3295,16 +3307,119 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info);
return kill_something_info(sig, &info, pid);
}
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool may_signal_procfd(struct pid *pid)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *p = ns_of_pid(pid);
+
+ for (;;) {
+ if (!p)
+ return false;
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return true;
+}
+
+static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
+ bool had_siginfo)
+{
+ int ret;
+ struct fd f;
+ struct pid *pid;
+
+ /* Enforce flags be set to 0 until we add an extension. */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget_raw(fd);
+ if (!f.file)
+ return -EBADF;
+
+ /* Is this a process file descriptor? */
+ ret = -EINVAL;
+ if (!proc_is_tgid_procfd(f.file))
+ goto err;
+
+ /* Without CONFIG_PROC_FS proc_pid() returns NULL. */
+ pid = proc_pid(file_inode(f.file));
+ if (!pid)
+ goto err;
+
+ if (!may_signal_procfd(pid))
+ goto err;
+
+ if (had_siginfo) {
+ /*
+ * Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds
+ * source info.
+ */
+ ret = -EPERM;
+ if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
+ (task_pid(current) != pid))
+ goto err;
+ } else {
+ prepare_kill_siginfo(sig, kinfo);
+ }
+
+ ret = kill_pid_info(sig, kinfo, pid);
+
+err:
+ fdput(f);
+ return ret;
+}
+
+/**
+ * sys_procfd_signal - send a signal to a process through a process file
+ * descriptor
+ * @fd: the file descriptor of the process
+ * @sig: signal to be sent
+ * @info: the signal info
+ * @flags: future flags to be passed
+ */
+SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
+ int, flags)
+{
+ kernel_siginfo_t kinfo;
+
+ if (info) {
+ int ret = __copy_siginfo_from_user(sig, &kinfo, info);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
+ struct compat_siginfo __user *, info, int, flags)
+{
+ kernel_siginfo_t kinfo;
+
+ if (info) {
+ int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
+}
+#endif
+
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
--
2.19.1
Signed-off-by: Christian Brauner <[email protected]>
---
man2/procfd_signal.2 | 208 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 208 insertions(+)
create mode 100644 man2/procfd_signal.2
diff --git a/man2/procfd_signal.2 b/man2/procfd_signal.2
new file mode 100644
index 000000000..889bb426f
--- /dev/null
+++ b/man2/procfd_signal.2
@@ -0,0 +1,208 @@
+.\" Copyright (C) 2018 Christian Brauner <[email protected]>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date. The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein. The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH PROCFD_SIGNAL 2 2017-09-15 "Linux" "Linux Programmer's Manual"
+.SH NAME
+procfd_signal \- send signal to a process through a process descriptor
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.B #include <signal.h>
+.PP
+.BI "int procfd_signal(int " fd ", int " sig ", siginfo_t *" info ", int " flags );
+.fi
+.SH DESCRIPTION
+.BR procfd_signal ()
+sends the signal specified in
+.I sig
+to the process identified by the file descriptor
+.IR fd .
+The permissions required to send a signal are the same as for
+.BR kill (2).
+As with
+.BR kill (2),
+the null signal (0) can be used to check if a process with a given
+PID exists.
+.PP
+The optional
+.I info
+argument specifies the data to accompany the signal.
+This argument is a pointer to a structure of type
+.IR siginfo_t ,
+described in
+.BR sigaction (2)
+(and defined by including
+.IR <sigaction.h> ).
+The caller should set the following fields in this structure:
+.TP
+.I si_code
+This must be one of the
+.B SI_*
+codes in the Linux kernel source file
+.IR include/asm-generic/siginfo.h ,
+with the restriction that the code must be negative
+(i.e., cannot be
+.BR SI_USER ,
+which is used by the kernel to indicate a signal sent by
+.BR kill (2))
+and cannot (since Linux 2.6.39) be
+.BR SI_TKILL
+(which is used by the kernel to indicate a signal sent using
+.\" tkill(2) or
+.BR tgkill (2)).
+.TP
+.I si_pid
+This should be set to a process ID,
+typically the process ID of the sender.
+.TP
+.I si_uid
+This should be set to a user ID,
+typically the real user ID of the sender.
+.TP
+.I si_value
+This field contains the user data to accompany the signal.
+For more information, see the description of the last
+.RI ( "union sigval" )
+argument of
+.BR sigqueue (3).
+.PP
+Internally, the kernel sets the
+.I si_signo
+field to the value specified in
+.IR sig ,
+so that the receiver of the signal can also obtain
+the signal number via that field.
+.PP
+The
+.I flags
+argument is reserved for future extension and must be set to 0.
+.PP
+.SH RETURN VALUE
+On success, this system call returns 0.
+On error, it returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+An invalid signal was specified.
+.TP
+.B EINVAL
+.I fd
+does not refer to a process.
+.TP
+.B EINVAL
+The flags argument was not 0.
+.TP
+.B EINVAL
+The calling process and the target process are neither located
+in the same pid namespace nor is the calling process's
+pid namespace an ancestor of the target process's pid namespaces
+(see
+.BR pid_namespaces (7)).
+.TP
+.B EPERM
+The caller does not have permission to send the signal to the target.
+For the required permissions, see
+.BR kill (2).
+Or:
+.I uinfo->si_code
+is invalid.
+.TP
+.B ESRCH
+The process or process group does not exist.
+Note that an existing process might be a zombie,
+a process that has terminated execution, but
+has not yet been
+.BR wait (2)ed
+for.
+.SH CONFORMING TO
+This system call is Linux-specific.
+.SH NOTES
+The
+.BR procfd_signal ()
+system call will only send signals to processes which either are in
+the same pid namespace or are in a descendant pid namespace of
+the caller.
+.SH EXAMPLE
+The following program demonstrates how to kill a process with
+.BR procfd_signal ()
+.SS Program source
+.EX
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+int
+main(int argc, char *argv[])
+{
+ int ret;
+ char buf[1000];
+
+ if (argc < 2)
+ exit(EXIT_FAILURE);
+
+ ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
+ if (ret < 0)
+ exit(EXIT_FAILURE);
+
+ int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
+ if (fd < 0) {
+ printf("%s - Failed to open \\"%s\\"\\n", strerror(errno), buf);
+ exit(EXIT_FAILURE);
+ }
+
+ ret = procfd_signal(fd, SIGKILL, NULL, 0);
+ if (ret < 0) {
+ printf("Failed to send SIGKILL \\"%s\\"\\n", strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+
+ close(fd);
+
+ exit(EXIT_SUCCESS);
+}
+.EE
+.SH SEE ALSO
+.BR kill (2),
+.BR sigaction (2),
+.BR sigprocmask (2),
+.BR tgkill (2),
+.BR pthread_sigqueue (3),
+.BR rt_sigqueueinfo (2),
+.BR sigqueue (3),
+.BR signal (7)
--
2.19.1
On Tue, Nov 20, 2018 at 11:51:23AM +0100, Christian Brauner wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
>
> This patch uses file descriptors from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill() if it is not NULL it
> behaves like rt_sigqueueinfo.
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
>
> With this patch a process can be killed via:
>
> #define _GNU_SOURCE
> #include <errno.h>
> #include <fcntl.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <signal.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <unistd.h>
>
> int main(int argc, char *argv[])
> {
> int ret;
> char buf[1000];
>
> if (argc < 2)
> exit(EXIT_FAILURE);
>
> ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
> if (ret < 0)
|| ret > sizeof(buf) ? :-) I mean, you *are* passing the string...
> exit(EXIT_FAILURE);
>
> int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
> if (fd < 0) {
> printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
> exit(EXIT_FAILURE);
> }
>
> ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
> if (ret < 0) {
> printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
> close(fd);
> exit(EXIT_FAILURE);
> }
>
> close(fd);
>
> exit(EXIT_SUCCESS);
> }
>
> [1]: https://lkml.org/lkml/2018/11/18/130
>
> Cc: "Eric W. Biederman" <[email protected]>
> Cc: Serge Hallyn <[email protected]>
Acked-by: Serge Hallyn <[email protected]>
> Cc: Jann Horn <[email protected]>
> Cc: Kees Cook <[email protected]>
> Cc: Andy Lutomirsky <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Aleksa Sarai <[email protected]>
> Cc: Al Viro <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/<pid> fd
> v1:
> - patch introduced
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 2 +
> fs/proc/base.c | 11 ++-
> fs/proc/internal.h | 5 -
> include/linux/proc_fs.h | 12 +++
> include/linux/syscalls.h | 2 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/signal.c | 127 +++++++++++++++++++++++--
> 8 files changed, 151 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
> 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
> 386 i386 rseq sys_rseq __ia32_sys_rseq
> +387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
> 332 common statx __x64_sys_statx
> 333 common io_pgetevents __x64_sys_io_pgetevents
> 334 common rseq __x64_sys_rseq
> +335 64 procfd_signal __x64_sys_procfd_signal
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int mask)
> return generic_permission(inode, mask);
> }
>
> -
> +struct pid *proc_pid(const struct inode *inode)
> +{
> + return PROC_I(inode)->pid;
> +}
>
> static const struct inode_operations proc_def_inode_operations = {
> .setattr = proc_setattr,
> @@ -3038,6 +3041,12 @@ static const struct file_operations proc_tgid_base_operations = {
> .llseek = generic_file_llseek,
> };
>
> +bool proc_is_tgid_procfd(const struct file *file)
> +{
> + return d_is_dir(file->f_path.dentry) &&
> + (file->f_op == &proc_tgid_base_operations);
> +}
> +
> static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
> {
> return proc_pident_lookup(dir, dentry,
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 5185d7f6a51e..eb69afba83f3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -113,11 +113,6 @@ static inline void *__PDE_DATA(const struct inode *inode)
> return PDE(inode)->data;
> }
>
> -static inline struct pid *proc_pid(const struct inode *inode)
> -{
> - return PROC_I(inode)->pid;
> -}
> -
> static inline struct task_struct *get_proc_task(const struct inode *inode)
> {
> return get_pid_task(proc_pid(inode), PIDTYPE_PID);
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index d0e1f1522a78..96df2fe6311d 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -73,6 +73,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
> int (*show)(struct seq_file *, void *),
> proc_write_t write,
> void *data);
> +extern bool proc_is_tgid_procfd(const struct file *file);
> +extern struct pid *proc_pid(const struct inode *inode);
>
> #else /* CONFIG_PROC_FS */
>
> @@ -114,6 +116,16 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
> #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
> #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
>
> +static inline bool proc_is_tgid_procfd(const struct file *file)
> +{
> + return false;
> +}
> +
> +static inline struct pid *proc_pid(const struct inode *inode)
> +{
> + return NULL;
> +}
> +
> #endif /* CONFIG_PROC_FS */
>
> struct net;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2ac3d13a915b..a5ca8cb84566 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -907,6 +907,8 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
> asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
> int flags, uint32_t sig);
> +asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
> + int flags);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 538546edbfbd..4dc81a994ad1 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -738,9 +738,11 @@ __SYSCALL(__NR_statx, sys_statx)
> __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
> #define __NR_rseq 293
> __SYSCALL(__NR_rseq, sys_rseq)
> +#define __NR_procfd_signal 294
> +__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 294
> +#define __NR_syscalls 295
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9a32bc2088c9..13695342f150 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -19,7 +19,9 @@
> #include <linux/sched/task.h>
> #include <linux/sched/task_stack.h>
> #include <linux/sched/cputime.h>
> +#include <linux/file.h>
> #include <linux/fs.h>
> +#include <linux/proc_fs.h>
> #include <linux/tty.h>
> #include <linux/binfmts.h>
> #include <linux/coredump.h>
> @@ -3286,6 +3288,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
> }
> #endif
>
> +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
> +{
> + clear_siginfo(info);
> + info->si_signo = sig;
> + info->si_errno = 0;
> + info->si_code = SI_USER;
> + info->si_pid = task_tgid_vnr(current);
> + info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +}
> +
> /**
> * sys_kill - send a signal to a process
> * @pid: the PID of the process
> @@ -3295,16 +3307,119 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
> {
> struct kernel_siginfo info;
>
> - clear_siginfo(&info);
> - info.si_signo = sig;
> - info.si_errno = 0;
> - info.si_code = SI_USER;
> - info.si_pid = task_tgid_vnr(current);
> - info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
> + prepare_kill_siginfo(sig, &info);
>
> return kill_something_info(sig, &info, pid);
> }
>
> +/*
> + * Verify that the signaler and signalee either are in the same pid namespace
> + * or that the signaler's pid namespace is an ancestor of the signalee's pid
> + * namespace.
> + */
> +static bool may_signal_procfd(struct pid *pid)
> +{
> + struct pid_namespace *active = task_active_pid_ns(current);
> + struct pid_namespace *p = ns_of_pid(pid);
> +
> + for (;;) {
> + if (!p)
> + return false;
> + if (p == active)
> + break;
> + p = p->parent;
> + }
> +
> + return true;
> +}
> +
> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> + bool had_siginfo)
> +{
> + int ret;
> + struct fd f;
> + struct pid *pid;
> +
> + /* Enforce flags be set to 0 until we add an extension. */
> + if (flags)
> + return -EINVAL;
> +
> + f = fdget_raw(fd);
> + if (!f.file)
> + return -EBADF;
> +
> + /* Is this a process file descriptor? */
> + ret = -EINVAL;
> + if (!proc_is_tgid_procfd(f.file))
> + goto err;
> +
> + /* Without CONFIG_PROC_FS proc_pid() returns NULL. */
> + pid = proc_pid(file_inode(f.file));
> + if (!pid)
> + goto err;
> +
> + if (!may_signal_procfd(pid))
> + goto err;
> +
> + if (had_siginfo) {
> + /*
> + * Not even root can pretend to send signals from the kernel.
> + * Nor can they impersonate a kill()/tgkill(), which adds
> + * source info.
> + */
> + ret = -EPERM;
> + if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
> + (task_pid(current) != pid))
> + goto err;
> + } else {
> + prepare_kill_siginfo(sig, kinfo);
> + }
> +
> + ret = kill_pid_info(sig, kinfo, pid);
> +
> +err:
> + fdput(f);
> + return ret;
> +}
> +
> +/**
> + * sys_procfd_signal - send a signal to a process through a process file
> + * descriptor
> + * @fd: the file descriptor of the process
> + * @sig: signal to be sent
> + * @info: the signal info
> + * @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> + int, flags)
> +{
> + kernel_siginfo_t kinfo;
> +
> + if (info) {
> + int ret = __copy_siginfo_from_user(sig, &kinfo, info);
> + if (unlikely(ret))
> + return ret;
> + }
> +
> + return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +
> +#ifdef CONFIG_COMPAT
> +COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
> + struct compat_siginfo __user *, info, int, flags)
> +{
> + kernel_siginfo_t kinfo;
> +
> + if (info) {
> + int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
> + if (unlikely(ret))
> + return ret;
> + }
> +
> + return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +#endif
> +
> static int
> do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
> {
> --
> 2.19.1
On 2018-11-20, Christian Brauner <[email protected]> wrote:
> The kill() syscall operates on process identifiers. After a process has
> exited its pid can be reused by another process. If a caller sends a signal
> to a reused pid it will end up signaling the wrong process. This issue has
> often surfaced and there has been a push [1] to address this problem.
>
> This patch uses file descriptors from proc/<pid> as stable handles on
> struct pid. Even if a pid is recycled the handle will not change. The file
> descriptor can be used to send signals to the referenced process.
> Thus, the new syscall procfd_signal() is introduced to solve this problem.
> It operates on a process file descriptor.
> The syscall takes an additional siginfo_t and flags argument. If siginfo_t
> is NULL then procfd_signal() behaves like kill() if it is not NULL it
> behaves like rt_sigqueueinfo.
> The flags argument is added to allow for future extensions of this syscall.
> It currently needs to be passed as 0.
>
> With this patch a process can be killed via:
>
> #define _GNU_SOURCE
> #include <errno.h>
> #include <fcntl.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <signal.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <unistd.h>
>
> int main(int argc, char *argv[])
> {
> int ret;
> char buf[1000];
>
> if (argc < 2)
> exit(EXIT_FAILURE);
>
> ret = snprintf(buf, sizeof(buf), "/proc/%s", argv[1]);
> if (ret < 0)
> exit(EXIT_FAILURE);
>
> int fd = open(buf, O_DIRECTORY | O_CLOEXEC);
> if (fd < 0) {
> printf("%s - Failed to open \"%s\"\n", strerror(errno), buf);
> exit(EXIT_FAILURE);
> }
>
> ret = syscall(__NR_procfd_signal, fd, SIGKILL, NULL, 0);
> if (ret < 0) {
> printf("Failed to send SIGKILL \"%s\"\n", strerror(errno));
> close(fd);
> exit(EXIT_FAILURE);
> }
>
> close(fd);
>
> exit(EXIT_SUCCESS);
> }
>
> [1]: https://lkml.org/lkml/2018/11/18/130
>
> Cc: "Eric W. Biederman" <[email protected]>
> Cc: Serge Hallyn <[email protected]>
> Cc: Jann Horn <[email protected]>
> Cc: Kees Cook <[email protected]>
> Cc: Andy Lutomirsky <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Aleksa Sarai <[email protected]>
Acked-by: Aleksa Sarai <[email protected]>
> Cc: Al Viro <[email protected]>
> Signed-off-by: Christian Brauner <[email protected]>
> ---
> Changelog:
> v2:
> - define __NR_procfd_signal in unistd.h
> - wire up compat syscall
> - s/proc_is_procfd/proc_is_tgid_procfd/g
> - provide stubs when CONFIG_PROC_FS=n
> - move proc_pid() to linux/proc_fs.h header
> - use proc_pid() to grab struct pid from /proc/<pid> fd
> v1:
> - patch introduced
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 2 +
> fs/proc/base.c | 11 ++-
> fs/proc/internal.h | 5 -
> include/linux/proc_fs.h | 12 +++
> include/linux/syscalls.h | 2 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/signal.c | 127 +++++++++++++++++++++++--
> 8 files changed, 151 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
> 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
> 386 i386 rseq sys_rseq __ia32_sys_rseq
> +387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
> 332 common statx __x64_sys_statx
> 333 common io_pgetevents __x64_sys_io_pgetevents
> 334 common rseq __x64_sys_rseq
> +335 64 procfd_signal __x64_sys_procfd_signal
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ce3465479447..771c6bd1cac6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -716,7 +716,10 @@ static int proc_pid_permission(struct inode *inode, int mask)
> return generic_permission(inode, mask);
> }
>
> -
> +struct pid *proc_pid(const struct inode *inode)
> +{
> + return PROC_I(inode)->pid;
> +}
>
> static const struct inode_operations proc_def_inode_operations = {
> .setattr = proc_setattr,
> @@ -3038,6 +3041,12 @@ static const struct file_operations proc_tgid_base_operations = {
> .llseek = generic_file_llseek,
> };
>
> +bool proc_is_tgid_procfd(const struct file *file)
> +{
> + return d_is_dir(file->f_path.dentry) &&
> + (file->f_op == &proc_tgid_base_operations);
> +}
> +
> static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
> {
> return proc_pident_lookup(dir, dentry,
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 5185d7f6a51e..eb69afba83f3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -113,11 +113,6 @@ static inline void *__PDE_DATA(const struct inode *inode)
> return PDE(inode)->data;
> }
>
> -static inline struct pid *proc_pid(const struct inode *inode)
> -{
> - return PROC_I(inode)->pid;
> -}
> -
> static inline struct task_struct *get_proc_task(const struct inode *inode)
> {
> return get_pid_task(proc_pid(inode), PIDTYPE_PID);
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index d0e1f1522a78..96df2fe6311d 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -73,6 +73,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
> int (*show)(struct seq_file *, void *),
> proc_write_t write,
> void *data);
> +extern bool proc_is_tgid_procfd(const struct file *file);
> +extern struct pid *proc_pid(const struct inode *inode);
>
> #else /* CONFIG_PROC_FS */
>
> @@ -114,6 +116,16 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
> #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
> #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
>
> +static inline bool proc_is_tgid_procfd(const struct file *file)
> +{
> + return false;
> +}
> +
> +static inline struct pid *proc_pid(const struct inode *inode)
> +{
> + return NULL;
> +}
> +
> #endif /* CONFIG_PROC_FS */
>
> struct net;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2ac3d13a915b..a5ca8cb84566 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -907,6 +907,8 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
> asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
> int flags, uint32_t sig);
> +asmlinkage long sys_procfd_signal(int fd, int sig, siginfo_t __user *info,
> + int flags);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 538546edbfbd..4dc81a994ad1 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -738,9 +738,11 @@ __SYSCALL(__NR_statx, sys_statx)
> __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
> #define __NR_rseq 293
> __SYSCALL(__NR_rseq, sys_rseq)
> +#define __NR_procfd_signal 294
> +__SC_COMP(__NR_procfd_signal, sys_procfd_signal, compat_sys_procfd_signal)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 294
> +#define __NR_syscalls 295
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9a32bc2088c9..13695342f150 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -19,7 +19,9 @@
> #include <linux/sched/task.h>
> #include <linux/sched/task_stack.h>
> #include <linux/sched/cputime.h>
> +#include <linux/file.h>
> #include <linux/fs.h>
> +#include <linux/proc_fs.h>
> #include <linux/tty.h>
> #include <linux/binfmts.h>
> #include <linux/coredump.h>
> @@ -3286,6 +3288,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
> }
> #endif
>
> +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
> +{
> + clear_siginfo(info);
> + info->si_signo = sig;
> + info->si_errno = 0;
> + info->si_code = SI_USER;
> + info->si_pid = task_tgid_vnr(current);
> + info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
> +}
> +
> /**
> * sys_kill - send a signal to a process
> * @pid: the PID of the process
> @@ -3295,16 +3307,119 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
> {
> struct kernel_siginfo info;
>
> - clear_siginfo(&info);
> - info.si_signo = sig;
> - info.si_errno = 0;
> - info.si_code = SI_USER;
> - info.si_pid = task_tgid_vnr(current);
> - info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
> + prepare_kill_siginfo(sig, &info);
>
> return kill_something_info(sig, &info, pid);
> }
>
> +/*
> + * Verify that the signaler and signalee either are in the same pid namespace
> + * or that the signaler's pid namespace is an ancestor of the signalee's pid
> + * namespace.
> + */
> +static bool may_signal_procfd(struct pid *pid)
> +{
> + struct pid_namespace *active = task_active_pid_ns(current);
> + struct pid_namespace *p = ns_of_pid(pid);
> +
> + for (;;) {
> + if (!p)
> + return false;
> + if (p == active)
> + break;
> + p = p->parent;
> + }
> +
> + return true;
> +}
> +
> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> + bool had_siginfo)
> +{
> + int ret;
> + struct fd f;
> + struct pid *pid;
> +
> + /* Enforce flags be set to 0 until we add an extension. */
> + if (flags)
> + return -EINVAL;
> +
> + f = fdget_raw(fd);
> + if (!f.file)
> + return -EBADF;
> +
> + /* Is this a process file descriptor? */
> + ret = -EINVAL;
> + if (!proc_is_tgid_procfd(f.file))
> + goto err;
> +
> + /* Without CONFIG_PROC_FS proc_pid() returns NULL. */
> + pid = proc_pid(file_inode(f.file));
> + if (!pid)
> + goto err;
> +
> + if (!may_signal_procfd(pid))
> + goto err;
> +
> + if (had_siginfo) {
> + /*
> + * Not even root can pretend to send signals from the kernel.
> + * Nor can they impersonate a kill()/tgkill(), which adds
> + * source info.
> + */
> + ret = -EPERM;
> + if ((kinfo->si_code >= 0 || kinfo->si_code == SI_TKILL) &&
> + (task_pid(current) != pid))
> + goto err;
> + } else {
> + prepare_kill_siginfo(sig, kinfo);
> + }
> +
> + ret = kill_pid_info(sig, kinfo, pid);
> +
> +err:
> + fdput(f);
> + return ret;
> +}
> +
> +/**
> + * sys_procfd_signal - send a signal to a process through a process file
> + * descriptor
> + * @fd: the file descriptor of the process
> + * @sig: signal to be sent
> + * @info: the signal info
> + * @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> + int, flags)
> +{
> + kernel_siginfo_t kinfo;
> +
> + if (info) {
> + int ret = __copy_siginfo_from_user(sig, &kinfo, info);
> + if (unlikely(ret))
> + return ret;
> + }
> +
> + return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +
> +#ifdef CONFIG_COMPAT
> +COMPAT_SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig,
> + struct compat_siginfo __user *, info, int, flags)
> +{
> + kernel_siginfo_t kinfo;
> +
> + if (info) {
> + int ret = __copy_siginfo_from_user32(sig, &kinfo, info);
> + if (unlikely(ret))
> + return ret;
> + }
> +
> + return do_procfd_signal(fd, sig, &kinfo, flags, !!info);
> +}
> +#endif
> +
> static int
> do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
> {
> --
> 2.19.1
>
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On Tue, Nov 20, 2018 at 11:54 AM Christian Brauner <[email protected]> wrote:
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 2 +
> fs/proc/base.c | 11 ++-
> fs/proc/internal.h | 5 -
> include/linux/proc_fs.h | 12 +++
> include/linux/syscalls.h | 2 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/signal.c | 127 +++++++++++++++++++++++--
> 8 files changed, 151 insertions(+), 13 deletions(-)
For asm-generic:
Acked-by: Arnd Bergmann <[email protected]>
I checked that the system call wired up correctly in a way that
works on all architectures using the generic syscall table.
Disclaimer: I'm looking at this patch because Christian requested it.
I'm not a kernel developer.
* Christian Brauner:
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..3f27ffd8ae87 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
> 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
> 386 i386 rseq sys_rseq __ia32_sys_rseq
> +387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..8a30cde82450 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
> 332 common statx __x64_sys_statx
> 333 common io_pgetevents __x64_sys_io_pgetevents
> 334 common rseq __x64_sys_rseq
> +335 64 procfd_signal __x64_sys_procfd_signal
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -386,3 +387,4 @@
> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
Is there a reason why these numbers have to be different?
(See the recent discussion with Andy Lutomirski.)
> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t *kinfo, int flags,
> + bool had_siginfo)
> +{
> + int ret;
> + struct fd f;
> + struct pid *pid;
> +
> + /* Enforce flags be set to 0 until we add an extension. */
> + if (flags)
> + return -EINVAL;
> +
> + f = fdget_raw(fd);
> + if (!f.file)
> + return -EBADF;
> +
> + /* Is this a process file descriptor? */
> + ret = -EINVAL;
> + if (!proc_is_tgid_procfd(f.file))
> + goto err;
[…]
> + ret = kill_pid_info(sig, kinfo, pid);
I would like to see some comment here what happens to zombie processes.
> +/**
> + * sys_procfd_signal - send a signal to a process through a process file
> + * descriptor
> + * @fd: the file descriptor of the process
> + * @sig: signal to be sent
> + * @info: the signal info
> + * @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> + int, flags)
Sorry, I'm quite unhappy with the name. “signal” is for signal handler
management. procfd_sendsignal, procfd_sigqueueinfo or something like
that would be fine. Even procfd_kill would be better IMHO.
Looking at the rt_tgsigqueueinfo interface, is there a way to implement
the “tg” part with the current procfd_signal interface? Would you use
openat to retrieve the Tgid: line from "status"?
Thanks,
Florian
> On Nov 29, 2018, at 4:28 AM, Florian Weimer <[email protected]> wrote:
>
> Disclaimer: I'm looking at this patch because Christian requested it.
> I'm not a kernel developer.
>
> * Christian Brauner:
>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -398,3 +398,4 @@
>> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
>> 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
>> 386 i386 rseq sys_rseq __ia32_sys_rseq
>> +387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index f0b1709a5ffb..8a30cde82450 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -343,6 +343,7 @@
>> 332 common statx __x64_sys_statx
>> 333 common io_pgetevents __x64_sys_io_pgetevents
>> 334 common rseq __x64_sys_rseq
>> +335 64 procfd_signal __x64_sys_procfd_signal
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>> @@ -386,3 +387,4 @@
>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
>
> Is there a reason why these numbers have to be different?
>
> (See the recent discussion with Andy Lutomirski.)
Hah, I missed this part of the patch. Let’s not add new x32 syscall numbers.
Also, can we perhaps rework this a bit to get rid of the compat entry point? The easier way would be to check in_compat_syscall(). The nicer way IMO would be to use the 64-bit structure for 32-bit as well.
On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>
>
>> On Nov 29, 2018, at 4:28 AM, Florian Weimer <[email protected]>
>wrote:
>>
>> Disclaimer: I'm looking at this patch because Christian requested it.
>> I'm not a kernel developer.
>>
>> * Christian Brauner:
>>
>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>b/arch/x86/entry/syscalls/syscall_32.tbl
>>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>> @@ -398,3 +398,4 @@
>>> 384 i386 arch_prctl sys_arch_prctl
>__ia32_compat_sys_arch_prctl
>>> 385 i386 io_pgetevents sys_io_pgetevents
>__ia32_compat_sys_io_pgetevents
>>> 386 i386 rseq sys_rseq __ia32_sys_rseq
>>> +387 i386 procfd_signal sys_procfd_signal
>__ia32_compat_sys_procfd_signal
>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>b/arch/x86/entry/syscalls/syscall_64.tbl
>>> index f0b1709a5ffb..8a30cde82450 100644
>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>> @@ -343,6 +343,7 @@
>>> 332 common statx __x64_sys_statx
>>> 333 common io_pgetevents __x64_sys_io_pgetevents
>>> 334 common rseq __x64_sys_rseq
>>> +335 64 procfd_signal __x64_sys_procfd_signal
>>>
>>> #
>>> # x32-specific system call numbers start at 512 to avoid cache
>impact
>>> @@ -386,3 +387,4 @@
>>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
>>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
>>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
>>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
>>
>> Is there a reason why these numbers have to be different?
>>
>> (See the recent discussion with Andy Lutomirski.)
>
>Hah, I missed this part of the patch. Let’s not add new x32 syscall
>numbers.
>
>Also, can we perhaps rework this a bit to get rid of the compat entry
>point? The easier way would be to check in_compat_syscall(). The nicer
>way IMO would be to use the 64-bit structure for 32-bit as well.
Do you have a syscall which set precedence/did this before I could look at?
Just if you happen to remember one.
Fwiw, I followed the other signal syscalls.
They all introduce compat syscalls.
On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
>
> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
> >
> >
> >> On Nov 29, 2018, at 4:28 AM, Florian Weimer <[email protected]>
> >wrote:
> >>
> >> Disclaimer: I'm looking at this patch because Christian requested it.
> >> I'm not a kernel developer.
> >>
> >> * Christian Brauner:
> >>
> >>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> >b/arch/x86/entry/syscalls/syscall_32.tbl
> >>> index 3cf7b533b3d1..3f27ffd8ae87 100644
> >>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >>> @@ -398,3 +398,4 @@
> >>> 384 i386 arch_prctl sys_arch_prctl
> >__ia32_compat_sys_arch_prctl
> >>> 385 i386 io_pgetevents sys_io_pgetevents
> >__ia32_compat_sys_io_pgetevents
> >>> 386 i386 rseq sys_rseq __ia32_sys_rseq
> >>> +387 i386 procfd_signal sys_procfd_signal
> >__ia32_compat_sys_procfd_signal
> >>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> >b/arch/x86/entry/syscalls/syscall_64.tbl
> >>> index f0b1709a5ffb..8a30cde82450 100644
> >>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >>> @@ -343,6 +343,7 @@
> >>> 332 common statx __x64_sys_statx
> >>> 333 common io_pgetevents __x64_sys_io_pgetevents
> >>> 334 common rseq __x64_sys_rseq
> >>> +335 64 procfd_signal __x64_sys_procfd_signal
> >>>
> >>> #
> >>> # x32-specific system call numbers start at 512 to avoid cache
> >impact
> >>> @@ -386,3 +387,4 @@
> >>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> >>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> >>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> >>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
> >>
> >> Is there a reason why these numbers have to be different?
> >>
> >> (See the recent discussion with Andy Lutomirski.)
> >
> >Hah, I missed this part of the patch. Let’s not add new x32 syscall
> >numbers.
> >
> >Also, can we perhaps rework this a bit to get rid of the compat entry
> >point? The easier way would be to check in_compat_syscall(). The nicer
> >way IMO would be to use the 64-bit structure for 32-bit as well.
>
> Do you have a syscall which set precedence/did this before I could look at?
> Just if you happen to remember one.
> Fwiw, I followed the other signal syscalls.
> They all introduce compat syscalls.
>
Not really.
Let me try to explain. I have three issues with the approach in your patchset:
1. You're introducing a new syscall, and it behaves differently on
32-bit and 64-bit because the structure you pass in is different.
This is necessary for old syscalls where compatibility matters, but
maybe we can get rid of it for new syscalls. Could we define a
siginfo64_t that is identical to the 64-bit siginfo_t and just use
that in all cases?
2. Assuming that #1 doesn't work, then we need compat support. But
you're doing it by having two different entry points. Instead, you
could have a single entry point that calls in_compat_syscall() to
decide which structure to read. This would simplify things because
x86 doesn't really support the separate compat entry points, which
leads me to #3.
3. The separate x32 numbers are a huge turd that may have security
holes and certainly have comprehensibility holes. I will object to
any patch that adds a new one (like yours). Fixing #1 or #2 makes
this problem go away.
Does that make any sense? The #2 fix would be something like:
if (in_compat_syscall)
copy...user32();
else
copy_from_user();
The #1 fix would add a copy_siginfo_from_user64() or similar.
On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
> >
> > On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
> > >
> > >
> > >> On Nov 29, 2018, at 4:28 AM, Florian Weimer <[email protected]>
> > >wrote:
> > >>
> > >> Disclaimer: I'm looking at this patch because Christian requested it.
> > >> I'm not a kernel developer.
> > >>
> > >> * Christian Brauner:
> > >>
> > >>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> > >b/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> index 3cf7b533b3d1..3f27ffd8ae87 100644
> > >>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > >>> @@ -398,3 +398,4 @@
> > >>> 384 i386 arch_prctl sys_arch_prctl
> > >__ia32_compat_sys_arch_prctl
> > >>> 385 i386 io_pgetevents sys_io_pgetevents
> > >__ia32_compat_sys_io_pgetevents
> > >>> 386 i386 rseq sys_rseq __ia32_sys_rseq
> > >>> +387 i386 procfd_signal sys_procfd_signal
> > >__ia32_compat_sys_procfd_signal
> > >>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> > >b/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> index f0b1709a5ffb..8a30cde82450 100644
> > >>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > >>> @@ -343,6 +343,7 @@
> > >>> 332 common statx __x64_sys_statx
> > >>> 333 common io_pgetevents __x64_sys_io_pgetevents
> > >>> 334 common rseq __x64_sys_rseq
> > >>> +335 64 procfd_signal __x64_sys_procfd_signal
> > >>>
> > >>> #
> > >>> # x32-specific system call numbers start at 512 to avoid cache
> > >impact
> > >>> @@ -386,3 +387,4 @@
> > >>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> > >>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> > >>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> > >>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
> > >>
> > >> Is there a reason why these numbers have to be different?
> > >>
> > >> (See the recent discussion with Andy Lutomirski.)
> > >
> > >Hah, I missed this part of the patch. Let’s not add new x32 syscall
> > >numbers.
> > >
> > >Also, can we perhaps rework this a bit to get rid of the compat entry
> > >point? The easier way would be to check in_compat_syscall(). The nicer
> > >way IMO would be to use the 64-bit structure for 32-bit as well.
> >
> > Do you have a syscall which set precedence/did this before I could look at?
> > Just if you happen to remember one.
> > Fwiw, I followed the other signal syscalls.
> > They all introduce compat syscalls.
> >
>
> Not really.
>
> Let me try to explain. I have three issues with the approach in your patchset:
>
> 1. You're introducing a new syscall, and it behaves differently on
> 32-bit and 64-bit because the structure you pass in is different.
> This is necessary for old syscalls where compatibility matters, but
> maybe we can get rid of it for new syscalls. Could we define a
> siginfo64_t that is identical to the 64-bit siginfo_t and just use
> that in all cases?
>
> 2. Assuming that #1 doesn't work, then we need compat support. But
> you're doing it by having two different entry points. Instead, you
> could have a single entry point that calls in_compat_syscall() to
> decide which structure to read. This would simplify things because
> x86 doesn't really support the separate compat entry points, which
> leads me to #3.
>
> 3. The separate x32 numbers are a huge turd that may have security
> holes and certainly have comprehensibility holes. I will object to
> any patch that adds a new one (like yours). Fixing #1 or #2 makes
> this problem go away.
>
> Does that make any sense? The #2 fix would be something like:
>
> if (in_compat_syscall)
> copy...user32();
> else
> copy_from_user();
>
> The #1 fix would add a copy_siginfo_from_user64() or similar.
Thanks very much! That all helped a bunch already! I'll try to go the
copy_siginfo_from_user64() way first and see if I can make this work. If
we do this I would however only want to use it for the new syscall first
and not change all other signal syscalls over to it too. I'd rather keep
this patchset focussed and small and do such conversions caused by the
new approach later. Does that sound reasonable?
> On Nov 29, 2018, at 11:55 AM, Christian Brauner <[email protected]> wrote:
>
>> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
>>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
>>>
>>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>>>>
>>>>
>>>>> On Nov 29, 2018, at 4:28 AM, Florian Weimer <[email protected]>
>>>> wrote:
>>>>>
>>>>> Disclaimer: I'm looking at this patch because Christian requested it.
>>>>> I'm not a kernel developer.
>>>>>
>>>>> * Christian Brauner:
>>>>>
>>>>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>>>> b/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>>>>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>>>>> @@ -398,3 +398,4 @@
>>>>>> 384 i386 arch_prctl sys_arch_prctl
>>>> __ia32_compat_sys_arch_prctl
>>>>>> 385 i386 io_pgetevents sys_io_pgetevents
>>>> __ia32_compat_sys_io_pgetevents
>>>>>> 386 i386 rseq sys_rseq __ia32_sys_rseq
>>>>>> +387 i386 procfd_signal sys_procfd_signal
>>>> __ia32_compat_sys_procfd_signal
>>>>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>>>> b/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> index f0b1709a5ffb..8a30cde82450 100644
>>>>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>>>>> @@ -343,6 +343,7 @@
>>>>>> 332 common statx __x64_sys_statx
>>>>>> 333 common io_pgetevents __x64_sys_io_pgetevents
>>>>>> 334 common rseq __x64_sys_rseq
>>>>>> +335 64 procfd_signal __x64_sys_procfd_signal
>>>>>>
>>>>>> #
>>>>>> # x32-specific system call numbers start at 512 to avoid cache
>>>> impact
>>>>>> @@ -386,3 +387,4 @@
>>>>>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
>>>>>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
>>>>>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
>>>>>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
>>>>>
>>>>> Is there a reason why these numbers have to be different?
>>>>>
>>>>> (See the recent discussion with Andy Lutomirski.)
>>>>
>>>> Hah, I missed this part of the patch. Let’s not add new x32 syscall
>>>> numbers.
>>>>
>>>> Also, can we perhaps rework this a bit to get rid of the compat entry
>>>> point? The easier way would be to check in_compat_syscall(). The nicer
>>>> way IMO would be to use the 64-bit structure for 32-bit as well.
>>>
>>> Do you have a syscall which set precedence/did this before I could look at?
>>> Just if you happen to remember one.
>>> Fwiw, I followed the other signal syscalls.
>>> They all introduce compat syscalls.
>>>
>>
>> Not really.
>>
>> Let me try to explain. I have three issues with the approach in your patchset:
>>
>> 1. You're introducing a new syscall, and it behaves differently on
>> 32-bit and 64-bit because the structure you pass in is different.
>> This is necessary for old syscalls where compatibility matters, but
>> maybe we can get rid of it for new syscalls. Could we define a
>> siginfo64_t that is identical to the 64-bit siginfo_t and just use
>> that in all cases?
>>
>> 2. Assuming that #1 doesn't work, then we need compat support. But
>> you're doing it by having two different entry points. Instead, you
>> could have a single entry point that calls in_compat_syscall() to
>> decide which structure to read. This would simplify things because
>> x86 doesn't really support the separate compat entry points, which
>> leads me to #3.
>>
>> 3. The separate x32 numbers are a huge turd that may have security
>> holes and certainly have comprehensibility holes. I will object to
>> any patch that adds a new one (like yours). Fixing #1 or #2 makes
>> this problem go away.
>>
>> Does that make any sense? The #2 fix would be something like:
>>
>> if (in_compat_syscall)
>> copy...user32();
>> else
>> copy_from_user();
>>
>> The #1 fix would add a copy_siginfo_from_user64() or similar.
>
> Thanks very much! That all helped a bunch already! I'll try to go the
> copy_siginfo_from_user64() way first and see if I can make this work. If
> we do this I would however only want to use it for the new syscall first
> and not change all other signal syscalls over to it too. I'd rather keep
> this patchset focussed and small and do such conversions caused by the
> new approach later. Does that sound reasonable?
Absolutely. I don’t think we can change old syscalls — the ABI is set in stone. But for new syscalls, I think the always-64-bit behavior makes sense.
On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <[email protected]> wrote:
> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
> >>
> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> >
> > Thanks very much! That all helped a bunch already! I'll try to go the
> > copy_siginfo_from_user64() way first and see if I can make this work. If
> > we do this I would however only want to use it for the new syscall first
> > and not change all other signal syscalls over to it too. I'd rather keep
> > this patchset focussed and small and do such conversions caused by the
> > new approach later. Does that sound reasonable?
>
> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> But for new syscalls, I think the always-64-bit behavior makes sense.
It looks like we already have a 'struct signalfd_siginfo' that is defined in a
sane architecture-independent way, so I'd suggest we use that.
We may then also want to make sure that any system call that takes a
siginfo has a replacement that takes a signalfd_siginfo, and that this
replacement can be used to implement the old version purely in
user space.
Is the current procfd_signal() proposal (under whichever name) sufficient
to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?
Can we implement sys_rt_sigtimedwait() based on signalfd()?
If yes, that would leave waitid(), which already needs a replacement
for y2038, and that should then also return a signalfd_siginfo.
My current preference for waitid() would be to do a version that
closely resembles the current interface, but takes a signalfd_siginfo
and a __kernel_timespec based rusage replacement (possibly
two of them to let us map wait6), but does not operate on procfd or
take a signal mask. That would require yet another syscall, but I
don't think I can do that before we want to have the set of y2038
safe syscalls.
Arnd
On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
> On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
> > > On Nov 29, 2018, at 11:55 AM, Christian Brauner <[email protected]> wrote:
> > >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> > >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
> > >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
> > >>
> > >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> > >
> > > Thanks very much! That all helped a bunch already! I'll try to go the
> > > copy_siginfo_from_user64() way first and see if I can make this work. If
> > > we do this I would however only want to use it for the new syscall first
> > > and not change all other signal syscalls over to it too. I'd rather keep
> > > this patchset focussed and small and do such conversions caused by the
> > > new approach later. Does that sound reasonable?
> >
> > Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> > But for new syscalls, I think the always-64-bit behavior makes sense.
>
> It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> sane architecture-independent way, so I'd suggest we use that.
Just so that I understand you correctly: swapping out struct signinfo
for struct signalfd_siginfo in procfd_<whatever-suffix>? If so that
sounds great to me!
>
> We may then also want to make sure that any system call that takes a
> siginfo has a replacement that takes a signalfd_siginfo, and that this
> replacement can be used to implement the old version purely in
> user space.
Sounds good but is unrelated to this patchset I take it. :)
>
> Is the current procfd_signal() proposal (under whichever name) sufficient
> to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?
Yes, I see no reason why not. My idea is to extend it - after we have a
basic version in - to also work with:
/proc/<pid>/task/<tid>
If I'm not mistaken this should be sufficient to get rt_tgsigqueueinfo.
The thread will be uniquely identified by the tid descriptor and no
combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
that sound reasonable?
> Can we implement sys_rt_sigtimedwait() based on signalfd()?
> If yes, that would leave waitid(), which already needs a replacement
> for y2038, and that should then also return a signalfd_siginfo.
> My current preference for waitid() would be to do a version that
> closely resembles the current interface, but takes a signalfd_siginfo
> and a __kernel_timespec based rusage replacement (possibly
> two of them to let us map wait6), but does not operate on procfd or
> take a signal mask. That would require yet another syscall, but I
> don't think I can do that before we want to have the set of y2038
> safe syscalls.
All sounds reasonable to me but that's not a blocker for the current
syscall though, is it?
Christian
On Thu, Nov 29, 2018 at 10:35 PM Christian Brauner <[email protected]> wrote:
> On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
> >
> > Is the current procfd_signal() proposal (under whichever name) sufficient
> > to correctly implement both sys_rt_sigqueueinfo() and sys_rt_tgsigqueueinfo()?
>
> Yes, I see no reason why not. My idea is to extend it - after we have a
> basic version in - to also work with:
> /proc/<pid>/task/<tid>
> If I'm not mistaken this should be sufficient to get rt_tgsigqueueinfo.
> The thread will be uniquely identified by the tid descriptor and no
> combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
> that sound reasonable?
Yes. So it would currently replace rt_gsigqueueinfo() but
not rt_tgsigqueueinfo(), and could be extended to do both
afterwards, without making the interface ugly in any form?
I suppose we can always add more flags if needed, and you
already ensure that flags is zero for the moment.
> > Can we implement sys_rt_sigtimedwait() based on signalfd()?
> >
> > If yes, that would leave waitid(), which already needs a replacement
> > for y2038, and that should then also return a signalfd_siginfo.
> > My current preference for waitid() would be to do a version that
> > closely resembles the current interface, but takes a signalfd_siginfo
> > and a __kernel_timespec based rusage replacement (possibly
> > two of them to let us map wait6), but does not operate on procfd or
> > take a signal mask. That would require yet another syscall, but I
> > don't think I can do that before we want to have the set of y2038
> > safe syscalls.
>
> All sounds reasonable to me but that's not a blocker for the current
> syscall though, is it?
I'd like to at least understand about sys_rt_sigtimedwait() before
we go on, so we all know what's coming, and document the
plans in the changelog.
waitid() probably remains on my plate anyway, and I hope understand
where we're at with it.
Arnd
On 2018-11-29, Arnd Bergmann <[email protected]> wrote:
> waitid() probably remains on my plate anyway, and I hope understand
> where we're at with it.
Having a way to wait on a processfd is something we'll eventually need,
though the semantics of zombies might get a little bit hairy. I propose
we work through that rewrite in a future series once this one goes in.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
Arnd Bergmann <[email protected]> writes:
> On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
>> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <[email protected]> wrote:
>> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
>> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
>> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>> >>
>> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
>> >
>> > Thanks very much! That all helped a bunch already! I'll try to go the
>> > copy_siginfo_from_user64() way first and see if I can make this work. If
>> > we do this I would however only want to use it for the new syscall first
>> > and not change all other signal syscalls over to it too. I'd rather keep
>> > this patchset focussed and small and do such conversions caused by the
>> > new approach later. Does that sound reasonable?
>>
>> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
>> But for new syscalls, I think the always-64-bit behavior makes sense.
>
> It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> sane architecture-independent way, so I'd suggest we use that.
Unfortunately it isn't maintained very well. What you can
express with signalfd_siginfo is a subset what you can express with
siginfo. Many of the linux extensions to siginfo for exception
information add pointers and have integers right after those pointers.
Not all of those linux specific extentions for exceptions are handled
by signalfd_siginfo (it needs new fields).
As originally defined siginfo had the sigval_t union at the end so it
was perfectly fine on both 32bit and 64bit as it only had a single
pointer in the structure and the other fields were 32bits in size.
Although I do feel the pain as x86_64 has to deal with 3 versions
of siginfo. A 64bit one. A 32bit one for ia32. A 32bit one for x32
with a 64bit si_clock_t.
> We may then also want to make sure that any system call that takes a
> siginfo has a replacement that takes a signalfd_siginfo, and that this
> replacement can be used to implement the old version purely in
> user space.
If you are not implementing CRIU and reinserting exceptions to yourself.
At most user space wants the ability to implement sigqueue:
AKA:
sigqueue(pid_t pid, int sig, const union sigval value);
Well sigqueue with it's own si_codes so the following would cover all
the use cases I know of:
int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
The si_code could even be set to SI_USER to request that the normal
trusted SI_USER values be filled in. si_code values of < 0 if not
recognized could reasonably safely be treated as the _rt member of
the siginfo union. Or perhaps better we error out in such a case.
If we want to be flexible and not have N kinds of system calls that
is the direction I would go. That is simple, and you don't need any of
the rest.
Alternatively we abandon the mistake of sigqueueinfo and not allow
a signal number in the arguments that differs from the si_signo in the
siginfo and allow passing the entire thing unchanged from sender to
receiver. That is maximum flexibility.
signalfd_siginfo just sucks in practice. It is larger that siginfo 104
bytes versus 48. We must deliver it to userspace as a siginfo so it
must be translated. Because of the translation signalfd_siginfo adds
no flexiblity in practice, because it can not just be passed through.
Finallay signalfd_siginfo does not have encodings for all of the
siginfo union members, so it fails to be fully general.
Personally if I was to define signalfd_siginfo today I would make it:
struct siginfo_subset {
__u32 sis_signo;
__s32 sis_errno;
__s32 sis_code;
__u32 sis_pad;
__u32 sis_pid;
__u32 sis_uid;
__u64 sis_data (A pointer or integer data field);
};
That is just 32bytes, and is all that is needed for everything
except for synchronous exceptions. Oh and that happens to be a proper
subset of a any sane siginfo layout, on both 32bit and 64bit.
This is one of those rare times where POSIX is sane and what linux
has implemented is not.
Eric
On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
> Arnd Bergmann <[email protected]> writes:
>
> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
> >> > On Nov 29, 2018, at 11:55 AM, Christian Brauner <[email protected]> wrote:
> >> >> On Thu, Nov 29, 2018 at 11:22:58AM -0800, Andy Lutomirski wrote:
> >> >>> On Thu, Nov 29, 2018 at 11:17 AM Christian Brauner <[email protected]> wrote:
> >> >>>> On November 30, 2018 5:54:18 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
> >> >>
> >> >> The #1 fix would add a copy_siginfo_from_user64() or similar.
> >> >
> >> > Thanks very much! That all helped a bunch already! I'll try to go the
> >> > copy_siginfo_from_user64() way first and see if I can make this work. If
> >> > we do this I would however only want to use it for the new syscall first
> >> > and not change all other signal syscalls over to it too. I'd rather keep
> >> > this patchset focussed and small and do such conversions caused by the
> >> > new approach later. Does that sound reasonable?
> >>
> >> Absolutely. I don’t think we can change old syscalls — the ABI is set in stone.
> >> But for new syscalls, I think the always-64-bit behavior makes sense.
> >
> > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> > sane architecture-independent way, so I'd suggest we use that.
>
> Unfortunately it isn't maintained very well. What you can
> express with signalfd_siginfo is a subset what you can express with
> siginfo. Many of the linux extensions to siginfo for exception
> information add pointers and have integers right after those pointers.
> Not all of those linux specific extentions for exceptions are handled
> by signalfd_siginfo (it needs new fields).
>
> As originally defined siginfo had the sigval_t union at the end so it
> was perfectly fine on both 32bit and 64bit as it only had a single
> pointer in the structure and the other fields were 32bits in size.
>
> Although I do feel the pain as x86_64 has to deal with 3 versions
> of siginfo. A 64bit one. A 32bit one for ia32. A 32bit one for x32
> with a 64bit si_clock_t.
>
> > We may then also want to make sure that any system call that takes a
> > siginfo has a replacement that takes a signalfd_siginfo, and that this
> > replacement can be used to implement the old version purely in
> > user space.
>
> If you are not implementing CRIU and reinserting exceptions to yourself.
> At most user space wants the ability to implement sigqueue:
>
> AKA:
> sigqueue(pid_t pid, int sig, const union sigval value);
>
> Well sigqueue with it's own si_codes so the following would cover all
> the use cases I know of:
> int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
>
> The si_code could even be set to SI_USER to request that the normal
> trusted SI_USER values be filled in. si_code values of < 0 if not
> recognized could reasonably safely be treated as the _rt member of
> the siginfo union. Or perhaps better we error out in such a case.
>
> If we want to be flexible and not have N kinds of system calls that
> is the direction I would go. That is simple, and you don't need any of
> the rest.
>
>
> Alternatively we abandon the mistake of sigqueueinfo and not allow
> a signal number in the arguments that differs from the si_signo in the
> siginfo and allow passing the entire thing unchanged from sender to
> receiver. That is maximum flexibility.
>
> signalfd_siginfo just sucks in practice. It is larger that siginfo 104
> bytes versus 48. We must deliver it to userspace as a siginfo so it
> must be translated. Because of the translation signalfd_siginfo adds
> no flexiblity in practice, because it can not just be passed through.
> Finallay signalfd_siginfo does not have encodings for all of the
> siginfo union members, so it fails to be fully general.
>
> Personally if I was to define signalfd_siginfo today I would make it:
> struct siginfo_subset {
> __u32 sis_signo;
> __s32 sis_errno;
> __s32 sis_code;
> __u32 sis_pad;
> __u32 sis_pid;
> __u32 sis_uid;
> __u64 sis_data (A pointer or integer data field);
> };
>
> That is just 32bytes, and is all that is needed for everything
> except for synchronous exceptions. Oh and that happens to be a proper
> subset of a any sane siginfo layout, on both 32bit and 64bit.
>
> This is one of those rare times where POSIX is sane and what linux
> has implemented is not.
Thanks for the in-depth explanation. So your point is that we are better
off if we stick with siginfo_t instead of struct signalfd_siginfo in
procfd_signal()?
On Fri, Nov 30, 2018 at 7:56 AM Christian Brauner <[email protected]> wrote:
> On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
> > Arnd Bergmann <[email protected]> writes:
> > > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
> > >
> > > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
> > > sane architecture-independent way, so I'd suggest we use that.
> >
> > Unfortunately it isn't maintained very well. What you can
> > express with signalfd_siginfo is a subset what you can express with
> > siginfo. Many of the linux extensions to siginfo for exception
> > information add pointers and have integers right after those pointers.
> > Not all of those linux specific extentions for exceptions are handled
> > by signalfd_siginfo (it needs new fields).
Would those fit in the 30 remaining padding bytes?
> > As originally defined siginfo had the sigval_t union at the end so it
> > was perfectly fine on both 32bit and 64bit as it only had a single
> > pointer in the structure and the other fields were 32bits in size.
The problem with sigval_t of course is that it is incompatible between
32-bit and 64-bit. We can add the same information, but at least on
the syscall level that would have to be a __u64.
> > Although I do feel the pain as x86_64 has to deal with 3 versions
> > of siginfo. A 64bit one. A 32bit one for ia32. A 32bit one for x32
> > with a 64bit si_clock_t.
At least you and Al have managed to get it down to a single source-level
definition across all architectures, but there is also the (lesser) problem
that the structure has a slightly different binary layout on each of the
classic architectures.
If we go with Andy's suggestion of having only a single binary layout
on x86 for the new call, I'd argue that we should also make it have
the exact same layout on all other architectures.
> > > We may then also want to make sure that any system call that takes a
> > > siginfo has a replacement that takes a signalfd_siginfo, and that this
> > > replacement can be used to implement the old version purely in
> > > user space.
> >
> > If you are not implementing CRIU and reinserting exceptions to yourself.
> > At most user space wants the ability to implement sigqueue:
> >
> > AKA:
> > sigqueue(pid_t pid, int sig, const union sigval value);
> >
> > Well sigqueue with it's own si_codes so the following would cover all
> > the use cases I know of:
> > int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
> >
> > The si_code could even be set to SI_USER to request that the normal
> > trusted SI_USER values be filled in. si_code values of < 0 if not
> > recognized could reasonably safely be treated as the _rt member of
> > the siginfo union. Or perhaps better we error out in such a case.
> >
> > If we want to be flexible and not have N kinds of system calls that
> > is the direction I would go. That is simple, and you don't need any of
> > the rest.
I'm not sure I understand what you are suggesting here. Would the
low-level implementation of this be based on procfd, or do you
mean that would be done for pid_t at the kernel level, plus another
syscall for procfd?
> > Alternatively we abandon the mistake of sigqueueinfo and not allow
> > a signal number in the arguments that differs from the si_signo in the
> > siginfo and allow passing the entire thing unchanged from sender to
> > receiver. That is maximum flexibility.
This would be regardless of which flavor of siginfo (today's arch
specific one, signalfd_siginfo, or a new one) we pass, right?
> > signalfd_siginfo just sucks in practice. It is larger that siginfo 104
> > bytes versus 48. We must deliver it to userspace as a siginfo so it
> > must be translated. Because of the translation signalfd_siginfo adds
> > no flexiblity in practice, because it can not just be passed through.
> > Finallay signalfd_siginfo does not have encodings for all of the
> > siginfo union members, so it fails to be fully general.
> >
> > Personally if I was to define signalfd_siginfo today I would make it:
> > struct siginfo_subset {
> > __u32 sis_signo;
> > __s32 sis_errno;
> > __s32 sis_code;
> > __u32 sis_pad;
> > __u32 sis_pid;
> > __u32 sis_uid;
> > __u64 sis_data (A pointer or integer data field);
> > };
> >
> > That is just 32bytes, and is all that is needed for everything
> > except for synchronous exceptions. Oh and that happens to be a proper
> > subset of a any sane siginfo layout, on both 32bit and 64bit.
And that would work for signalfd and waitid_time64, but not for
procfd_signal/kill/sendsiginfo?
> > This is one of those rare times where POSIX is sane and what linux
> > has implemented is not.
>
> Thanks for the in-depth explanation. So your point is that we are better
> off if we stick with siginfo_t instead of struct signalfd_siginfo in
> procfd_signal()?
I think it means we still need more discussion. Using signalfd_siginfo
without further changes doesn't seem sufficient because of the missing
sigval and the excessive length adds some cost.
siginfo_t as it is now still has a number of other downsides, and Andy in
particular didn't like the idea of having three new variants on x86
(depending on how you count). His alternative suggestion of having
a single syscall entry point that takes a 'signfo_t __user *' but interprets
it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
should work correctly, but feels wrong to me, or at least inconsistent
with how we do this elsewhere.
Arnd
On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> siginfo_t as it is now still has a number of other downsides, and Andy in
> particular didn't like the idea of having three new variants on x86
> (depending on how you count). His alternative suggestion of having
> a single syscall entry point that takes a 'signfo_t __user *' but interprets
> it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> should work correctly, but feels wrong to me, or at least inconsistent
> with how we do this elsewhere.
If everyone else is okay with it, I can get on board with three
variants on x86. What I can't get on board with is *five* variants on
x86, which would be:
procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
syscall64 with nr == 335 (presumably): 64-bit
syscall64 with nr == 548 | 0x40000000: x32
syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
true, behavior is arbitrary
syscall64 with nr == 335 | 0x40000000: x32 entry, but
in_compat_syscall() == false, behavior is arbitrary
This mess isn't really Christian's fault -- it's been there for a
while, but it's awful and I don't think we want to perpetuate it.
Obviously, I'd prefer a variant where the structure that's passed in
is always the same.
BTW, do we consider siginfo_t to be extensible? If so, and if we pass
in a pointer, presumably we should pass a length as well.
On December 1, 2018 5:35:45 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
>> siginfo_t as it is now still has a number of other downsides, and
>Andy in
>> particular didn't like the idea of having three new variants on x86
>> (depending on how you count). His alternative suggestion of having
>> a single syscall entry point that takes a 'signfo_t __user *' but
>interprets
>> it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> should work correctly, but feels wrong to me, or at least
>inconsistent
>> with how we do this elsewhere.
>
>If everyone else is okay with it, I can get on board with three
>variants on x86. What I can't get on board with is *five* variants on
Thanks Andy, that helps a lot.
I'm okay with it. Does this require any additional changes to how the syscall
is currently hooked up?
>x86, which would be:
>
>procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>
>syscall64 with nr == 335 (presumably): 64-bit
>
>syscall64 with nr == 548 | 0x40000000: x32
>
>syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
>true, behavior is arbitrary
>
>syscall64 with nr == 335 | 0x40000000: x32 entry, but
>in_compat_syscall() == false, behavior is arbitrary
>
>This mess isn't really Christian's fault -- it's been there for a
>while, but it's awful and I don't think we want to perpetuate it.
>
>Obviously, I'd prefer a variant where the structure that's passed in
>is always the same.
>
>BTW, do we consider siginfo_t to be extensible? If so, and if we pass
I would prefer if we could consider it extensible.
If so I would prefer if we could pass in a size argument.
>in a pointer, presumably we should pass a length as well.
On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]> wrote:
>
> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> > siginfo_t as it is now still has a number of other downsides, and Andy in
> > particular didn't like the idea of having three new variants on x86
> > (depending on how you count). His alternative suggestion of having
> > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > should work correctly, but feels wrong to me, or at least inconsistent
> > with how we do this elsewhere.
>
> If everyone else is okay with it, I can get on board with three
> variants on x86. What I can't get on board with is *five* variants on
> x86, which would be:
>
> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>
> syscall64 with nr == 335 (presumably): 64-bit
These seem unavoidable
> syscall64 with nr == 548 | 0x40000000: x32
>
> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> true, behavior is arbitrary
>
> syscall64 with nr == 335 | 0x40000000: x32 entry, but
> in_compat_syscall() == false, behavior is arbitrary
Am I misreading the code? The way I understand it, setting the
0x40000000 bit means that both in_compat_syscall() and
in_x32_syscall become() true, based on
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
return true;
#endif
return false;
}
The '548 | 0x40000000' part seems to be the only sensible
way to handle x32 here. What exactly would you propose to
avoid defining the other entry points?
> This mess isn't really Christian's fault -- it's been there for a
> while, but it's awful and I don't think we want to perpetuate it.
I'm not convinced that not assigning an x32 syscall number
improves the situation, it just means that we now have one
syscall that behaves completely differently from all others,
in that the x32 version requires being called through a
SYSCALL_DEFINE() entry point rather than a
COMPAT_SYSCALL_DEFINE() one, and we have to
add more complexity to the copy_siginfo_from_user()
implementation to duplicate the hack that exists in
copy_siginfo_from_user32().
Of course, the nicest option would be to completely remove
x32 so we can stop worrying about it.
Arnd
On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
>On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]>
>wrote:
>>
>> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
>> > siginfo_t as it is now still has a number of other downsides, and
>Andy in
>> > particular didn't like the idea of having three new variants on x86
>> > (depending on how you count). His alternative suggestion of having
>> > a single syscall entry point that takes a 'signfo_t __user *' but
>interprets
>> > it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> > should work correctly, but feels wrong to me, or at least
>inconsistent
>> > with how we do this elsewhere.
>>
>> If everyone else is okay with it, I can get on board with three
>> variants on x86. What I can't get on board with is *five* variants
>on
>> x86, which would be:
>>
>> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
>>
>> syscall64 with nr == 335 (presumably): 64-bit
>
>These seem unavoidable
>
>> syscall64 with nr == 548 | 0x40000000: x32
>>
>> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
>> true, behavior is arbitrary
>>
>> syscall64 with nr == 335 | 0x40000000: x32 entry, but
>> in_compat_syscall() == false, behavior is arbitrary
>
>Am I misreading the code? The way I understand it, setting the
>0x40000000 bit means that both in_compat_syscall() and
>in_x32_syscall become() true, based on
>
>static inline bool in_x32_syscall(void)
>{
>#ifdef CONFIG_X86_X32_ABI
> if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
> return true;
>#endif
> return false;
>}
>
>The '548 | 0x40000000' part seems to be the only sensible
>way to handle x32 here. What exactly would you propose to
>avoid defining the other entry points?
>
>> This mess isn't really Christian's fault -- it's been there for a
>> while, but it's awful and I don't think we want to perpetuate it.
>
>I'm not convinced that not assigning an x32 syscall number
>improves the situation, it just means that we now have one
>syscall that behaves completely differently from all others,
>in that the x32 version requires being called through a
>SYSCALL_DEFINE() entry point rather than a
>COMPAT_SYSCALL_DEFINE() one, and we have to
>add more complexity to the copy_siginfo_from_user()
>implementation to duplicate the hack that exists in
>copy_siginfo_from_user32().
>
>Of course, the nicest option would be to completely remove
>x32 so we can stop worrying about it.
One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
However, I do sympathize and understand these concerns.
On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <[email protected]> wrote:
>
> On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
> >On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]>
> >wrote:
> >>
> >> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> >> > siginfo_t as it is now still has a number of other downsides, and
> >Andy in
> >> > particular didn't like the idea of having three new variants on x86
> >> > (depending on how you count). His alternative suggestion of having
> >> > a single syscall entry point that takes a 'signfo_t __user *' but
> >interprets
> >> > it as compat_siginfo depending on
> >in_compat_syscall()/in_x32_syscall()
> >> > should work correctly, but feels wrong to me, or at least
> >inconsistent
> >> > with how we do this elsewhere.
> >>
> >> If everyone else is okay with it, I can get on board with three
> >> variants on x86. What I can't get on board with is *five* variants
> >on
> >> x86, which would be:
> >>
> >> procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
> >>
> >> syscall64 with nr == 335 (presumably): 64-bit
> >
> >These seem unavoidable
> >
> >> syscall64 with nr == 548 | 0x40000000: x32
> >>
> >> syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> >> true, behavior is arbitrary
> >>
> >> syscall64 with nr == 335 | 0x40000000: x32 entry, but
> >> in_compat_syscall() == false, behavior is arbitrary
> >
> >Am I misreading the code? The way I understand it, setting the
> >0x40000000 bit means that both in_compat_syscall() and
> >in_x32_syscall become() true, based on
> >
> >static inline bool in_x32_syscall(void)
> >{
> >#ifdef CONFIG_X86_X32_ABI
> > if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
> > return true;
> >#endif
> > return false;
> >}
> >
> >The '548 | 0x40000000' part seems to be the only sensible
> >way to handle x32 here. What exactly would you propose to
> >avoid defining the other entry points?
> >
> >> This mess isn't really Christian's fault -- it's been there for a
> >> while, but it's awful and I don't think we want to perpetuate it.
> >
> >I'm not convinced that not assigning an x32 syscall number
> >improves the situation, it just means that we now have one
> >syscall that behaves completely differently from all others,
> >in that the x32 version requires being called through a
> >SYSCALL_DEFINE() entry point rather than a
> >COMPAT_SYSCALL_DEFINE() one, and we have to
> >add more complexity to the copy_siginfo_from_user()
> >implementation to duplicate the hack that exists in
> >copy_siginfo_from_user32().
> >
> >Of course, the nicest option would be to completely remove
> >x32 so we can stop worrying about it.
>
> One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> However, I do sympathize and understand these concerns.
IMHO, it's fine to just replicate all the splits we have for the
existing signal system calls. It's ugly, but once it's done, it'll be
done for a long time. I can't see a need to add even more signal
system calls after this one.
On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <[email protected]> wrote:
> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <[email protected]> wrote:
> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
> >
> > One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> > I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> > However, I do sympathize and understand these concerns.
>
> IMHO, it's fine to just replicate all the splits we have for the
> existing signal system calls. It's ugly, but once it's done, it'll be
> done for a long time. I can't see a need to add even more signal
> system calls after this one.
We definitely need waitid_time64() and rt_sigtimedwait_time64()
in the very near future.
Arnd
On Sat, Dec 1, 2018 at 12:12 AM Arnd Bergmann <[email protected]> wrote:
>
> On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <[email protected]> wrote:
> > On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner <[email protected]> wrote:
> > > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
> > >
> > > One humble point I would like to make is that what I care about most is a sensible way forward without having to redo essential parts of how syscalls work.
> > > I don't want to introduce a sane, small syscall that ends up breaking all over the place because we decided to fix past mistakes that technically have nothing to do with the patch itself.
> > > However, I do sympathize and understand these concerns.
> >
> > IMHO, it's fine to just replicate all the splits we have for the
> > existing signal system calls. It's ugly, but once it's done, it'll be
> > done for a long time. I can't see a need to add even more signal
> > system calls after this one.
>
> We definitely need waitid_time64() and rt_sigtimedwait_time64()
> in the very near future.
To clarify: we probably don't need rt_sigtimedwait_time64() for
x32, as it already has a 64-bit time_t. We might need waitid_time64()
or something similar though, since the plan now is to change the
time resolution for rusage to nanoseconds (__kernel_timespec)
now. The exact behavior and name of waitid_time64() is still
a matter of discussion.
Arnd
On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
>On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <[email protected]>
>wrote:
>> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
><[email protected]> wrote:
>> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
><[email protected]> wrote:
>> >
>> > One humble point I would like to make is that what I care about
>most is a sensible way forward without having to redo essential parts
>of how syscalls work.
>> > I don't want to introduce a sane, small syscall that ends up
>breaking all over the place because we decided to fix past mistakes
>that technically have nothing to do with the patch itself.
>> > However, I do sympathize and understand these concerns.
>>
>> IMHO, it's fine to just replicate all the splits we have for the
>> existing signal system calls. It's ugly, but once it's done, it'll be
>> done for a long time. I can't see a need to add even more signal
>> system calls after this one.
>
>We definitely need waitid_time64() and rt_sigtimedwait_time64()
>in the very near future.
Right, I remember you pointing this out in a prior mail.
Thanks for working on this for such a long time now, Arnd!
Can we agree to move on with the procfd syscall given the current constraints?
I just don't want to see the syscall being
blocked by a generic problem whose
ultimate solution is to get rid of weird
architectural constraints. :)
On Fri, Nov 30, 2018 at 3:40 PM Christian Brauner <[email protected]> wrote:
>
> On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
> >On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione <[email protected]>
> >wrote:
> >> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
> ><[email protected]> wrote:
> >> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
> ><[email protected]> wrote:
> >> >
> >> > One humble point I would like to make is that what I care about
> >most is a sensible way forward without having to redo essential parts
> >of how syscalls work.
> >> > I don't want to introduce a sane, small syscall that ends up
> >breaking all over the place because we decided to fix past mistakes
> >that technically have nothing to do with the patch itself.
> >> > However, I do sympathize and understand these concerns.
> >>
> >> IMHO, it's fine to just replicate all the splits we have for the
> >> existing signal system calls. It's ugly, but once it's done, it'll be
> >> done for a long time. I can't see a need to add even more signal
> >> system calls after this one.
> >
> >We definitely need waitid_time64() and rt_sigtimedwait_time64()
> >in the very near future.
>
> Right, I remember you pointing this out in a prior mail.
> Thanks for working on this for such a long time now, Arnd!
> Can we agree to move on with the procfd syscall given the current constraints?
> I just don't want to see the syscall being
> blocked by a generic problem whose
> ultimate solution is to get rid of weird
> architectural constraints.
Creating and using a copy_siginfo_from_user64() function would work
for everyone, no?
On November 30, 2018 1:28:15 AM GMT+13:00, Florian Weimer <[email protected]> wrote:
>Disclaimer: I'm looking at this patch because Christian requested it.
>I'm not a kernel developer.
Given all your expertise this really doesn't matter. :)
You're the one having to deal with this
in glibc after all.
Thanks for doing this and sorry for the late reply.
I missed that mail.
>
>* Christian Brauner:
>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
>b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 3cf7b533b3d1..3f27ffd8ae87 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -398,3 +398,4 @@
>> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
>>
>385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
>> 386 i386 rseq sys_rseq __ia32_sys_rseq
>>
>+387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
>b/arch/x86/entry/syscalls/syscall_64.tbl
>> index f0b1709a5ffb..8a30cde82450 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -343,6 +343,7 @@
>> 332 common statx __x64_sys_statx
>> 333 common io_pgetevents __x64_sys_io_pgetevents
>> 334 common rseq __x64_sys_rseq
>> +335 64 procfd_signal __x64_sys_procfd_signal
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache
>impact
>> @@ -386,3 +387,4 @@
>> 545 x32 execveat __x32_compat_sys_execveat/ptregs
>> 546 x32 preadv2 __x32_compat_sys_preadv64v2
>> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
>> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
>
>Is there a reason why these numbers have to be different?
>
>(See the recent discussion with Andy Lutomirski.)
>
>> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t
>*kinfo, int flags,
>> + bool had_siginfo)
>> +{
>> + int ret;
>> + struct fd f;
>> + struct pid *pid;
>> +
>> + /* Enforce flags be set to 0 until we add an extension. */
>> + if (flags)
>> + return -EINVAL;
>> +
>> + f = fdget_raw(fd);
>> + if (!f.file)
>> + return -EBADF;
>> +
>> + /* Is this a process file descriptor? */
>> + ret = -EINVAL;
>> + if (!proc_is_tgid_procfd(f.file))
>> + goto err;
>[…]
>> + ret = kill_pid_info(sig, kinfo, pid);
>
>I would like to see some comment here what happens to zombie processes.
You'd get ESRCH.
I'm not sure if that has always been the case.
Eric recently did some excellent refactoring of the signal code.
Iirc, part of that involved not delivering signals to zombies.
That's at least how I remember it.
I don't have access to source code though atm.
>
>> +/**
>> + * sys_procfd_signal - send a signal to a process through a process
>file
>> + * descriptor
>> + * @fd: the file descriptor of the process
>> + * @sig: signal to be sent
>> + * @info: the signal info
>> + * @flags: future flags to be passed
>> + */
>> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user
>*, info,
>> + int, flags)
>
>Sorry, I'm quite unhappy with the name. “signal” is for signal handler
>management. procfd_sendsignal, procfd_sigqueueinfo or something like
>that would be fine. Even procfd_kill would be better IMHO.
Ok. I only have strong opinions to procfd_kill().
Mainly because the new syscall takes
the job of multiple other syscalls
so kill gives the wrong impression.
I'll come up with a better name in the next iteration.
>
>Looking at the rt_tgsigqueueinfo interface, is there a way to implement
>the “tg” part with the current procfd_signal interface? Would you use
>openat to retrieve the Tgid: line from "status"?
Yes, the tg part can be implemented.
As I pointed out in another mail my
I is to make this work by using file
descriptors for /proc/<pid>/task/<tid>.
I don't want this in the initial patchset though.
I prefer to slowly add those features
once we have gotten the basic functionality
in.
>
>Thanks,
>Florian
On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <[email protected]> wrote:
>
> On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]> wrote:
> >
> > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > particular didn't like the idea of having three new variants on x86
> > > (depending on how you count). His alternative suggestion of having
> > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > should work correctly, but feels wrong to me, or at least inconsistent
> > > with how we do this elsewhere.
> >
> > If everyone else is okay with it, I can get on board with three
> > variants on x86. What I can't get on board with is *five* variants on
> > x86, which would be:
> >
> > procfd_signal via int80 / the 32-bit vDSO: the ia32 structure
> >
> > syscall64 with nr == 335 (presumably): 64-bit
>
> These seem unavoidable
Indeed, although, in hindsight, they should have had the same numbers.
>
> > syscall64 with nr == 548 | 0x40000000: x32
> >
> > syscall64 with nr == 548: 64-bit entry but in_compat_syscall() ==
> > true, behavior is arbitrary
> >
> > syscall64 with nr == 335 | 0x40000000: x32 entry, but
> > in_compat_syscall() == false, behavior is arbitrary
>
> Am I misreading the code? The way I understand it, setting the
> 0x40000000 bit means that both in_compat_syscall() and
> in_x32_syscall become() true, based on
>
> static inline bool in_x32_syscall(void)
> {
> #ifdef CONFIG_X86_X32_ABI
> if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
> return true;
> #endif
> return false;
> }
Yes.
>
> The '548 | 0x40000000' part seems to be the only sensible
> way to handle x32 here. What exactly would you propose to
> avoid defining the other entry points?
I would propose that it should be 335 | 0x40000000. I can't see any
reasonable way to teach the kernel to reject 335 | 0x40000000 that
wouldn't work just as well to accept it and make it do the right
thing. Currently we accept it and do the *wrong* thing, which is no
good.
>
> > This mess isn't really Christian's fault -- it's been there for a
> > while, but it's awful and I don't think we want to perpetuate it.
>
> I'm not convinced that not assigning an x32 syscall number
> improves the situation, it just means that we now have one
> syscall that behaves completely differently from all others,
> in that the x32 version requires being called through a
> SYSCALL_DEFINE() entry point rather than a
> COMPAT_SYSCALL_DEFINE() one,
There's nothing particularly novel about this. seccomp(), for
example, already works like this.
> and we have to
> add more complexity to the copy_siginfo_from_user()
> implementation to duplicate the hack that exists in
> copy_siginfo_from_user32().
What hack are you referring to here?
>
> Of course, the nicest option would be to completely remove
> x32 so we can stop worrying about it.
True :)
On December 1, 2018 12:46:22 PM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>On Fri, Nov 30, 2018 at 3:40 PM Christian Brauner
><[email protected]> wrote:
>>
>> On December 1, 2018 12:12:53 PM GMT+13:00, Arnd Bergmann
><[email protected]> wrote:
>> >On Sat, Dec 1, 2018 at 12:05 AM Daniel Colascione
><[email protected]>
>> >wrote:
>> >> On Fri, Nov 30, 2018 at 2:26 PM Christian Brauner
>> ><[email protected]> wrote:
>> >> > On December 1, 2018 11:09:58 AM GMT+13:00, Arnd Bergmann
>> ><[email protected]> wrote:
>> >> >
>> >> > One humble point I would like to make is that what I care about
>> >most is a sensible way forward without having to redo essential
>parts
>> >of how syscalls work.
>> >> > I don't want to introduce a sane, small syscall that ends up
>> >breaking all over the place because we decided to fix past mistakes
>> >that technically have nothing to do with the patch itself.
>> >> > However, I do sympathize and understand these concerns.
>> >>
>> >> IMHO, it's fine to just replicate all the splits we have for the
>> >> existing signal system calls. It's ugly, but once it's done, it'll
>be
>> >> done for a long time. I can't see a need to add even more signal
>> >> system calls after this one.
>> >
>> >We definitely need waitid_time64() and rt_sigtimedwait_time64()
>> >in the very near future.
>>
>> Right, I remember you pointing this out in a prior mail.
>> Thanks for working on this for such a long time now, Arnd!
>> Can we agree to move on with the procfd syscall given the current
>constraints?
>> I just don't want to see the syscall being
>> blocked by a generic problem whose
>> ultimate solution is to get rid of weird
>> architectural constraints.
>
>Creating and using a copy_siginfo_from_user64() function would work
>for everyone, no?
Meaning, no compat syscalls, introduce
new struct siginfo64_t and the copy
function you named above?
On November 30, 2018 10:40:49 AM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
>On Thu, Nov 29, 2018 at 10:35 PM Christian Brauner
><[email protected]> wrote:
>> On Thu, Nov 29, 2018 at 10:02:13PM +0100, Arnd Bergmann wrote:
>> > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski
><[email protected]> wrote:
>> >
>> > Is the current procfd_signal() proposal (under whichever name)
>sufficient
>> > to correctly implement both sys_rt_sigqueueinfo() and
>sys_rt_tgsigqueueinfo()?
>>
>> Yes, I see no reason why not. My idea is to extend it - after we have
>a
>> basic version in - to also work with:
>> /proc/<pid>/task/<tid>
>> If I'm not mistaken this should be sufficient to get
>rt_tgsigqueueinfo.
>> The thread will be uniquely identified by the tid descriptor and no
>> combination of /proc/<pid> and /proc/<pid>/task/<tid> is needed. Does
>> that sound reasonable?
>
>Yes. So it would currently replace rt_gsigqueueinfo() but
>not rt_tgsigqueueinfo(), and could be extended to do both
>afterwards, without making the interface ugly in any form?
Yes. :)
>
>I suppose we can always add more flags if needed, and you
>already ensure that flags is zero for the moment.
Yep.
>
>> > Can we implement sys_rt_sigtimedwait() based on signalfd()?
>> >
>> > If yes, that would leave waitid(), which already needs a
>replacement
>> > for y2038, and that should then also return a signalfd_siginfo.
>> > My current preference for waitid() would be to do a version that
>> > closely resembles the current interface, but takes a
>signalfd_siginfo
>> > and a __kernel_timespec based rusage replacement (possibly
>> > two of them to let us map wait6), but does not operate on procfd or
>> > take a signal mask. That would require yet another syscall, but I
>> > don't think I can do that before we want to have the set of y2038
>> > safe syscalls.
>>
>> All sounds reasonable to me but that's not a blocker for the current
>> syscall though, is it?
>
>I'd like to at least understand about sys_rt_sigtimedwait() before
>we go on, so we all know what's coming, and document the
>plans in the changelog.
>
>waitid() probably remains on my plate anyway, and I hope understand
>where we're at with it.
>
> Arnd
On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <[email protected]> wrote:
> On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <[email protected]> wrote:
> > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]> wrote:
> > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> > > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > > particular didn't like the idea of having three new variants on x86
> > > > (depending on how you count). His alternative suggestion of having
> > > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > > should work correctly, but feels wrong to me, or at least inconsistent
> > > > with how we do this elsewhere.
> > The '548 | 0x40000000' part seems to be the only sensible
> > way to handle x32 here. What exactly would you propose to
> > avoid defining the other entry points?
>
> I would propose that it should be 335 | 0x40000000. I can't see any
> reasonable way to teach the kernel to reject 335 | 0x40000000 that
> wouldn't work just as well to accept it and make it do the right
> thing. Currently we accept it and do the *wrong* thing, which is no
> good.
>
> > and we have to
> > add more complexity to the copy_siginfo_from_user()
> > implementation to duplicate the hack that exists in
> > copy_siginfo_from_user32().
>
> What hack are you referring to here?
I mean this part:
#ifdef CONFIG_COMPAT
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
const struct kernel_siginfo *from)
#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
{
return __copy_siginfo_to_user32(to, from, in_x32_syscall());
}
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
const struct kernel_siginfo *from, bool x32_ABI)
#endif
{
...
case SIL_CHLD:
new.si_pid = from->si_pid;
new.si_uid = from->si_uid;
new.si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
if (x32_ABI) {
new._sifields._sigchld_x32._utime = from->si_utime;
new._sifields._sigchld_x32._stime = from->si_stime;
} else
#endif
{
new.si_utime = from->si_utime;
new.si_stime = from->si_stime;
}
break;
...
}
#endif
If we have a '548 | 0x40000000' entry pointing to
__x32_compat_sys_procfd_kill, then that will do the right
thing. If you instead try to have x32 call into the native
sys_procfd_kill, then copy_siginfo_to_user() will also have
to know about x32, effectively duplicating that mess above,
unless you want to also change all users of
copy_siginfo_to_user32() to use copy_siginfo_to_user()
and handle all cases in one function.
Arnd
On December 1, 2018 9:51:18 PM GMT+13:00, Arnd Bergmann <[email protected]> wrote:
>On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <[email protected]>
>wrote:
>> On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <[email protected]> wrote:
>> > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]>
>wrote:
>> > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]>
>wrote:
>> > > > siginfo_t as it is now still has a number of other downsides,
>and Andy in
>> > > > particular didn't like the idea of having three new variants on
>x86
>> > > > (depending on how you count). His alternative suggestion of
>having
>> > > > a single syscall entry point that takes a 'signfo_t __user *'
>but interprets
>> > > > it as compat_siginfo depending on
>in_compat_syscall()/in_x32_syscall()
>> > > > should work correctly, but feels wrong to me, or at least
>inconsistent
>> > > > with how we do this elsewhere.
>
>> > The '548 | 0x40000000' part seems to be the only sensible
>> > way to handle x32 here. What exactly would you propose to
>> > avoid defining the other entry points?
>>
>> I would propose that it should be 335 | 0x40000000. I can't see any
>> reasonable way to teach the kernel to reject 335 | 0x40000000 that
>> wouldn't work just as well to accept it and make it do the right
>> thing. Currently we accept it and do the *wrong* thing, which is no
>> good.
>>
>> > and we have to
>> > add more complexity to the copy_siginfo_from_user()
>> > implementation to duplicate the hack that exists in
>> > copy_siginfo_from_user32().
>>
>> What hack are you referring to here?
>
>I mean this part:
>
>#ifdef CONFIG_COMPAT
>int copy_siginfo_to_user32(struct compat_siginfo __user *to,
> const struct kernel_siginfo *from)
>#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
>{
> return __copy_siginfo_to_user32(to, from, in_x32_syscall());
>}
>int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
> const struct kernel_siginfo *from, bool x32_ABI)
>#endif
>{
>...
> case SIL_CHLD:
> new.si_pid = from->si_pid;
> new.si_uid = from->si_uid;
> new.si_status = from->si_status;
>#ifdef CONFIG_X86_X32_ABI
> if (x32_ABI) {
> new._sifields._sigchld_x32._utime = from->si_utime;
> new._sifields._sigchld_x32._stime = from->si_stime;
> } else
>#endif
> {
> new.si_utime = from->si_utime;
> new.si_stime = from->si_stime;
> }
> break;
>...
>}
>#endif
>
>If we have a '548 | 0x40000000' entry pointing to
>__x32_compat_sys_procfd_kill, then that will do the right
>thing. If you instead try to have x32 call into the native
>sys_procfd_kill, then copy_siginfo_to_user() will also have
>to know about x32, effectively duplicating that mess above,
>unless you want to also change all users of
>copy_siginfo_to_user32() to use copy_siginfo_to_user()
>and handle all cases in one function.
I've been looking into having siginfo64_t
with the new copy_siginfo_to_user64()
function. It looks like a pretty intricate
task. Are we sure that we want to go
down this road? I'm not sure that it'll be
worth it. Especially since we force yet
another signal struct on user space.
On Sat, Dec 1, 2018 at 9:51 AM Arnd Bergmann <[email protected]> wrote:
> On Sat, Dec 1, 2018 at 12:54 AM Andy Lutomirski <[email protected]> wrote:
> > On Fri, Nov 30, 2018 at 2:10 PM Arnd Bergmann <[email protected]> wrote:
> > > On Fri, Nov 30, 2018 at 5:36 PM Andy Lutomirski <[email protected]> wrote:
> > > > On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
> > > > > siginfo_t as it is now still has a number of other downsides, and Andy in
> > > > > particular didn't like the idea of having three new variants on x86
> > > > > (depending on how you count). His alternative suggestion of having
> > > > > a single syscall entry point that takes a 'signfo_t __user *' but interprets
> > > > > it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
> > > > > should work correctly, but feels wrong to me, or at least inconsistent
> > > > > with how we do this elsewhere.
>
> > > The '548 | 0x40000000' part seems to be the only sensible
> > > way to handle x32 here. What exactly would you propose to
> > > avoid defining the other entry points?
> >
> > I would propose that it should be 335 | 0x40000000. I can't see any
> > reasonable way to teach the kernel to reject 335 | 0x40000000 that
> > wouldn't work just as well to accept it and make it do the right
> > thing. Currently we accept it and do the *wrong* thing, which is no
> > good.
I guess we could start with something like the change below, which
would unify the entry points for rt_{tg,}sigqueueinfo, so that
e.g. the 129 and 536 syscall numbers do the exact same thing, and
that would be the lp64 or ilp32 behavior, depending on the
0x40000000 bit. For the new syscalls, we can then do the same
thing without assigning another number.
Arnd
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index d5252bc1e380..3233fb889a51 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -7,6 +7,11 @@
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
+#ifdef CONFIG_X86_X32_ABI
+#define __x64_sys_x86_rt_sigqueueinfo __x64_sys_rt_sigqueueinfo
+#define __x64_sys_x86_rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo
+#endif
+
/* this is a lie, but it does not hurt as sys_ni_syscall just returns
-EINVAL */
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const
struct pt_regs *);
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
b/arch/x86/entry/syscalls/syscall_64.tbl
index 0823eed2b02e..4a7393d34e03 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -137,7 +137,7 @@
126 common capset __x64_sys_capset
127 64 rt_sigpending __x64_sys_rt_sigpending
128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait
-129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo
+129 64 rt_sigqueueinfo __x64_sys_x86_rt_sigqueueinfo
130 common rt_sigsuspend __x64_sys_rt_sigsuspend
131 64 sigaltstack __x64_sys_sigaltstack
132 common utime __x64_sys_utime
@@ -305,7 +305,7 @@
294 common inotify_init1 __x64_sys_inotify_init1
295 64 preadv __x64_sys_preadv
296 64 pwritev __x64_sys_pwritev
-297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo
+297 64 rt_tgsigqueueinfo __x64_sys_x86_rt_tgsigqueueinfo
298 common perf_event_open __x64_sys_perf_event_open
299 64 recvmmsg __x64_sys_recvmmsg
300 common fanotify_init __x64_sys_fanotify_init
@@ -369,7 +369,7 @@
521 x32 ptrace __x32_compat_sys_ptrace
522 x32 rt_sigpending __x32_compat_sys_rt_sigpending
523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait
-524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo
+524 x32 rt_sigqueueinfo __x64_sys_x86_rt_sigqueueinfo
525 x32 sigaltstack __x32_compat_sys_sigaltstack
526 x32 timer_create __x32_compat_sys_timer_create
527 x32 mq_notify __x32_compat_sys_mq_notify
@@ -381,7 +381,7 @@
533 x32 move_pages __x32_compat_sys_move_pages
534 x32 preadv __x32_compat_sys_preadv64
535 x32 pwritev __x32_compat_sys_pwritev64
-536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo
+536 x32 rt_tgsigqueueinfo __x64_sys_x86_rt_tgsigqueueinfo
537 x32 recvmmsg __x32_compat_sys_recvmmsg
538 x32 sendmmsg __x32_compat_sys_sendmmsg
539 x32 process_vm_readv __x32_compat_sys_process_vm_readv
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 92a3b312a53c..2f16330cac83 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -892,4 +892,38 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
signal_fault(regs, frame, "x32 rt_sigreturn");
return 0;
}
+
+SYSCALL_DEFINE3(x86_rt_sigqueueinfo, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ kernel_siginfo_t info;
+ int ret;
+
+ if (!in_x32_syscall()
+ ret = __copy_siginfo_from_user(sig, &info, uinfo);
+ else
+ ret = __copy_siginfo_from_user32(sig, &info, uinfo);
+
+ if (unlikely(ret))
+ return ret;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
+
+SYSCALL_DEFINE3(x86_rt_tsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ kernel_siginfo_t info;
+ int ret;
+
+ if (!in_x32_syscall()
+ ret = __copy_siginfo_from_user(sig, &info, uinfo);
+ else
+ ret = __copy_siginfo_from_user32(sig, &info, uinfo);
+
+ if (unlikely(ret))
+ return ret;
+ return do_rt_tsigqueueinfo(tgid, pid, sig, &info);
+}
+
+
#endif
Andy Lutomirski <[email protected]> writes:
> On Fri, Nov 30, 2018 at 3:41 AM Arnd Bergmann <[email protected]> wrote:
>> siginfo_t as it is now still has a number of other downsides, and Andy in
>> particular didn't like the idea of having three new variants on x86
>> (depending on how you count). His alternative suggestion of having
>> a single syscall entry point that takes a 'signfo_t __user *' but interprets
>> it as compat_siginfo depending on in_compat_syscall()/in_x32_syscall()
>> should work correctly, but feels wrong to me, or at least inconsistent
>> with how we do this elsewhere.
>
> BTW, do we consider siginfo_t to be extensible? If so, and if we pass
> in a pointer, presumably we should pass a length as well.
siginfo is extensible in the sense that the structure is 128 bytes
and we use at most 48 bytes. siginfo gets embedded in stack frames when
signals get delivered so a size change upwards is non-trivial, and is
possibly and ABI break so I believe a length field would be pointless.
Eric
Arnd Bergmann <[email protected]> writes:
> On Fri, Nov 30, 2018 at 7:56 AM Christian Brauner <[email protected]> wrote:
>> On Thu, Nov 29, 2018 at 11:13:57PM -0600, Eric W. Biederman wrote:
>> > Arnd Bergmann <[email protected]> writes:
>> > > On Thu, Nov 29, 2018 at 9:14 PM Andy Lutomirski <[email protected]> wrote:
>> > >
>> > > It looks like we already have a 'struct signalfd_siginfo' that is defined in a
>> > > sane architecture-independent way, so I'd suggest we use that.
>> >
>> > Unfortunately it isn't maintained very well. What you can
>> > express with signalfd_siginfo is a subset what you can express with
>> > siginfo. Many of the linux extensions to siginfo for exception
>> > information add pointers and have integers right after those pointers.
>> > Not all of those linux specific extentions for exceptions are handled
>> > by signalfd_siginfo (it needs new fields).
>
> Would those fit in the 30 remaining padding bytes?
Probably. I expect at some point the technique signalfd has been using
won't scale. Most of what are missing are synchronous exceptions but
the crazy seccomp extensions might be missing as well. I say crazy
because seccomp places an integer immediately after a pointer which
makes 32bit 64bit compatibility impossible.
>> > As originally defined siginfo had the sigval_t union at the end so it
>> > was perfectly fine on both 32bit and 64bit as it only had a single
>> > pointer in the structure and the other fields were 32bits in size.
>
> The problem with sigval_t of course is that it is incompatible between
> 32-bit and 64-bit. We can add the same information, but at least on
> the syscall level that would have to be a __u64.
But sigval_t with nothing after it is not incompatible between 32bit and
64bit. With nothing after it sigval_t in struct siginfo already has
the extra 32bits you would need. It is sort of like transparently
adding a __u64 member to the union.
That is where we really are with sigval_t. Except for some crazy
corner cases.
>> > Although I do feel the pain as x86_64 has to deal with 3 versions
>> > of siginfo. A 64bit one. A 32bit one for ia32. A 32bit one for x32
>> > with a 64bit si_clock_t.
>
> At least you and Al have managed to get it down to a single source-level
> definition across all architectures, but there is also the (lesser) problem
> that the structure has a slightly different binary layout on each of the
> classic architectures.
>
> If we go with Andy's suggestion of having only a single binary layout
> on x86 for the new call, I'd argue that we should also make it have
> the exact same layout on all other architectures.
Yes.
>> > > We may then also want to make sure that any system call that takes a
>> > > siginfo has a replacement that takes a signalfd_siginfo, and that this
>> > > replacement can be used to implement the old version purely in
>> > > user space.
>> >
>> > If you are not implementing CRIU and reinserting exceptions to yourself.
>> > At most user space wants the ability to implement sigqueue:
>> >
>> > AKA:
>> > sigqueue(pid_t pid, int sig, const union sigval value);
>> >
>> > Well sigqueue with it's own si_codes so the following would cover all
>> > the use cases I know of:
>> > int sendnewsig(pid_t pid, int sig, int si_code, const union sigval value);
>> >
>> > The si_code could even be set to SI_USER to request that the normal
>> > trusted SI_USER values be filled in. si_code values of < 0 if not
>> > recognized could reasonably safely be treated as the _rt member of
>> > the siginfo union. Or perhaps better we error out in such a case.
>> >
>> > If we want to be flexible and not have N kinds of system calls that
>> > is the direction I would go. That is simple, and you don't need any of
>> > the rest.
>
> I'm not sure I understand what you are suggesting here. Would the
> low-level implementation of this be based on procfd, or do you
> mean that would be done for pid_t at the kernel level, plus another
> syscall for procfd?
I was thinking in general and for something that would be enough to back
sigqueue, and normal kill semantics.
For Christians proposed system call I can think of two ways we could go:
long procfd_sigsend(int fd, int sig, int si_code, __u64 sigval);
Where sigval is an appropriately written version of union sigval that
works on both 32bit and 64bit. Or we could go with:
long procfd_sigqueueinfo(int fd, struct siginfo_subset *info)
Both would support the same set of system calls today and both would
have some similar challenges. Both would take an si_code value of
SI_USER as a request that the kernel fill in the struct siginfo as kill
would. As otherwise it is invalid for userspace to send a signal with
SI_USER. I would not worry about the signal injection case for CRIU
where we explicitly allow any siginfo to be sent if we are sending it to
ourselves. We already have a system call for that.
What I would do is carefully define siginfo_subset architecturally as
a proper subset of siginfo on each architecture. That in most if not
all cases will work on 32bit and 64bit. Because of the extra padding
after union sigval.
That siginfo_subset userspace could just consider a siginfo_t. We would
just leave out the trouble some bits. So we could do a simple
untranslated copy_from_user on it, and then validate that si_code is in
a range we support.
That definition of siginfo_subset doesn't preclude extensions in the
future. And so it could very much be a pass through interface on 32bit
on 64bit and into everyone's existing stack frames that embed siginfo.
So if we want to fix the situation that is what I would do, as userspace
is not allowed to send any of the problematic siginfo definitions today.
There might be some finicky detail of union sigval or some architure
might have funny alignment between 32bit and 64bit, but I don't think
there is. If we want to be more flexible than my procfd_sigsend above
I think we should definitely look at that.
>> > Alternatively we abandon the mistake of sigqueueinfo and not allow
>> > a signal number in the arguments that differs from the si_signo in the
>> > siginfo and allow passing the entire thing unchanged from sender to
>> > receiver. That is maximum flexibility.
>
> This would be regardless of which flavor of siginfo (today's arch
> specific one, signalfd_siginfo, or a new one) we pass, right?
No. signalfd_siginfo offers much less future extensibility because it
must be translated. Once you can not just pass the structure through
you fail. As you can't be transparently forward compatible.
In a similar way Linux filesystems written assuming all the world was
ascii wound up forward compatible with utf8 as they are both byte
encodings of strings that can use a NULL terminator.
>> > signalfd_siginfo just sucks in practice. It is larger that siginfo 104
>> > bytes versus 48. We must deliver it to userspace as a siginfo so it
>> > must be translated. Because of the translation signalfd_siginfo adds
>> > no flexiblity in practice, because it can not just be passed through.
>> > Finallay signalfd_siginfo does not have encodings for all of the
>> > siginfo union members, so it fails to be fully general.
>> >
>> > Personally if I was to define signalfd_siginfo today I would make it:
>> > struct siginfo_subset {
>> > __u32 sis_signo;
>> > __s32 sis_errno;
>> > __s32 sis_code;
>> > __u32 sis_pad;
>> > __u32 sis_pid;
>> > __u32 sis_uid;
>> > __u64 sis_data (A pointer or integer data field);
>> > };
>> >
>> > That is just 32bytes, and is all that is needed for everything
>> > except for synchronous exceptions. Oh and that happens to be a proper
>> > subset of a any sane siginfo layout, on both 32bit and 64bit.
>
> And that would work for signalfd and waitid_time64, but not for
> procfd_signal/kill/sendsiginfo?
No. I was bringing this up because it will work for procfd_signal. See
above. It supports every signal userspace can send today. We should be
able to specify it in a way that is compatible with the existing siginfo
definitions on all architectures.
For signalfd or let me say a future signalfd2 the practical problem is
that we have Linux specific signal extensions that are not synchronous
signals like seccomp and sigchld that have fields in a bad location.
If we successfully define a siginfo_subset that is the same on
32bit and 64bit I would include the defintion in the kernels definition
of siginfo_t so new extensions are automatically picked up and require
any extensions to siginfo_t to be added to siginfo_subset_t, in the
appropriate 32bit/64bit way.
>> > This is one of those rare times where POSIX is sane and what linux
>> > has implemented is not.
>>
>> Thanks for the in-depth explanation. So your point is that we are better
>> off if we stick with siginfo_t instead of struct signalfd_siginfo in
>> procfd_signal()?
My point is that we are better sticking with siginfo_t or just cherry
picking the relevant fields and making them system call parameters.
The only advantage of using siginfo_t is forwards compatibility. If we
give up forward compatibility we can just as easily have our system call
prototype include "...(..., int sig, int code, union sigval value)"
for the definition of the signal bits. Simple easy and enough for all
posix signals.
Eric
It just occurs to me that the simple way to implement
procfd_sigqueueinfo info is like:
int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
{
#ifdef CONFIG_COMPAT
if (in_compat_syscall)
return copy_siginfo_from_user32(info, uinfo);
#endif
return copy_siginfo_from_user(info, uinfo);
}
long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
{
kernel_siginfo info;
if (copy_siginfo_from_user_any(&info, uinfo))
return -EFAULT;
...;
}
It looks like there is already a place in ptrace.c that already
hand rolls copy_siginfo_from_user_any.
So while I would love to figure out the subset of siginfo_t tha we can
just pass through, as I think that would make a better more forward
compatible copy_siginfo_from_user32. I think for this use case we just
add the in_compat_syscall test and then we just need to ensure this new
system call is placed in the proper places in the syscall table.
Because we will need 3 call sights: x86_64, x32 and ia32. As the layout
changes between those three subarchitecuters.
Eric
> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <[email protected]> wrote:
>
>
> It just occurs to me that the simple way to implement
> procfd_sigqueueinfo info is like:
>
> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> {
> #ifdef CONFIG_COMPAT
> if (in_compat_syscall)
> return copy_siginfo_from_user32(info, uinfo);
> #endif
> return copy_siginfo_from_user(info, uinfo);
> }
>
> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
> {
> kernel_siginfo info;
>
> if (copy_siginfo_from_user_any(&info, uinfo))
> return -EFAULT;
> ...;
> }
>
> It looks like there is already a place in ptrace.c that already
> hand rolls copy_siginfo_from_user_any.
>
> So while I would love to figure out the subset of siginfo_t tha we can
> just pass through, as I think that would make a better more forward
> compatible copy_siginfo_from_user32.
Seems reasonable to me. It’s less code overall than any other suggestion, too.
> I think for this use case we just
> add the in_compat_syscall test and then we just need to ensure this new
> system call is placed in the proper places in the syscall table.
>
> Because we will need 3 call sights: x86_64, x32 and ia32. As the layout
> changes between those three subarchitecuters.
>
>
If it’s done this way, it can just be “common” in the 64-bit table. And we kick the can a bit farther down the road :)
I’m working on patches to clean up x86’s syscall mess. It’s slow because I keep finding new messes. So far I have rt_sigreturn working like every other syscall — whee.
Also, Eric, for your edification, I have a draft patch set to radically simplify x86’s signal delivery and return. Once that’s done, I can trivially speed up delivery by a ton by using sysret.
On December 2, 2018 4:52:37 AM GMT+13:00, Andy Lutomirski <[email protected]> wrote:
>
>
>> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <[email protected]>
>wrote:
>>
>>
>> It just occurs to me that the simple way to implement
>> procfd_sigqueueinfo info is like:
>>
>> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t
>*uinfo)
>> {
>> #ifdef CONFIG_COMPAT
>> if (in_compat_syscall)
>> return copy_siginfo_from_user32(info, uinfo);
>> #endif
>> return copy_siginfo_from_user(info, uinfo);
>
>> }
>>
>> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
>> {
>> kernel_siginfo info;
>>
>> if (copy_siginfo_from_user_any(&info, uinfo))
>> return -EFAULT;
>> ...;
>> }
>>
>> It looks like there is already a place in ptrace.c that already
>> hand rolls copy_siginfo_from_user_any.
>>
>> So while I would love to figure out the subset of siginfo_t tha we
>can
>> just pass through, as I think that would make a better more forward
>> compatible copy_siginfo_from_user32.
>
>Seems reasonable to me. It’s less code overall than any other
>suggestion, too.
Thanks everyone, that was super helpful!
All things equal I'm going to send out an
updated version of the patch latest next week!
>
>> I think for this use case we just
>> add the in_compat_syscall test and then we just need to ensure this
>new
>> system call is placed in the proper places in the syscall table.
>>
>> Because we will need 3 call sights: x86_64, x32 and ia32. As the
>layout
>> changes between those three subarchitecuters.
>>
>>
>
>If it’s done this way, it can just be “common” in the 64-bit table. And
>we kick the can a bit farther down the road :)
>
>I’m working on patches to clean up x86’s syscall mess. It’s slow
>because I keep finding new messes. So far I have rt_sigreturn working
>like every other syscall — whee.
>
>Also, Eric, for your edification, I have a draft patch set to radically
>simplify x86’s signal delivery and return. Once that’s done, I can
>trivially speed up delivery by a ton by using sysret.
Andy Lutomirski <[email protected]> writes:
>> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <[email protected]> wrote:
>>
>>
>> It just occurs to me that the simple way to implement
>> procfd_sigqueueinfo info is like:
>>
>> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
>> {
>> #ifdef CONFIG_COMPAT
>> if (in_compat_syscall)
>> return copy_siginfo_from_user32(info, uinfo);
>> #endif
>> return copy_siginfo_from_user(info, uinfo);
>> }
>>
>> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
>> {
>> kernel_siginfo info;
>>
>> if (copy_siginfo_from_user_any(&info, uinfo))
>> return -EFAULT;
>> ...;
>> }
>>
>> It looks like there is already a place in ptrace.c that already
>> hand rolls copy_siginfo_from_user_any.
>>
>> So while I would love to figure out the subset of siginfo_t tha we can
>> just pass through, as I think that would make a better more forward
>> compatible copy_siginfo_from_user32.
>
> Seems reasonable to me. It’s less code overall than any other suggestion, too.
>
>> I think for this use case we just
>> add the in_compat_syscall test and then we just need to ensure this new
>> system call is placed in the proper places in the syscall table.
>>
>> Because we will need 3 call sights: x86_64, x32 and ia32. As the layout
>> changes between those three subarchitecuters.
>>
>>
>
> If it’s done this way, it can just be “common” in the 64-bit
> table. And we kick the can a bit farther down the road :)
>
> I’m working on patches to clean up x86’s syscall mess. It’s slow
> because I keep finding new messes. So far I have rt_sigreturn working
> like every other syscall — whee.
>
> Also, Eric, for your edification, I have a draft patch set to
> radically simplify x86’s signal delivery and return. Once that’s
> done, I can trivially speed up delivery by a ton by using sysret.
Nice.
Do we care about the performance of synchronous signal delivery (AKA
hardware exceptions) vs ordinary signal delivery. I get the feeling
there are serious simplifications to be had in that case.
Eric
On Sat, Dec 1, 2018 at 4:07 PM Eric W. Biederman <[email protected]> wrote:
>
> Andy Lutomirski <[email protected]> writes:
>
> >> On Dec 1, 2018, at 7:28 AM, Eric W. Biederman <[email protected]> wrote:
> >>
> >>
> >> It just occurs to me that the simple way to implement
> >> procfd_sigqueueinfo info is like:
> >>
> >> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> >> {
> >> #ifdef CONFIG_COMPAT
> >> if (in_compat_syscall)
> >> return copy_siginfo_from_user32(info, uinfo);
> >> #endif
> >> return copy_siginfo_from_user(info, uinfo);
> >> }
> >>
> >> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
> >> {
> >> kernel_siginfo info;
> >>
> >> if (copy_siginfo_from_user_any(&info, uinfo))
> >> return -EFAULT;
> >> ...;
> >> }
> >>
> >> It looks like there is already a place in ptrace.c that already
> >> hand rolls copy_siginfo_from_user_any.
> >>
> >> So while I would love to figure out the subset of siginfo_t tha we can
> >> just pass through, as I think that would make a better more forward
> >> compatible copy_siginfo_from_user32.
> >
> > Seems reasonable to me. It’s less code overall than any other suggestion, too.
> >
> >> I think for this use case we just
> >> add the in_compat_syscall test and then we just need to ensure this new
> >> system call is placed in the proper places in the syscall table.
> >>
> >> Because we will need 3 call sights: x86_64, x32 and ia32. As the layout
> >> changes between those three subarchitecuters.
> >>
> >>
> >
> > If it’s done this way, it can just be “common” in the 64-bit
> > table. And we kick the can a bit farther down the road :)
> >
> > I’m working on patches to clean up x86’s syscall mess. It’s slow
> > because I keep finding new messes. So far I have rt_sigreturn working
> > like every other syscall — whee.
> >
> > Also, Eric, for your edification, I have a draft patch set to
> > radically simplify x86’s signal delivery and return. Once that’s
> > done, I can trivially speed up delivery by a ton by using sysret.
>
> Nice.
>
> Do we care about the performance of synchronous signal delivery (AKA
> hardware exceptions) vs ordinary signal delivery. I get the feeling
> there are serious simplifications to be had in that case.
>
I dunno what user code cares about. Linux's support for synchronous
exception handling is so far behind, say, Windows, that I don't know
if it's even used for anything very serious. We should probably
profile it after I finish my changes and we can see how bad it is. We
can't do anything at all about the time it takes the CPU to deliver
the exception, and trying to avoid IRET when we return would be tricky
at best, although siglongjmp() might end up skipping it.
On Sat, Dec 01, 2018 at 09:28:47AM -0600, Eric W. Biederman wrote:
>
> It just occurs to me that the simple way to implement
> procfd_sigqueueinfo info is like:
>
> int copy_siginfo_from_user_any(kernel_siginfo_t *info, siginfo_t *uinfo)
> {
> #ifdef CONFIG_COMPAT
> if (in_compat_syscall)
> return copy_siginfo_from_user32(info, uinfo);
Right, though that would require a cast afaict.
static int __copy_siginfo_from_user_generic(int signo, kernel_siginfo_t *kinfo,
siginfo_t *info)
{
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
return __copy_siginfo_from_user32(
signo, kinfo, (struct compat_siginfo __user *)info);
#endif
return __copy_siginfo_from_user(signo, kinfo, info);
}
It seems that a cast to (compat_siginfo __user *) should be safe in this
context? I've at least seen similar things done for __sys_sendmsg().
> #endif
> return copy_siginfo_from_user(info, uinfo);
> }
>
> long procfd_sigqueueinfo(int fd, siginfo_t *uinfo)
**bikeshedding**
Not a fan of that name. I'm going to go with procfd_send_signal().
sigqueue gives non-native speakers a lot of room for spelling errors and
it always seemed opaque to me what this function is doing without
consulting the manpage. :)
> {
> kernel_siginfo info;
>
> if (copy_siginfo_from_user_any(&info, uinfo))
> return -EFAULT;
> ...;
> }
>
> It looks like there is already a place in ptrace.c that already
> hand rolls copy_siginfo_from_user_any.
>
> So while I would love to figure out the subset of siginfo_t tha we can
> just pass through, as I think that would make a better more forward
> compatible copy_siginfo_from_user32. I think for this use case we just
> add the in_compat_syscall test and then we just need to ensure this new
> system call is placed in the proper places in the syscall table.
>
> Because we will need 3 call sights: x86_64, x32 and ia32. As the layout
> changes between those three subarchitecuters.
>
> Eric
>
On Sat, Dec 01, 2018 at 12:52:24PM +1300, Christian Brauner wrote:
> On November 30, 2018 1:28:15 AM GMT+13:00, Florian Weimer <[email protected]> wrote:
> >Disclaimer: I'm looking at this patch because Christian requested it.
> >I'm not a kernel developer.
>
> Given all your expertise this really doesn't matter. :)
> You're the one having to deal with this
> in glibc after all.
> Thanks for doing this and sorry for the late reply.
> I missed that mail.
>
> >
> >* Christian Brauner:
> >
> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl
> >b/arch/x86/entry/syscalls/syscall_32.tbl
> >> index 3cf7b533b3d1..3f27ffd8ae87 100644
> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >> @@ -398,3 +398,4 @@
> >> 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
> >>
> >385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
> >> 386 i386 rseq sys_rseq __ia32_sys_rseq
> >>
> >+387 i386 procfd_signal sys_procfd_signal __ia32_compat_sys_procfd_signal
> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> >b/arch/x86/entry/syscalls/syscall_64.tbl
> >> index f0b1709a5ffb..8a30cde82450 100644
> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >> @@ -343,6 +343,7 @@
> >> 332 common statx __x64_sys_statx
> >> 333 common io_pgetevents __x64_sys_io_pgetevents
> >> 334 common rseq __x64_sys_rseq
> >> +335 64 procfd_signal __x64_sys_procfd_signal
> >>
> >> #
> >> # x32-specific system call numbers start at 512 to avoid cache
> >impact
> >> @@ -386,3 +387,4 @@
> >> 545 x32 execveat __x32_compat_sys_execveat/ptregs
> >> 546 x32 preadv2 __x32_compat_sys_preadv64v2
> >> 547 x32 pwritev2 __x32_compat_sys_pwritev64v2
> >> +548 x32 procfd_signal __x32_compat_sys_procfd_signal
> >
> >Is there a reason why these numbers have to be different?
> >
> >(See the recent discussion with Andy Lutomirski.)
> >
> >> +static int do_procfd_signal(int fd, int sig, kernel_siginfo_t
> >*kinfo, int flags,
> >> + bool had_siginfo)
> >> +{
> >> + int ret;
> >> + struct fd f;
> >> + struct pid *pid;
> >> +
> >> + /* Enforce flags be set to 0 until we add an extension. */
> >> + if (flags)
> >> + return -EINVAL;
> >> +
> >> + f = fdget_raw(fd);
> >> + if (!f.file)
> >> + return -EBADF;
> >> +
> >> + /* Is this a process file descriptor? */
> >> + ret = -EINVAL;
> >> + if (!proc_is_tgid_procfd(f.file))
> >> + goto err;
> >[…]
> >> + ret = kill_pid_info(sig, kinfo, pid);
> >
> >I would like to see some comment here what happens to zombie processes.
>
> You'd get ESRCH.
> I'm not sure if that has always been the case.
> Eric recently did some excellent refactoring of the signal code.
> Iirc, part of that involved not delivering signals to zombies.
> That's at least how I remember it.
> I don't have access to source code though atm.
Ok, I finally have access to source code again. Scratch what I said above!
I looked at the code and tested it. If the process has exited but not
yet waited upon aka is a zombie procfd_send_signal() will return 0. This
is identical to kill(2) behavior. It should've been sort-of obvious
since when a process is in zombie state /proc/<pid> will still be around
which means that struct pid must still be around.
>
> >
> >> +/**
> >> + * sys_procfd_signal - send a signal to a process through a process
> >file
> >> + * descriptor
> >> + * @fd: the file descriptor of the process
> >> + * @sig: signal to be sent
> >> + * @info: the signal info
> >> + * @flags: future flags to be passed
> >> + */
> >> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user
> >*, info,
> >> + int, flags)
> >
> >Sorry, I'm quite unhappy with the name. “signal” is for signal handler
> >management. procfd_sendsignal, procfd_sigqueueinfo or something like
> >that would be fine. Even procfd_kill would be better IMHO.
>
> Ok. I only have strong opinions to procfd_kill().
> Mainly because the new syscall takes
> the job of multiple other syscalls
> so kill gives the wrong impression.
> I'll come up with a better name in the next iteration.
>
> >
> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
> >the “tg” part with the current procfd_signal interface? Would you use
> >openat to retrieve the Tgid: line from "status"?
>
> Yes, the tg part can be implemented.
> As I pointed out in another mail my
> I is to make this work by using file
> descriptors for /proc/<pid>/task/<tid>.
> I don't want this in the initial patchset though.
> I prefer to slowly add those features
> once we have gotten the basic functionality
> in.
>
>
> >
> >Thanks,
> >Florian
>
* Christian Brauner:
> Ok, I finally have access to source code again. Scratch what I said above!
> I looked at the code and tested it. If the process has exited but not
> yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> is identical to kill(2) behavior. It should've been sort-of obvious
> since when a process is in zombie state /proc/<pid> will still be around
> which means that struct pid must still be around.
Should we make this state more accessible, by providing a different
error code?
Will the system call ever return ESRCH, given that you have a handle for
the process?
>> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
>> >the “tg” part with the current procfd_signal interface? Would you use
>> >openat to retrieve the Tgid: line from "status"?
>
> Yes, the tg part can be implemented.
I meant on top of the existing interface.
> As I pointed out in another mail my I is to make this work by using
> file descriptors for /proc/<pid>/task/<tid>. I don't want this in the
> initial patchset though. I prefer to slowly add those features once
> we have gotten the basic functionality in.
Do you want to land all this in one kernel release? I wonder how
applications are supposed to discover kernel support if functionality is
split across several kernel releases. If you get EINVAL or EBADF, it
may not be obvious what is going on.
What happens if you use the new interface with an O_PATH descriptor?
Thanks,
Florian
On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
> * Christian Brauner:
>
> > Ok, I finally have access to source code again. Scratch what I said above!
> > I looked at the code and tested it. If the process has exited but not
> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> > is identical to kill(2) behavior. It should've been sort-of obvious
> > since when a process is in zombie state /proc/<pid> will still be around
> > which means that struct pid must still be around.
>
> Should we make this state more accessible, by providing a different
> error code?
No, I don't think we want that. Imho, It's not really helpful. Signals
are still delivered to zombies. If zombie state were to always mean that
no-one is going to wait on this thread anymore then it would make sense
to me. But given that zombie can also mean that someone put a
sleep(1000) right before their wait() call in the parent it seems odd to
report back that it is a zombie.
>
> Will the system call ever return ESRCH, given that you have a handle for
> the process?
Yes, whenever you signal a process that has already been waited upon:
- get procfd handle referring to <proc>
- <proc> exits and is waited upon
- procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
>
> >> >Looking at the rt_tgsigqueueinfo interface, is there a way to implement
> >> >the “tg” part with the current procfd_signal interface? Would you use
> >> >openat to retrieve the Tgid: line from "status"?
> >
> > Yes, the tg part can be implemented.
>
> I meant on top of the existing interface.
See below.
>
> > As I pointed out in another mail my I is to make this work by using
> > file descriptors for /proc/<pid>/task/<tid>. I don't want this in the
> > initial patchset though. I prefer to slowly add those features once
> > we have gotten the basic functionality in.
>
> Do you want to land all this in one kernel release? I wonder how
> applications are supposed to discover kernel support if functionality is
> split across several kernel releases. If you get EINVAL or EBADF, it
> may not be obvious what is going on.
Sigh, I get that but I really don't want to have to land this in one big
chunk. I want this syscall to go in in a as soon as we can to fulfill
the most basic need: having a way that guarantees us that we signal the
process that we intended to signal.
The thread case is easy to implement on top of it. But I suspect we will
quibble about the exact semantics for a long time. Even now we have been
on multiple - justified - detrous. That's all pefectly fine and
expected. But if we have the basic functionality in we have time to do
all of that. We might even land it in the same kernel release still. I
really don't want to come of as tea-party-kernel-conservative here but I
have time-and-time again seen that making something fancy and cover ever
interesting feature in one patchset takes a very very long time.
If you care about userspace being able to detect that case I can return
EOPNOTSUPP when a tid descriptor is passed.
>
> What happens if you use the new interface with an O_PATH descriptor?
You get EINVAL. When an O_PATH file descriptor is created the kernel
will set file->f_op = &empty_fops at which point the check I added
if (!proc_is_tgid_procfd(f.file))
goto err;
will fail. Imho this is correct behavior since technically signaling a
struct pid is the equivalent of writing to a file and hence doesn't
purely operate on the file descriptor level.
Christian
On 2018-12-03, Christian Brauner <[email protected]> wrote:
> > > As I pointed out in another mail my I is to make this work by using
> > > file descriptors for /proc/<pid>/task/<tid>. I don't want this in the
> > > initial patchset though. I prefer to slowly add those features once
> > > we have gotten the basic functionality in.
> >
> > Do you want to land all this in one kernel release? I wonder how
> > applications are supposed to discover kernel support if functionality is
> > split across several kernel releases. If you get EINVAL or EBADF, it
> > may not be obvious what is going on.
>
> Sigh, I get that but I really don't want to have to land this in one big
> chunk. I want this syscall to go in in a as soon as we can to fulfill
> the most basic need: having a way that guarantees us that we signal the
> process that we intended to signal.
>
> The thread case is easy to implement on top of it. But I suspect we will
> quibble about the exact semantics for a long time. Even now we have been
> on multiple - justified - detrous. That's all pefectly fine and
> expected. But if we have the basic functionality in we have time to do
> all of that. We might even land it in the same kernel release still. I
> really don't want to come of as tea-party-kernel-conservative here but I
> have time-and-time again seen that making something fancy and cover ever
> interesting feature in one patchset takes a very very long time.
>
> If you care about userspace being able to detect that case I can return
> EOPNOTSUPP when a tid descriptor is passed.
Personally, I'm +1 on -EOPNOTSUPP so we can get an MVP merged, and add
new features in later patches.
> > What happens if you use the new interface with an O_PATH descriptor?
>
> You get EINVAL. When an O_PATH file descriptor is created the kernel
> will set file->f_op = &empty_fops at which point the check I added
> if (!proc_is_tgid_procfd(f.file))
> goto err;
> will fail. Imho this is correct behavior since technically signaling a
> struct pid is the equivalent of writing to a file and hence doesn't
> purely operate on the file descriptor level.
Not to mention that O_PATH file descriptors are a whole kettle of fish
when it comes to permission checking semantics.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
* Christian Brauner:
> On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
>> * Christian Brauner:
>>
>> > Ok, I finally have access to source code again. Scratch what I said above!
>> > I looked at the code and tested it. If the process has exited but not
>> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
>> > is identical to kill(2) behavior. It should've been sort-of obvious
>> > since when a process is in zombie state /proc/<pid> will still be around
>> > which means that struct pid must still be around.
>>
>> Should we make this state more accessible, by providing a different
>> error code?
>
> No, I don't think we want that. Imho, It's not really helpful. Signals
> are still delivered to zombies. If zombie state were to always mean that
> no-one is going to wait on this thread anymore then it would make sense
> to me. But given that zombie can also mean that someone put a
> sleep(1000) right before their wait() call in the parent it seems odd to
> report back that it is a zombie.
It allows for error checking that the recipient of a signal is still
running. It's obviously not reliable, but I think it could be helpful
in the context of closely cooperating processes.
>> Will the system call ever return ESRCH, given that you have a handle for
>> the process?
>
> Yes, whenever you signal a process that has already been waited upon:
> - get procfd handle referring to <proc>
> - <proc> exits and is waited upon
> - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
I see, thanks.
>> Do you want to land all this in one kernel release? I wonder how
>> applications are supposed to discover kernel support if functionality is
>> split across several kernel releases. If you get EINVAL or EBADF, it
>> may not be obvious what is going on.
>
> Sigh, I get that but I really don't want to have to land this in one big
> chunk. I want this syscall to go in in a as soon as we can to fulfill
> the most basic need: having a way that guarantees us that we signal the
> process that we intended to signal.
>
> The thread case is easy to implement on top of it. But I suspect we will
> quibble about the exact semantics for a long time. Even now we have been
> on multiple - justified - detrous. That's all pefectly fine and
> expected. But if we have the basic functionality in we have time to do
> all of that. We might even land it in the same kernel release still. I
> really don't want to come of as tea-party-kernel-conservative here but I
> have time-and-time again seen that making something fancy and cover ever
> interesting feature in one patchset takes a very very long time.
>
> If you care about userspace being able to detect that case I can return
> EOPNOTSUPP when a tid descriptor is passed.
I suppose that's fine. Or alternatively, when thread group support is
added, introduce a flag that applications have to use to enable it, so
that they can probe for support by checking support for the flag.
I wouldn't be opposed to a new system call like this either:
int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
But I think this is frowned upon on the kernel side.
>> What happens if you use the new interface with an O_PATH descriptor?
>
> You get EINVAL. When an O_PATH file descriptor is created the kernel
> will set file->f_op = &empty_fops at which point the check I added
> if (!proc_is_tgid_procfd(f.file))
> goto err;
> will fail. Imho this is correct behavior since technically signaling a
> struct pid is the equivalent of writing to a file and hence doesn't
> purely operate on the file descriptor level.
Yes, that's quite reasonable. Thanks.
Florian
On Tue, Dec 04, 2018 at 01:55:10PM +0100, Florian Weimer wrote:
> * Christian Brauner:
>
> > On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
> >> * Christian Brauner:
> >>
> >> > Ok, I finally have access to source code again. Scratch what I said above!
> >> > I looked at the code and tested it. If the process has exited but not
> >> > yet waited upon aka is a zombie procfd_send_signal() will return 0. This
> >> > is identical to kill(2) behavior. It should've been sort-of obvious
> >> > since when a process is in zombie state /proc/<pid> will still be around
> >> > which means that struct pid must still be around.
> >>
> >> Should we make this state more accessible, by providing a different
> >> error code?
> >
> > No, I don't think we want that. Imho, It's not really helpful. Signals
> > are still delivered to zombies. If zombie state were to always mean that
> > no-one is going to wait on this thread anymore then it would make sense
> > to me. But given that zombie can also mean that someone put a
> > sleep(1000) right before their wait() call in the parent it seems odd to
> > report back that it is a zombie.
>
> It allows for error checking that the recipient of a signal is still
> running. It's obviously not reliable, but I think it could be helpful
> in the context of closely cooperating processes.
>
> >> Will the system call ever return ESRCH, given that you have a handle for
> >> the process?
> >
> > Yes, whenever you signal a process that has already been waited upon:
> > - get procfd handle referring to <proc>
> > - <proc> exits and is waited upon
> > - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
>
> I see, thanks.
>
> >> Do you want to land all this in one kernel release? I wonder how
> >> applications are supposed to discover kernel support if functionality is
> >> split across several kernel releases. If you get EINVAL or EBADF, it
> >> may not be obvious what is going on.
> >
> > Sigh, I get that but I really don't want to have to land this in one big
> > chunk. I want this syscall to go in in a as soon as we can to fulfill
> > the most basic need: having a way that guarantees us that we signal the
> > process that we intended to signal.
> >
> > The thread case is easy to implement on top of it. But I suspect we will
> > quibble about the exact semantics for a long time. Even now we have been
> > on multiple - justified - detrous. That's all pefectly fine and
> > expected. But if we have the basic functionality in we have time to do
> > all of that. We might even land it in the same kernel release still. I
> > really don't want to come of as tea-party-kernel-conservative here but I
> > have time-and-time again seen that making something fancy and cover ever
> > interesting feature in one patchset takes a very very long time.
> >
> > If you care about userspace being able to detect that case I can return
> > EOPNOTSUPP when a tid descriptor is passed.
>
> I suppose that's fine. Or alternatively, when thread group support is
> added, introduce a flag that applications have to use to enable it, so
> that they can probe for support by checking support for the flag.
>
> I wouldn't be opposed to a new system call like this either:
>
> int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
>
> But I think this is frowned upon on the kernel side.
If this is purely about getting a procfd then I think this isn't really
necessary since you can get it from /proc/<pid> and
/proc/<pid>/task/<tid> so a syscall just for that is likely overkill.
However, I started to pick up the CLONE_FD patchset but ideally I would
like it to be way simpler to what was proposed back in the day (which is
not a critique, I just don't feel comfortable with bringing massive
patches to the table that I can barely judge wrt to their correctness.
:)). I have toyed around with this a little and I'm tempted to simply
have the syscall always return an fd for the process and not require a
separate flag for this. But I need to work through the details and this
is really far out into the (kernel) future.
>
> >> What happens if you use the new interface with an O_PATH descriptor?
> >
> > You get EINVAL. When an O_PATH file descriptor is created the kernel
> > will set file->f_op = &empty_fops at which point the check I added
> > if (!proc_is_tgid_procfd(f.file))
> > goto err;
> > will fail. Imho this is correct behavior since technically signaling a
> > struct pid is the equivalent of writing to a file and hence doesn't
> > purely operate on the file descriptor level.
>
> Yes, that's quite reasonable. Thanks.
>
> Florian
> On Dec 4, 2018, at 4:55 AM, Florian Weimer <[email protected]> wrote:
>
> * Christian Brauner:
>
>>> On Mon, Dec 03, 2018 at 05:57:51PM +0100, Florian Weimer wrote:
>>> * Christian Brauner:
>>>
>>>> Ok, I finally have access to source code again. Scratch what I said above!
>>>> I looked at the code and tested it. If the process has exited but not
>>>> yet waited upon aka is a zombie procfd_send_signal() will return 0. This
>>>> is identical to kill(2) behavior. It should've been sort-of obvious
>>>> since when a process is in zombie state /proc/<pid> will still be around
>>>> which means that struct pid must still be around.
>>>
>>> Should we make this state more accessible, by providing a different
>>> error code?
>>
>> No, I don't think we want that. Imho, It's not really helpful. Signals
>> are still delivered to zombies. If zombie state were to always mean that
>> no-one is going to wait on this thread anymore then it would make sense
>> to me. But given that zombie can also mean that someone put a
>> sleep(1000) right before their wait() call in the parent it seems odd to
>> report back that it is a zombie.
>
> It allows for error checking that the recipient of a signal is still
> running. It's obviously not reliable, but I think it could be helpful
> in the context of closely cooperating processes.
>
>>> Will the system call ever return ESRCH, given that you have a handle for
>>> the process?
>>
>> Yes, whenever you signal a process that has already been waited upon:
>> - get procfd handle referring to <proc>
>> - <proc> exits and is waited upon
>> - procfd_send_signal(procfd, ...) returns -1 with errno == ESRCH
>
> I see, thanks.
>
>>> Do you want to land all this in one kernel release? I wonder how
>>> applications are supposed to discover kernel support if functionality is
>>> split across several kernel releases. If you get EINVAL or EBADF, it
>>> may not be obvious what is going on.
>>
>> Sigh, I get that but I really don't want to have to land this in one big
>> chunk. I want this syscall to go in in a as soon as we can to fulfill
>> the most basic need: having a way that guarantees us that we signal the
>> process that we intended to signal.
>>
>> The thread case is easy to implement on top of it. But I suspect we will
>> quibble about the exact semantics for a long time. Even now we have been
>> on multiple - justified - detrous. That's all pefectly fine and
>> expected. But if we have the basic functionality in we have time to do
>> all of that. We might even land it in the same kernel release still. I
>> really don't want to come of as tea-party-kernel-conservative here but I
>> have time-and-time again seen that making something fancy and cover ever
>> interesting feature in one patchset takes a very very long time.
>>
>> If you care about userspace being able to detect that case I can return
>> EOPNOTSUPP when a tid descriptor is passed.
>
> I suppose that's fine. Or alternatively, when thread group support is
> added, introduce a flag that applications have to use to enable it, so
> that they can probe for support by checking support for the flag.
>
> I wouldn't be opposed to a new system call like this either:
>
> int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
>
> But I think this is frowned upon on the kernel side.
I have no problem with it, except that I think it shouldn’t return an
fd that can be used for proc filesystem access.
* Andy Lutomirski:
>> I suppose that's fine. Or alternatively, when thread group support is
>> added, introduce a flag that applications have to use to enable it, so
>> that they can probe for support by checking support for the flag.
>>
>> I wouldn't be opposed to a new system call like this either:
>>
>> int procfd_open (pid_t thread_group, pid_t thread_id, unsigned flags);
>>
>> But I think this is frowned upon on the kernel side.
>
> I have no problem with it, except that I think it shouldn’t return an
> fd that can be used for proc filesystem access.
Oh no, my intention was that it would just be used with *_send_signal
and related functions.
Thanks,
Florian
On December 7, 2018 7:56:44 AM GMT+13:00, Florian Weimer <[email protected]> wrote:
>* Andy Lutomirski:
>
>>> I suppose that's fine. Or alternatively, when thread group support
>is
>>> added, introduce a flag that applications have to use to enable it,
>so
>>> that they can probe for support by checking support for the flag.
>>>
>>> I wouldn't be opposed to a new system call like this either:
>>>
>>> int procfd_open (pid_t thread_group, pid_t thread_id, unsigned
>flags);
>>>
>>> But I think this is frowned upon on the kernel side.
>>
>> I have no problem with it, except that I think it shouldn’t return an
>> fd that can be used for proc filesystem access.
>
>Oh no, my intention was that it would just be used with *_send_signal
>and related functions.
Let's postpone that discussion a little.
I think we don't need a syscall to base this off of pids.
As I said I rather send my revived version of CLONE_NEWFD that would serve the same task.
The same way we could also just add a new open() flag that blocks fs access completely.
I just pitched that idea to Serge a few days back: O_NOCHDIR or similar.
That could even be part of Aleksa's path resolution patchset.
>
>Thanks,
>Florian
Is it possible to avoid adding any syscall?
Since holding /proc/pid/reg_file can also hold the pid.
With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
introduced to identify tasks, the kernel generates
a uuid for every task when created.
save_pid_uuid_pair_for_later_kill(int pid) {
/* save via /proc/$pid/uuid */
/* don't need to keep any fd after save */
}
safe_kill(pid, uuid, sig) {
fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
open() successes */
if (open successes and read uuid from fd and if it equals to uuid)
kill(pid, sig)
close(fd)
}
All things needed to be done is to implement /proc/pid/uuid. And if pid can't
be recycled within 1 ticket, or the user can ensure it. The user can use
starttime(in /proc/pid/stat) instead.
save_pid_starttime_pair_for_later_kill(int pid) {
/* save via /proc/$pid/stat */
/* don't need to keep any fd after save or keep it for 1 ticket at most */
}
safe_kill(pid, starttime, sig) {
fd = open(/proc/$pid/stat); /* also hold the pid until close() if
open() successes */
if (open successes and read starttime from fd and if it equals to starttime)
kill(pid, sig)
close(fd)
}
In this case, zero LOC is added in the kernel. All of it depends on
the guarantee that holding /proc/pid/reg_file also holds the pid,
one of which I haven't checked carefully either.
On Fri, Dec 7, 2018 at 3:05 AM Christian Brauner <[email protected]> wrote:
>
> On December 7, 2018 7:56:44 AM GMT+13:00, Florian Weimer <[email protected]> wrote:
> >* Andy Lutomirski:
> >
> >>> I suppose that's fine. Or alternatively, when thread group support
> >is
> >>> added, introduce a flag that applications have to use to enable it,
> >so
> >>> that they can probe for support by checking support for the flag.
> >>>
> >>> I wouldn't be opposed to a new system call like this either:
> >>>
> >>> int procfd_open (pid_t thread_group, pid_t thread_id, unsigned
> >flags);
> >>>
> >>> But I think this is frowned upon on the kernel side.
> >>
> >> I have no problem with it, except that I think it shouldn’t return an
> >> fd that can be used for proc filesystem access.
> >
> >Oh no, my intention was that it would just be used with *_send_signal
> >and related functions.
>
> Let's postpone that discussion a little.
> I think we don't need a syscall to base this off of pids.
> As I said I rather send my revived version of CLONE_NEWFD that would serve the same task.
> The same way we could also just add a new open() flag that blocks fs access completely.
> I just pitched that idea to Serge a few days back: O_NOCHDIR or similar.
> That could even be part of Aleksa's path resolution patchset.
>
> >
> >Thanks,
> >Florian
>
On Tue, Dec 25, 2018 at 1:32 PM Lai Jiangshan
<[email protected]> wrote:
>
> Is it possible to avoid adding any syscall?
>
> Since holding /proc/pid/reg_file can also hold the pid.
> With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
> introduced to identify tasks, the kernel generates
> a uuid for every task when created.
>
> save_pid_uuid_pair_for_later_kill(int pid) {
> /* save via /proc/$pid/uuid */
> /* don't need to keep any fd after save */
> }
>
> safe_kill(pid, uuid, sig) {
> fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
> open() successes */
> if (open successes and read uuid from fd and if it equals to uuid)
> kill(pid, sig)
> close(fd)
> }
>
> All things needed to be done is to implement /proc/pid/uuid. And if pid can't
> be recycled within 1 ticket, or the user can ensure it. The user can use
> starttime(in /proc/pid/stat) instead.
>
> save_pid_starttime_pair_for_later_kill(int pid) {
> /* save via /proc/$pid/stat */
> /* don't need to keep any fd after save or keep it for 1 ticket at most */
> }
>
> safe_kill(pid, starttime, sig) {
> fd = open(/proc/$pid/stat); /* also hold the pid until close() if
> open() successes */
> if (open successes and read starttime from fd and if it equals to starttime)
> kill(pid, sig)
> close(fd)
> }
>
> In this case, zero LOC is added in the kernel. All of it depends on
> the guarantee that holding /proc/pid/reg_file also holds the pid,
> one of which I haven't checked carefully either.
>
Oh, Sorry, I was wrong, the pid isn't reserved even when
the fd is kept in the user space. And I'm sorry that I had
replied to an "old" email thread.
On 2018-12-25, Lai Jiangshan <[email protected]> wrote:
> On Tue, Dec 25, 2018 at 1:32 PM Lai Jiangshan
> <[email protected]> wrote:
> >
> > Is it possible to avoid adding any syscall?
> >
> > Since holding /proc/pid/reg_file can also hold the pid.
> > With this guarantee, /proc/pid/uuid (universally unique identifier ) can be
> > introduced to identify tasks, the kernel generates
> > a uuid for every task when created.
> >
> > save_pid_uuid_pair_for_later_kill(int pid) {
> > /* save via /proc/$pid/uuid */
> > /* don't need to keep any fd after save */
> > }
> >
> > safe_kill(pid, uuid, sig) {
> > fd = open(/proc/$pid/uuid); /* also hold the pid until close() if
> > open() successes */
> > if (open successes and read uuid from fd and if it equals to uuid)
> > kill(pid, sig)
> > close(fd)
> > }
> >
> > All things needed to be done is to implement /proc/pid/uuid. And if pid can't
> > be recycled within 1 ticket, or the user can ensure it. The user can use
> > starttime(in /proc/pid/stat) instead.
> >
> > save_pid_starttime_pair_for_later_kill(int pid) {
> > /* save via /proc/$pid/stat */
> > /* don't need to keep any fd after save or keep it for 1 ticket at most */
> > }
> >
> > safe_kill(pid, starttime, sig) {
> > fd = open(/proc/$pid/stat); /* also hold the pid until close() if
> > open() successes */
> > if (open successes and read starttime from fd and if it equals to starttime)
> > kill(pid, sig)
> > close(fd)
> > }
> >
> > In this case, zero LOC is added in the kernel. All of it depends on
> > the guarantee that holding /proc/pid/reg_file also holds the pid,
> > one of which I haven't checked carefully either.
> >
>
> Oh, Sorry, I was wrong, the pid isn't reserved even when
> the fd is kept in the user space. And I'm sorry that I had
> replied to an "old" email thread.
Don't worry, this was a common point of confusion during this (and
sister) threads. All the fd ensures is that access through that fd will
give you -ESRCH if the process is gone (and if the PID is reused it will
still give you -ESRCH).
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>