Linux now exposes signals, timers, and events via file descriptors
through signalfd, timerfd, and eventfd. This means programmers can use a
single select/[e]poll call to monitor all change in their program. This
patch aims to expose child death via the same mechanism.
waitfd provides a file descriptor out of which may be read a series of
siginfo_t objects describing child death. A child process is reaped as
soon as its information is read. This means child monitoring too can be
performed with that same poll call.
Patch is against v2.6.28
--CJD
diff --git a/arch/x86/include/asm/unistd_32.h
b/arch/x86/include/asm/unistd_32.h
index f2bba78..134d83c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_waitfd 333
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h
b/arch/x86/include/asm/unistd_64.h
index d2e415e..b28eb07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_waitfd 295
+__SYSCALL(__NR_waitfd, sys_waitfd)
#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S
b/arch/x86/kernel/syscall_table_32.S
index d44395f..c796a8b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_waitfd
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..74c31fb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
+obj-$(CONFIG_WAITFD) += waitfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644
index 0000000..0155a83
--- /dev/null
+++ b/fs/waitfd.c
@@ -0,0 +1,117 @@
+/*
+ * fs/waitfd.c
+ *
+ * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
+ *
+ * Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru);
+
+struct waitfd_ctx {
+ int ops;
+ int which;
+ pid_t upid;
+};
+
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ long value;
+
+ poll_wait(file, ¤t->signal->wait_chldexit, wait);
+
+ value = do_waitid(ctx->which, ctx->upid, NULL,
+ ctx->ops | WNOHANG | WNOWAIT, NULL);
+ if (value > 0 || value == -ECHILD)
+ return POLLIN;
+
+ return 0;
+}
+
+/*
+ * Returns a multiple of the size of a struct siginfo, or a negative
+ * error code. The "count" parameter must be at least sizeof(struct
siginfo)
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t
count,
+ loff_t *ppos)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ struct siginfo __user *info_addr = (struct siginfo *)buf;
+ int flags = ctx->ops;
+ ssize_t ret, total = 0;
+
+ count /= sizeof(struct siginfo);
+ if (!count)
+ return -EINVAL;
+
+ do {
+ ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+ if (ret == 0)
+ ret = -EAGAIN;
+ if (ret == -ECHILD)
+ ret = 0;
+ if (ret <= 0)
+ break;
+
+ info_addr++;
+ total += sizeof(struct siginfo);
+ flags |= WNOHANG;
+ } while (--count);
+
+ return total ? total: ret;
+}
+
+static const struct file_operations waitfd_fops = {
+ .release = waitfd_release,
+ .poll = waitfd_poll,
+ .read = waitfd_read,
+};
+
+asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
+{
+ int ufd;
+ struct waitfd_ctx *ctx;
+
+ /* Just to make sure we don't end up with a sys_waitfd4 */
+ (void)unused;
+
+ if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = options;
+ ctx->upid = upid;
+ ctx->which = which;
+
+ ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
+ (options & WNOHANG) ? O_NONBLOCK : 0);
+ if (ufd < 0)
+ kfree(ctx);
+
+ return ufd;
+}
diff --git a/init/Kconfig b/init/Kconfig
index f763762..bc34871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config EPOLL
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.
+config WAITFD
+ bool "Enable waitfd() system call" if EMBEDDED
+ select ANON_INODES
+ default y
+ help
+ Enable the waitfd() system call that allows receving child state
+ changes from a file descriptor.
+
+ If unsure, say Y.
+
config SIGNALFD
bool "Enable signalfd() system call" if EMBEDDED
select ANON_INODES
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..b53e8ba 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct
task_struct *p, pid_t pid, uid_t uid,
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
put_task_struct(p);
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
+ if (infop) {
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
if (!retval)
retval = pid;
return retval;
@@ -1727,35 +1729,12 @@ repeat:
end:
current->state = TASK_RUNNING;
remove_wait_queue(¤t->signal->wait_chldexit,&wait);
- if (infop) {
- if (retval > 0)
- retval = 0;
- else {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!retval)
- retval = put_user(0, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user(0, &infop->si_code);
- if (!retval)
- retval = put_user(0, &infop->si_pid);
- if (!retval)
- retval = put_user(0, &infop->si_uid);
- if (!retval)
- retval = put_user(0, &infop->si_status);
- }
- }
return retval;
}
-asmlinkage long sys_waitid(int which, pid_t upid,
- struct siginfo __user *infop, int options,
- struct rusage __user *ru)
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
{
struct pid *pid = NULL;
enum pid_type type;
@@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
ret = do_wait(type, pid, options, infop, NULL, ru);
put_pid(pid);
+ return ret;
+}
+
+asmlinkage long sys_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
+{
+ long ret;
+
+ ret = do_waitid(which, upid, infop, options, ru);
+
+ if (ret > 0)
+ ret = 0;
+ else {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!ret)
+ ret = put_user(0, &infop->si_signo);
+ if (!ret)
+ ret = put_user(0, &infop->si_errno);
+ if (!ret)
+ ret = put_user(0, &infop->si_code);
+ if (!ret)
+ ret = put_user(0, &infop->si_pid);
+ if (!ret)
+ ret = put_user(0, &infop->si_uid);
+ if (!ret)
+ ret = put_user(0, &infop->si_status);
+ }
+
/* avoid REGPARM breakage on x86: */
asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..e8d4da6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
cond_syscall(sys_ioprio_get);
/* New file descriptors */
+cond_syscall(sys_waitfd);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);
> waitfd provides a file descriptor out of which may be read a series of
> siginfo_t objects describing child death. A child process is reaped as
> soon as its information is read. This means child monitoring too can be
> performed with that same poll call.
>
> Patch is against v2.6.28
Same comment as last time - the right way to do this is to extend the
existing notify APIs to the /proc file system
Casey Dahlin wrote:
> Linux now exposes signals, timers, and events via file descriptors
> through signalfd, timerfd, and eventfd. This means programmers can use a
> single select/[e]poll call to monitor all change in their program. This
> patch aims to expose child death via the same mechanism.
>
> waitfd provides a file descriptor out of which may be read a series of
> siginfo_t objects describing child death. A child process is reaped as
> soon as its information is read. This means child monitoring too can be
> performed with that same poll call.
>
> Patch is against v2.6.28
>
> --CJD
>
> diff --git a/arch/x86/include/asm/unistd_32.h
> b/arch/x86/include/asm/unistd_32.h
> index f2bba78..134d83c 100644
> --- a/arch/x86/include/asm/unistd_32.h
> +++ b/arch/x86/include/asm/unistd_32.h
> @@ -338,6 +338,7 @@
> #define __NR_dup3 330
> #define __NR_pipe2 331
> #define __NR_inotify_init1 332
> +#define __NR_waitfd 333
>
> #ifdef __KERNEL__
>
> diff --git a/arch/x86/include/asm/unistd_64.h
> b/arch/x86/include/asm/unistd_64.h
> index d2e415e..b28eb07 100644
> --- a/arch/x86/include/asm/unistd_64.h
> +++ b/arch/x86/include/asm/unistd_64.h
> @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
> __SYSCALL(__NR_pipe2, sys_pipe2)
> #define __NR_inotify_init1 294
> __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
> +#define __NR_waitfd 295
> +__SYSCALL(__NR_waitfd, sys_waitfd)
>
Only for x86??
>
> #ifndef __NO_STUBS
> diff --git a/arch/x86/kernel/syscall_table_32.S
> b/arch/x86/kernel/syscall_table_32.S
> index d44395f..c796a8b 100644
> --- a/arch/x86/kernel/syscall_table_32.S
> +++ b/arch/x86/kernel/syscall_table_32.S
> @@ -332,3 +332,4 @@ ENTRY(sys_call_table)
> .long sys_dup3 /* 330 */
> .long sys_pipe2
> .long sys_inotify_init1
> + .long sys_waitfd
> diff --git a/fs/Makefile b/fs/Makefile
> index d9f8afe..74c31fb 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
> obj-$(CONFIG_EPOLL) += eventpoll.o
> obj-$(CONFIG_ANON_INODES) += anon_inodes.o
> obj-$(CONFIG_SIGNALFD) += signalfd.o
> +obj-$(CONFIG_WAITFD) += waitfd.o
> obj-$(CONFIG_TIMERFD) += timerfd.o
> obj-$(CONFIG_EVENTFD) += eventfd.o
> obj-$(CONFIG_AIO) += aio.o
> diff --git a/fs/waitfd.c b/fs/waitfd.c
> new file mode 100644
> index 0000000..0155a83
> --- /dev/null
> +++ b/fs/waitfd.c
> @@ -0,0 +1,117 @@
> +/*
> + * fs/waitfd.c
> + *
> + * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
> + *
> + * Largely derived from fs/signalfd.c
> + */
> +
> +#include <linux/file.h>
> +#include <linux/poll.h>
> +#include <linux/init.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/kernel.h>
> +#include <linux/signal.h>
> +#include <linux/list.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/syscalls.h>
> +
> +long do_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru);
> +
> +struct waitfd_ctx {
> + int ops;
> + int which;
> + pid_t upid;
> +};
> +
Please use kernel coding style: use tabs to indent, not <lots-of-spaces>,
and struct members, functions, etc., are indented by one tab stop minimum.
> +static int waitfd_release(struct inode *inode, struct file *file)
> +{
> + kfree(file->private_data);
> + return 0;
> +}
> +
> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + long value;
> +
> + poll_wait(file, ¤t->signal->wait_chldexit, wait);
> +
> + value = do_waitid(ctx->which, ctx->upid, NULL,
> + ctx->ops | WNOHANG | WNOWAIT, NULL);
> + if (value > 0 || value == -ECHILD)
> + return POLLIN;
> +
> + return 0;
> +}
> +
> +/*
> + * Returns a multiple of the size of a struct siginfo, or a negative
> + * error code. The "count" parameter must be at least sizeof(struct
> siginfo)
> + */
> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t
> count,
> + loff_t *ppos)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + struct siginfo __user *info_addr = (struct siginfo *)buf;
> + int flags = ctx->ops;
> + ssize_t ret, total = 0;
> +
> + count /= sizeof(struct siginfo);
> + if (!count)
> + return -EINVAL;
> +
> + do {
> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
> + if (ret == 0)
> + ret = -EAGAIN;
> + if (ret == -ECHILD)
> + ret = 0;
> + if (ret <= 0)
> + break;
> +
> + info_addr++;
> + total += sizeof(struct siginfo);
> + flags |= WNOHANG;
> + } while (--count);
> +
> + return total ? total: ret;
return total ? total : ret;
please.
> +}
> +
> +static const struct file_operations waitfd_fops = {
> + .release = waitfd_release,
> + .poll = waitfd_poll,
> + .read = waitfd_read,
> +};
> +
> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
> +{
> + int ufd;
> + struct waitfd_ctx *ctx;
> +
> + /* Just to make sure we don't end up with a sys_waitfd4 */
> + (void)unused;
> +
> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
> + return -EINVAL;
> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
> + return -EINVAL;
Use spaces around '|'.
> +
> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> + if (!ctx)
> + return -ENOMEM;
> +
> + ctx->ops = options;
> + ctx->upid = upid;
> + ctx->which = which;
> +
> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
> + (options & WNOHANG) ? O_NONBLOCK : 0);
> + if (ufd < 0)
> + kfree(ctx);
> +
> + return ufd;
> +}
> diff --git a/init/Kconfig b/init/Kconfig
> index f763762..bc34871 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -683,6 +683,16 @@ config EPOLL
> Disabling this option will cause the kernel to be built without
> support for epoll family of system calls.
>
> +config WAITFD
> + bool "Enable waitfd() system call" if EMBEDDED
> + select ANON_INODES
> + default y
> + help
> + Enable the waitfd() system call that allows receving child state
receiving
Kconfig help text should be indented by <tab><space><space>.
> + changes from a file descriptor.
> +
> + If unsure, say Y.
> +
> config SIGNALFD
> bool "Enable signalfd() system call" if EMBEDDED
> select ANON_INODES
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 2d8be7e..b53e8ba 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct
> task_struct *p, pid_t pid, uid_t uid,
> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
>
> put_task_struct(p);
> - if (!retval)
> - retval = put_user(SIGCHLD, &infop->si_signo);
> - if (!retval)
> - retval = put_user(0, &infop->si_errno);
> - if (!retval)
> - retval = put_user((short)why, &infop->si_code);
> - if (!retval)
> - retval = put_user(pid, &infop->si_pid);
> - if (!retval)
> - retval = put_user(uid, &infop->si_uid);
> - if (!retval)
> - retval = put_user(status, &infop->si_status);
> + if (infop) {
> + if (!retval)
> + retval = put_user(SIGCHLD, &infop->si_signo);
> + if (!retval)
> + retval = put_user(0, &infop->si_errno);
> + if (!retval)
> + retval = put_user((short)why, &infop->si_code);
> + if (!retval)
> + retval = put_user(pid, &infop->si_pid);
> + if (!retval)
> + retval = put_user(uid, &infop->si_uid);
> + if (!retval)
> + retval = put_user(status, &infop->si_status);
> + }
> if (!retval)
> retval = pid;
> return retval;
> @@ -1727,35 +1729,12 @@ repeat:
> end:
> current->state = TASK_RUNNING;
> remove_wait_queue(¤t->signal->wait_chldexit,&wait);
> - if (infop) {
> - if (retval > 0)
> - retval = 0;
> - else {
> - /*
> - * For a WNOHANG return, clear out all the fields
> - * we would set so the user can easily tell the
> - * difference.
> - */
> - if (!retval)
> - retval = put_user(0, &infop->si_signo);
> - if (!retval)
> - retval = put_user(0, &infop->si_errno);
> - if (!retval)
> - retval = put_user(0, &infop->si_code);
> - if (!retval)
> - retval = put_user(0, &infop->si_pid);
> - if (!retval)
> - retval = put_user(0, &infop->si_uid);
> - if (!retval)
> - retval = put_user(0, &infop->si_status);
> - }
> - }
> return retval;
> }
>
> -asmlinkage long sys_waitid(int which, pid_t upid,
> - struct siginfo __user *infop, int options,
> - struct rusage __user *ru)
> +long do_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru)
> {
> struct pid *pid = NULL;
> enum pid_type type;
> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
> ret = do_wait(type, pid, options, infop, NULL, ru);
> put_pid(pid);
>
> + return ret;
> +}
> +
> +asmlinkage long sys_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru)
> +{
> + long ret;
> +
> + ret = do_waitid(which, upid, infop, options, ru);
> +
> + if (ret > 0)
> + ret = 0;
> + else {
> + /*
> + * For a WNOHANG return, clear out all the fields
> + * we would set so the user can easily tell the
> + * difference.
> + */
> + if (!ret)
> + ret = put_user(0, &infop->si_signo);
> + if (!ret)
> + ret = put_user(0, &infop->si_errno);
> + if (!ret)
> + ret = put_user(0, &infop->si_code);
> + if (!ret)
> + ret = put_user(0, &infop->si_pid);
> + if (!ret)
> + ret = put_user(0, &infop->si_uid);
> + if (!ret)
> + ret = put_user(0, &infop->si_status);
> + }
> +
> /* avoid REGPARM breakage on x86: */
> asmlinkage_protect(5, ret, which, upid, infop, options, ru);
> return ret;
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index e14a232..e8d4da6 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
> cond_syscall(sys_ioprio_get);
>
> /* New file descriptors */
> +cond_syscall(sys_waitfd);
> cond_syscall(sys_signalfd);
> cond_syscall(sys_signalfd4);
> cond_syscall(compat_sys_signalfd);
--
~Randy
Randy Dunlap wrote:
> Casey Dahlin wrote:
>
>> Linux now exposes signals, timers, and events via file descriptors
>> through signalfd, timerfd, and eventfd. This means programmers can use a
>> single select/[e]poll call to monitor all change in their program. This
>> patch aims to expose child death via the same mechanism.
>>
>> waitfd provides a file descriptor out of which may be read a series of
>> siginfo_t objects describing child death. A child process is reaped as
>> soon as its information is read. This means child monitoring too can be
>> performed with that same poll call.
>>
>> Patch is against v2.6.28
>>
>> --CJD
>>
>> diff --git a/arch/x86/include/asm/unistd_32.h
>> b/arch/x86/include/asm/unistd_32.h
>> index f2bba78..134d83c 100644
>> --- a/arch/x86/include/asm/unistd_32.h
>> +++ b/arch/x86/include/asm/unistd_32.h
>> @@ -338,6 +338,7 @@
>> #define __NR_dup3 330
>> #define __NR_pipe2 331
>> #define __NR_inotify_init1 332
>> +#define __NR_waitfd 333
>>
>> #ifdef __KERNEL__
>>
>> diff --git a/arch/x86/include/asm/unistd_64.h
>> b/arch/x86/include/asm/unistd_64.h
>> index d2e415e..b28eb07 100644
>> --- a/arch/x86/include/asm/unistd_64.h
>> +++ b/arch/x86/include/asm/unistd_64.h
>> @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
>> __SYSCALL(__NR_pipe2, sys_pipe2)
>> #define __NR_inotify_init1 294
>> __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
>> +#define __NR_waitfd 295
>> +__SYSCALL(__NR_waitfd, sys_waitfd)
>>
>>
>
> Only for x86??
>
>
At the moment. I should have mentioned this earlier but I haven't made
the syscall table entries for archs I don't test on. That will change
once the rest of the change has settled out.
>> #ifndef __NO_STUBS
>> diff --git a/arch/x86/kernel/syscall_table_32.S
>> b/arch/x86/kernel/syscall_table_32.S
>> index d44395f..c796a8b 100644
>> --- a/arch/x86/kernel/syscall_table_32.S
>> +++ b/arch/x86/kernel/syscall_table_32.S
>> @@ -332,3 +332,4 @@ ENTRY(sys_call_table)
>> .long sys_dup3 /* 330 */
>> .long sys_pipe2
>> .long sys_inotify_init1
>> + .long sys_waitfd
>> diff --git a/fs/Makefile b/fs/Makefile
>> index d9f8afe..74c31fb 100644
>> --- a/fs/Makefile
>> +++ b/fs/Makefile
>> @@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
>> obj-$(CONFIG_EPOLL) += eventpoll.o
>> obj-$(CONFIG_ANON_INODES) += anon_inodes.o
>> obj-$(CONFIG_SIGNALFD) += signalfd.o
>> +obj-$(CONFIG_WAITFD) += waitfd.o
>> obj-$(CONFIG_TIMERFD) += timerfd.o
>> obj-$(CONFIG_EVENTFD) += eventfd.o
>> obj-$(CONFIG_AIO) += aio.o
>> diff --git a/fs/waitfd.c b/fs/waitfd.c
>> new file mode 100644
>> index 0000000..0155a83
>> --- /dev/null
>> +++ b/fs/waitfd.c
>> @@ -0,0 +1,117 @@
>> +/*
>> + * fs/waitfd.c
>> + *
>> + * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
>> + *
>> + * Largely derived from fs/signalfd.c
>> + */
>> +
>> +#include <linux/file.h>
>> +#include <linux/poll.h>
>> +#include <linux/init.h>
>> +#include <linux/fs.h>
>> +#include <linux/sched.h>
>> +#include <linux/kernel.h>
>> +#include <linux/signal.h>
>> +#include <linux/list.h>
>> +#include <linux/anon_inodes.h>
>> +#include <linux/syscalls.h>
>> +
>> +long do_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru);
>> +
>> +struct waitfd_ctx {
>> + int ops;
>> + int which;
>> + pid_t upid;
>> +};
>> +
>>
>
> Please use kernel coding style: use tabs to indent, not <lots-of-spaces>,
> and struct members, functions, etc., are indented by one tab stop minimum.
>
>
Damnit. This is a mailer artifact. This is the first time thunderbird
has eaten a patch on me. I'll look in to it.
>> +static int waitfd_release(struct inode *inode, struct file *file)
>> +{
>> + kfree(file->private_data);
>> + return 0;
>> +}
>> +
>> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + long value;
>> +
>> + poll_wait(file, ¤t->signal->wait_chldexit, wait);
>> +
>> + value = do_waitid(ctx->which, ctx->upid, NULL,
>> + ctx->ops | WNOHANG | WNOWAIT, NULL);
>> + if (value > 0 || value == -ECHILD)
>> + return POLLIN;
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> + * Returns a multiple of the size of a struct siginfo, or a negative
>> + * error code. The "count" parameter must be at least sizeof(struct
>> siginfo)
>> + */
>> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t
>> count,
>> + loff_t *ppos)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + struct siginfo __user *info_addr = (struct siginfo *)buf;
>> + int flags = ctx->ops;
>> + ssize_t ret, total = 0;
>> +
>> + count /= sizeof(struct siginfo);
>> + if (!count)
>> + return -EINVAL;
>> +
>> + do {
>> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
>> + if (ret == 0)
>> + ret = -EAGAIN;
>> + if (ret == -ECHILD)
>> + ret = 0;
>> + if (ret <= 0)
>> + break;
>> +
>> + info_addr++;
>> + total += sizeof(struct siginfo);
>> + flags |= WNOHANG;
>> + } while (--count);
>> +
>> + return total ? total: ret;
>>
>
> return total ? total : ret;
> please.
>
>
I actually saw this used elsewhere in the kernel. Assumed I'd missed it
in the style guidelines :)
>> +}
>> +
>> +static const struct file_operations waitfd_fops = {
>> + .release = waitfd_release,
>> + .poll = waitfd_poll,
>> + .read = waitfd_read,
>> +};
>> +
>> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
>> +{
>> + int ufd;
>> + struct waitfd_ctx *ctx;
>> +
>> + /* Just to make sure we don't end up with a sys_waitfd4 */
>> + (void)unused;
>> +
>> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
>> + return -EINVAL;
>> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
>> + return -EINVAL;
>>
>
> Use spaces around '|'.
>
>
Those 4 lines are copied almost exactly from kernel/exit.c. Is there
motivation to keep them consistent?
>> +
>> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
>> + if (!ctx)
>> + return -ENOMEM;
>> +
>> + ctx->ops = options;
>> + ctx->upid = upid;
>> + ctx->which = which;
>> +
>> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
>> + (options & WNOHANG) ? O_NONBLOCK : 0);
>> + if (ufd < 0)
>> + kfree(ctx);
>> +
>> + return ufd;
>> +}
>> diff --git a/init/Kconfig b/init/Kconfig
>> index f763762..bc34871 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -683,6 +683,16 @@ config EPOLL
>> Disabling this option will cause the kernel to be built without
>> support for epoll family of system calls.
>>
>> +config WAITFD
>> + bool "Enable waitfd() system call" if EMBEDDED
>> + select ANON_INODES
>> + default y
>> + help
>> + Enable the waitfd() system call that allows receving child state
>>
>
> receiving
>
> Kconfig help text should be indented by <tab><space><space>.
>
>
Likely more of the mailer eating the patch.
>> + changes from a file descriptor.
>> +
>> + If unsure, say Y.
>> +
>> config SIGNALFD
>> bool "Enable signalfd() system call" if EMBEDDED
>> select ANON_INODES
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index 2d8be7e..b53e8ba 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct
>> task_struct *p, pid_t pid, uid_t uid,
>> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
>>
>> put_task_struct(p);
>> - if (!retval)
>> - retval = put_user(SIGCHLD, &infop->si_signo);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_errno);
>> - if (!retval)
>> - retval = put_user((short)why, &infop->si_code);
>> - if (!retval)
>> - retval = put_user(pid, &infop->si_pid);
>> - if (!retval)
>> - retval = put_user(uid, &infop->si_uid);
>> - if (!retval)
>> - retval = put_user(status, &infop->si_status);
>> + if (infop) {
>> + if (!retval)
>> + retval = put_user(SIGCHLD, &infop->si_signo);
>> + if (!retval)
>> + retval = put_user(0, &infop->si_errno);
>> + if (!retval)
>> + retval = put_user((short)why, &infop->si_code);
>> + if (!retval)
>> + retval = put_user(pid, &infop->si_pid);
>> + if (!retval)
>> + retval = put_user(uid, &infop->si_uid);
>> + if (!retval)
>> + retval = put_user(status, &infop->si_status);
>> + }
>> if (!retval)
>> retval = pid;
>> return retval;
>> @@ -1727,35 +1729,12 @@ repeat:
>> end:
>> current->state = TASK_RUNNING;
>> remove_wait_queue(¤t->signal->wait_chldexit,&wait);
>> - if (infop) {
>> - if (retval > 0)
>> - retval = 0;
>> - else {
>> - /*
>> - * For a WNOHANG return, clear out all the fields
>> - * we would set so the user can easily tell the
>> - * difference.
>> - */
>> - if (!retval)
>> - retval = put_user(0, &infop->si_signo);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_errno);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_code);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_pid);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_uid);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_status);
>> - }
>> - }
>> return retval;
>> }
>>
>> -asmlinkage long sys_waitid(int which, pid_t upid,
>> - struct siginfo __user *infop, int options,
>> - struct rusage __user *ru)
>> +long do_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru)
>> {
>> struct pid *pid = NULL;
>> enum pid_type type;
>> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
>> ret = do_wait(type, pid, options, infop, NULL, ru);
>> put_pid(pid);
>>
>> + return ret;
>> +}
>> +
>> +asmlinkage long sys_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru)
>> +{
>> + long ret;
>> +
>> + ret = do_waitid(which, upid, infop, options, ru);
>> +
>> + if (ret > 0)
>> + ret = 0;
>> + else {
>> + /*
>> + * For a WNOHANG return, clear out all the fields
>> + * we would set so the user can easily tell the
>> + * difference.
>> + */
>> + if (!ret)
>> + ret = put_user(0, &infop->si_signo);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_errno);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_code);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_pid);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_uid);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_status);
>> + }
>> +
>> /* avoid REGPARM breakage on x86: */
>> asmlinkage_protect(5, ret, which, upid, infop, options, ru);
>> return ret;
>> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> index e14a232..e8d4da6 100644
>> --- a/kernel/sys_ni.c
>> +++ b/kernel/sys_ni.c
>> @@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
>> cond_syscall(sys_ioprio_get);
>>
>> /* New file descriptors */
>> +cond_syscall(sys_waitfd);
>> cond_syscall(sys_signalfd);
>> cond_syscall(sys_signalfd4);
>> cond_syscall(compat_sys_signalfd);
>>
Thanks for the review.
--CJD
Casey Dahlin <[email protected]> writes:
> Linux now exposes signals, timers, and events via file descriptors
> through signalfd, timerfd, and eventfd. This means programmers can use
> a single select/[e]poll call to monitor all change in their
> program. This patch aims to expose child death via the same mechanism.
>
> waitfd provides a file descriptor out of which may be read a series of
> siginfo_t objects describing child death. A child process is reaped as
> soon as its information is read. This means child monitoring too can
> be performed with that same poll call.
>
> Patch is against v2.6.28
Can you please provide a manpage with the exact proposed semantics of
the interface.
Without that it's very hard to evaluate if the interface makes sense
or not.
In general a single paragraph is not enough to justify a new system
call.
-Andi
--
[email protected]
Casey Dahlin wrote:
> Randy Dunlap wrote:
>> Casey Dahlin wrote:
>>
>>> Linux now exposes signals, timers, and events via file descriptors
>>> through signalfd, timerfd, and eventfd. This means programmers can use a
>>> single select/[e]poll call to monitor all change in their program. This
>>> patch aims to expose child death via the same mechanism.
>>>
>>> waitfd provides a file descriptor out of which may be read a series of
>>> siginfo_t objects describing child death. A child process is reaped as
>>> soon as its information is read. This means child monitoring too can be
>>> performed with that same poll call.
>>>
>>> Patch is against v2.6.28
>>>
>>> --CJD
>>>
>>> diff --git a/arch/x86/include/asm/unistd_32.h
>>> b/arch/x86/include/asm/unistd_32.h
>>> index f2bba78..134d83c 100644
>>> --- a/arch/x86/include/asm/unistd_32.h
>>> +++ b/arch/x86/include/asm/unistd_32.h
>>> @@ -338,6 +338,7 @@
>>> #define __NR_dup3 330
>>> #define __NR_pipe2 331
>>> #define __NR_inotify_init1 332
>>> +#define __NR_waitfd 333
>>>
>>> #ifdef __KERNEL__
>>>
>>> diff --git a/arch/x86/include/asm/unistd_64.h
>>> b/arch/x86/include/asm/unistd_64.h
>>> index d2e415e..b28eb07 100644
>>> --- a/arch/x86/include/asm/unistd_64.h
>>> +++ b/arch/x86/include/asm/unistd_64.h
>>> @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
>>> __SYSCALL(__NR_pipe2, sys_pipe2)
>>> #define __NR_inotify_init1 294
>>> __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
>>> +#define __NR_waitfd 295
>>> +__SYSCALL(__NR_waitfd, sys_waitfd)
>>>
>>>
>>
>> Only for x86??
>>
>>
>
> At the moment. I should have mentioned this earlier but I haven't made
> the syscall table entries for archs I don't test on. That will change
> once the rest of the change has settled out.
>>> diff --git a/fs/waitfd.c b/fs/waitfd.c
>>> new file mode 100644
>>> index 0000000..0155a83
>>> --- /dev/null
>>> +++ b/fs/waitfd.c
>>> @@ -0,0 +1,117 @@
>>> +/*
>>> + * fs/waitfd.c
>>> + *
>>> + * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
>>> + *
>>> + * Largely derived from fs/signalfd.c
>>> + */
>>> +
>>> +#include <linux/file.h>
>>> +#include <linux/poll.h>
>>> +#include <linux/init.h>
>>> +#include <linux/fs.h>
>>> +#include <linux/sched.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/signal.h>
>>> +#include <linux/list.h>
>>> +#include <linux/anon_inodes.h>
>>> +#include <linux/syscalls.h>
>>> +
>>> +long do_waitid(int which, pid_t upid,
>>> + struct siginfo __user *infop, int options,
>>> + struct rusage __user *ru);
>>> +
>>> +struct waitfd_ctx {
>>> + int ops;
>>> + int which;
>>> + pid_t upid;
>>> +};
>>> +
>>>
>>
>> Please use kernel coding style: use tabs to indent, not
>> <lots-of-spaces>,
>> and struct members, functions, etc., are indented by one tab stop
>> minimum.
>>
>>
>
> Damnit. This is a mailer artifact. This is the first time thunderbird
> has eaten a patch on me. I'll look in to it.
OK.
You can see if Documentation/email-clients.txt helps you any.
>>> +}
>>> +
>>> +static const struct file_operations waitfd_fops = {
>>> + .release = waitfd_release,
>>> + .poll = waitfd_poll,
>>> + .read = waitfd_read,
>>> +};
>>> +
>>> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int
>>> unused)
>>> +{
>>> + int ufd;
>>> + struct waitfd_ctx *ctx;
>>> +
>>> + /* Just to make sure we don't end up with a sys_waitfd4 */
>>> + (void)unused;
>>> +
>>> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
>>> + return -EINVAL;
>>> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
>>> + return -EINVAL;
>>>
>>
>> Use spaces around '|'.
>>
>>
>
> Those 4 lines are copied almost exactly from kernel/exit.c. Is there
> motivation to keep them consistent?
I would say no.
--
~Randy
Same as last time, but the formatting should be right now.
Original description:
Linux now exposes signals, timers, and events via file descriptors through signalfd, timerfd, and eventfd. This means programmers can use a single select/[e]poll call to monitor all change in their program. This patch aims to expose child death via the same mechanism.
waitfd provides a file descriptor out of which may be read a series of siginfo_t objects describing child death. A child process is reaped as soon as its information is read. This means child monitoring too can be performed with that same poll call.
Patch is against v2.6.28
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78..134d83c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_waitfd 333
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e..b28eb07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_waitfd 295
+__SYSCALL(__NR_waitfd, sys_waitfd)
#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395f..c796a8b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_waitfd
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..74c31fb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
+obj-$(CONFIG_WAITFD) += waitfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644
index 0000000..0155a83
--- /dev/null
+++ b/fs/waitfd.c
@@ -0,0 +1,117 @@
+/*
+ * fs/waitfd.c
+ *
+ * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
+ *
+ * Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru);
+
+struct waitfd_ctx {
+ int ops;
+ int which;
+ pid_t upid;
+};
+
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ long value;
+
+ poll_wait(file, ¤t->signal->wait_chldexit, wait);
+
+ value = do_waitid(ctx->which, ctx->upid, NULL,
+ ctx->ops | WNOHANG | WNOWAIT, NULL);
+ if (value > 0 || value == -ECHILD)
+ return POLLIN;
+
+ return 0;
+}
+
+/*
+ * Returns a multiple of the size of a struct siginfo, or a negative
+ * error code. The "count" parameter must be at least sizeof(struct siginfo)
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ struct siginfo __user *info_addr = (struct siginfo *)buf;
+ int flags = ctx->ops;
+ ssize_t ret, total = 0;
+
+ count /= sizeof(struct siginfo);
+ if (!count)
+ return -EINVAL;
+
+ do {
+ ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+ if (ret == 0)
+ ret = -EAGAIN;
+ if (ret == -ECHILD)
+ ret = 0;
+ if (ret <= 0)
+ break;
+
+ info_addr++;
+ total += sizeof(struct siginfo);
+ flags |= WNOHANG;
+ } while (--count);
+
+ return total ? total: ret;
+}
+
+static const struct file_operations waitfd_fops = {
+ .release = waitfd_release,
+ .poll = waitfd_poll,
+ .read = waitfd_read,
+};
+
+asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
+{
+ int ufd;
+ struct waitfd_ctx *ctx;
+
+ /* Just to make sure we don't end up with a sys_waitfd4 */
+ (void)unused;
+
+ if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = options;
+ ctx->upid = upid;
+ ctx->which = which;
+
+ ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
+ (options & WNOHANG) ? O_NONBLOCK : 0);
+ if (ufd < 0)
+ kfree(ctx);
+
+ return ufd;
+}
diff --git a/init/Kconfig b/init/Kconfig
index f763762..bc34871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config EPOLL
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.
+config WAITFD
+ bool "Enable waitfd() system call" if EMBEDDED
+ select ANON_INODES
+ default y
+ help
+ Enable the waitfd() system call that allows receving child state
+ changes from a file descriptor.
+
+ If unsure, say Y.
+
config SIGNALFD
bool "Enable signalfd() system call" if EMBEDDED
select ANON_INODES
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..b53e8ba 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
put_task_struct(p);
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
+ if (infop) {
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
if (!retval)
retval = pid;
return retval;
@@ -1727,35 +1729,12 @@ repeat:
end:
current->state = TASK_RUNNING;
remove_wait_queue(¤t->signal->wait_chldexit,&wait);
- if (infop) {
- if (retval > 0)
- retval = 0;
- else {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!retval)
- retval = put_user(0, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user(0, &infop->si_code);
- if (!retval)
- retval = put_user(0, &infop->si_pid);
- if (!retval)
- retval = put_user(0, &infop->si_uid);
- if (!retval)
- retval = put_user(0, &infop->si_status);
- }
- }
return retval;
}
-asmlinkage long sys_waitid(int which, pid_t upid,
- struct siginfo __user *infop, int options,
- struct rusage __user *ru)
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
{
struct pid *pid = NULL;
enum pid_type type;
@@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
ret = do_wait(type, pid, options, infop, NULL, ru);
put_pid(pid);
+ return ret;
+}
+
+asmlinkage long sys_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
+{
+ long ret;
+
+ ret = do_waitid(which, upid, infop, options, ru);
+
+ if (ret > 0)
+ ret = 0;
+ else {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!ret)
+ ret = put_user(0, &infop->si_signo);
+ if (!ret)
+ ret = put_user(0, &infop->si_errno);
+ if (!ret)
+ ret = put_user(0, &infop->si_code);
+ if (!ret)
+ ret = put_user(0, &infop->si_pid);
+ if (!ret)
+ ret = put_user(0, &infop->si_uid);
+ if (!ret)
+ ret = put_user(0, &infop->si_status);
+ }
+
/* avoid REGPARM breakage on x86: */
asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..e8d4da6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
cond_syscall(sys_ioprio_get);
/* New file descriptors */
+cond_syscall(sys_waitfd);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);
(Cc:-ed a few more folks who might be interested in this)
* Casey Dahlin <[email protected]> wrote:
> Original description:
>
> Linux now exposes signals, timers, and events via file descriptors
> through signalfd, timerfd, and eventfd. This means programmers can use a
> single select/[e]poll call to monitor all change in their program. This
> patch aims to expose child death via the same mechanism.
>
> waitfd provides a file descriptor out of which may be read a series of
> siginfo_t objects describing child death. A child process is reaped as
> soon as its information is read. This means child monitoring too can be
> performed with that same poll call.
>
> Patch is against v2.6.28
The principle looks sound and acceptable - to complete the epoll mechanism
to all event sources in the system. There's a few small (mostly stylistic)
details though:
> index 0000000..0155a83
> --- /dev/null
> +++ b/fs/waitfd.c
> +#include <linux/anon_inodes.h>
> +#include <linux/syscalls.h>
> +
> +long do_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru);
please move this prototype into sched.h.
> +struct waitfd_ctx {
> + int ops;
> + int which;
> + pid_t upid;
> +};
please align structure fields vertically, similarly to how you aligned
other definitions in your patch. Something like:
> +struct waitfd_ctx {
> + int ops;
> + int which;
> + pid_t upid;
> +};
(otherwise it looks all a bit too crammed together)
> +static int waitfd_release(struct inode *inode, struct file *file)
> +{
> + kfree(file->private_data);
> + return 0;
> +}
> +
> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + long value;
> +
> + poll_wait(file, ¤t->signal->wait_chldexit, wait);
> +
> + value = do_waitid(ctx->which, ctx->upid, NULL,
> + ctx->ops | WNOHANG | WNOWAIT, NULL);
> + if (value > 0 || value == -ECHILD)
> + return POLLIN;
> +
> + return 0;
> +}
> +
> +/*
> + * Returns a multiple of the size of a struct siginfo, or a negative
> + * error code. The "count" parameter must be at least sizeof(struct siginfo)
> + */
> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
> + loff_t *ppos)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + struct siginfo __user *info_addr = (struct siginfo *)buf;
> + int flags = ctx->ops;
> + ssize_t ret, total = 0;
> +
> + count /= sizeof(struct siginfo);
> + if (!count)
> + return -EINVAL;
> +
> + do {
> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
> + if (ret == 0)
> + ret = -EAGAIN;
> + if (ret == -ECHILD)
> + ret = 0;
> + if (ret <= 0)
> + break;
> +
> + info_addr++;
> + total += sizeof(struct siginfo);
> + flags |= WNOHANG;
> + } while (--count);
> +
> + return total ? total: ret;
please use symmetric spacing, i.e.:
> + return total ? total : ret;
> +}
> +
> +static const struct file_operations waitfd_fops = {
> + .release = waitfd_release,
> + .poll = waitfd_poll,
> + .read = waitfd_read,
> +};
> +
> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
> +{
> + int ufd;
> + struct waitfd_ctx *ctx;
> +
> + /* Just to make sure we don't end up with a sys_waitfd4 */
> + (void)unused;
looks a bit silly ...
> +
> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
> + return -EINVAL;
> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
> + return -EINVAL;
> +
> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> + if (!ctx)
> + return -ENOMEM;
> +
> + ctx->ops = options;
> + ctx->upid = upid;
> + ctx->which = which;
> +
> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
> + (options & WNOHANG) ? O_NONBLOCK : 0);
> + if (ufd < 0)
> + kfree(ctx);
> +
> + return ufd;
> +}
> diff --git a/init/Kconfig b/init/Kconfig
> index f763762..bc34871 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -683,6 +683,16 @@ config EPOLL
> Disabling this option will cause the kernel to be built without
> support for epoll family of system calls.
>
> +config WAITFD
> + bool "Enable waitfd() system call" if EMBEDDED
> + select ANON_INODES
> + default y
> + help
> + Enable the waitfd() system call that allows receving child state
typo.
> + changes from a file descriptor.
> +
> + If unsure, say Y.
> +
> config SIGNALFD
> bool "Enable signalfd() system call" if EMBEDDED
> select ANON_INODES
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 2d8be7e..b53e8ba 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
>
> put_task_struct(p);
> - if (!retval)
> - retval = put_user(SIGCHLD, &infop->si_signo);
> - if (!retval)
> - retval = put_user(0, &infop->si_errno);
> - if (!retval)
> - retval = put_user((short)why, &infop->si_code);
> - if (!retval)
> - retval = put_user(pid, &infop->si_pid);
> - if (!retval)
> - retval = put_user(uid, &infop->si_uid);
> - if (!retval)
> - retval = put_user(status, &infop->si_status);
> + if (infop) {
> + if (!retval)
> + retval = put_user(SIGCHLD, &infop->si_signo);
> + if (!retval)
> + retval = put_user(0, &infop->si_errno);
> + if (!retval)
> + retval = put_user((short)why, &infop->si_code);
> + if (!retval)
> + retval = put_user(pid, &infop->si_pid);
> + if (!retval)
> + retval = put_user(uid, &infop->si_uid);
> + if (!retval)
> + retval = put_user(status, &infop->si_status);
> + }
> if (!retval)
> retval = pid;
> return retval;
> @@ -1727,35 +1729,12 @@ repeat:
> end:
> current->state = TASK_RUNNING;
> remove_wait_queue(¤t->signal->wait_chldexit,&wait);
> - if (infop) {
> - if (retval > 0)
> - retval = 0;
> - else {
> - /*
> - * For a WNOHANG return, clear out all the fields
> - * we would set so the user can easily tell the
> - * difference.
> - */
> - if (!retval)
> - retval = put_user(0, &infop->si_signo);
> - if (!retval)
> - retval = put_user(0, &infop->si_errno);
> - if (!retval)
> - retval = put_user(0, &infop->si_code);
> - if (!retval)
> - retval = put_user(0, &infop->si_pid);
> - if (!retval)
> - retval = put_user(0, &infop->si_uid);
> - if (!retval)
> - retval = put_user(0, &infop->si_status);
> - }
> - }
> return retval;
> }
>
> -asmlinkage long sys_waitid(int which, pid_t upid,
> - struct siginfo __user *infop, int options,
> - struct rusage __user *ru)
> +long do_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru)
> {
> struct pid *pid = NULL;
> enum pid_type type;
> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
> ret = do_wait(type, pid, options, infop, NULL, ru);
> put_pid(pid);
>
> + return ret;
> +}
> +
> +asmlinkage long sys_waitid(int which, pid_t upid,
> + struct siginfo __user *infop, int options,
> + struct rusage __user *ru)
> +{
> + long ret;
> +
> + ret = do_waitid(which, upid, infop, options, ru);
> +
> + if (ret > 0)
> + ret = 0;
> + else {
> + /*
> + * For a WNOHANG return, clear out all the fields
> + * we would set so the user can easily tell the
> + * difference.
> + */
> + if (!ret)
> + ret = put_user(0, &infop->si_signo);
> + if (!ret)
> + ret = put_user(0, &infop->si_errno);
> + if (!ret)
> + ret = put_user(0, &infop->si_code);
> + if (!ret)
> + ret = put_user(0, &infop->si_pid);
> + if (!ret)
> + ret = put_user(0, &infop->si_uid);
> + if (!ret)
> + ret = put_user(0, &infop->si_status);
even if this just moves existing code around, if we touch this, it would
be far cleaner (and faster as well) to do what other bits of the signal
code do:
> + ret = put_user(0, &infop->si_signo);
> + ret |= put_user(0, &infop->si_errno);
> + ret |= put_user(0, &infop->si_code);
> + ret |= put_user(0, &infop->si_pid);
> + ret |= put_user(0, &infop->si_uid);
> + ret |= put_user(0, &infop->si_status);
Since put_user() can only return -EFAULT or zero.
(same for wait_noreap_copyout())
Ingo
Ingo Molnar wrote:
> (Cc:-ed a few more folks who might be interested in this)
>
> * Casey Dahlin <[email protected]> wrote:
>
>> Original description:
>>
>> Linux now exposes signals, timers, and events via file descriptors
>> through signalfd, timerfd, and eventfd. This means programmers can use a
>> single select/[e]poll call to monitor all change in their program. This
>> patch aims to expose child death via the same mechanism.
>>
>> waitfd provides a file descriptor out of which may be read a series of
>> siginfo_t objects describing child death. A child process is reaped as
>> soon as its information is read. This means child monitoring too can be
>> performed with that same poll call.
>>
>> Patch is against v2.6.28
>
> The principle looks sound and acceptable - to complete the epoll mechanism
> to all event sources in the system. There's a few small (mostly stylistic)
> details though:
>
>> index 0000000..0155a83
>> --- /dev/null
>> +++ b/fs/waitfd.c
>
>> +#include <linux/anon_inodes.h>
>> +#include <linux/syscalls.h>
>> +
>> +long do_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru);
>
> please move this prototype into sched.h.
>
Sure thing. Knew it shouldn't be here, just didn't know where it went. I'm surprised you're the first to have complained :)
>> +struct waitfd_ctx {
>> + int ops;
>> + int which;
>> + pid_t upid;
>> +};
>
> please align structure fields vertically, similarly to how you aligned
> other definitions in your patch. Something like:
>
>> +struct waitfd_ctx {
>> + int ops;
>> + int which;
>> + pid_t upid;
>> +};
>
> (otherwise it looks all a bit too crammed together)
>
>> +static int waitfd_release(struct inode *inode, struct file *file)
>> +{
>> + kfree(file->private_data);
>> + return 0;
>> +}
>> +
>> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + long value;
>> +
>> + poll_wait(file, ¤t->signal->wait_chldexit, wait);
>> +
>> + value = do_waitid(ctx->which, ctx->upid, NULL,
>> + ctx->ops | WNOHANG | WNOWAIT, NULL);
>> + if (value > 0 || value == -ECHILD)
>> + return POLLIN;
>> +
>> + return 0;
>> +}
>> +
>> +/*
>> + * Returns a multiple of the size of a struct siginfo, or a negative
>> + * error code. The "count" parameter must be at least sizeof(struct siginfo)
>> + */
>> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
>> + loff_t *ppos)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + struct siginfo __user *info_addr = (struct siginfo *)buf;
>> + int flags = ctx->ops;
>> + ssize_t ret, total = 0;
>> +
>> + count /= sizeof(struct siginfo);
>> + if (!count)
>> + return -EINVAL;
>> +
>> + do {
>> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
>> + if (ret == 0)
>> + ret = -EAGAIN;
>> + if (ret == -ECHILD)
>> + ret = 0;
>> + if (ret <= 0)
>> + break;
>> +
>> + info_addr++;
>> + total += sizeof(struct siginfo);
>> + flags |= WNOHANG;
>> + } while (--count);
>> +
>> + return total ? total: ret;
>
> please use symmetric spacing, i.e.:
>
>> + return total ? total : ret;
>
>> +}
>> +
>> +static const struct file_operations waitfd_fops = {
>> + .release = waitfd_release,
>> + .poll = waitfd_poll,
>> + .read = waitfd_read,
>> +};
>> +
>> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
>> +{
>> + int ufd;
>> + struct waitfd_ctx *ctx;
>> +
>> + /* Just to make sure we don't end up with a sys_waitfd4 */
>> + (void)unused;
>
> looks a bit silly ...
Do you mean the principle of having an unused argument around for future use or the cast to void? The cast to void is there to suppress the "Waning: unused argument" messages and make gcc happy.
>> +
>> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
>> + return -EINVAL;
>> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
>> + return -EINVAL;
>> +
>> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
>> + if (!ctx)
>> + return -ENOMEM;
>> +
>> + ctx->ops = options;
>> + ctx->upid = upid;
>> + ctx->which = which;
>> +
>> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
>> + (options & WNOHANG) ? O_NONBLOCK : 0);
>> + if (ufd < 0)
>> + kfree(ctx);
>> +
>> + return ufd;
>> +}
>> diff --git a/init/Kconfig b/init/Kconfig
>> index f763762..bc34871 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -683,6 +683,16 @@ config EPOLL
>> Disabling this option will cause the kernel to be built without
>> support for epoll family of system calls.
>>
>> +config WAITFD
>> + bool "Enable waitfd() system call" if EMBEDDED
>> + select ANON_INODES
>> + default y
>> + help
>> + Enable the waitfd() system call that allows receving child state
>
> typo.
>
>> + changes from a file descriptor.
>> +
>> + If unsure, say Y.
>> +
>> config SIGNALFD
>> bool "Enable signalfd() system call" if EMBEDDED
>> select ANON_INODES
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index 2d8be7e..b53e8ba 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
>> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
>>
>> put_task_struct(p);
>> - if (!retval)
>> - retval = put_user(SIGCHLD, &infop->si_signo);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_errno);
>> - if (!retval)
>> - retval = put_user((short)why, &infop->si_code);
>> - if (!retval)
>> - retval = put_user(pid, &infop->si_pid);
>> - if (!retval)
>> - retval = put_user(uid, &infop->si_uid);
>> - if (!retval)
>> - retval = put_user(status, &infop->si_status);
>> + if (infop) {
>> + if (!retval)
>> + retval = put_user(SIGCHLD, &infop->si_signo);
>> + if (!retval)
>> + retval = put_user(0, &infop->si_errno);
>> + if (!retval)
>> + retval = put_user((short)why, &infop->si_code);
>> + if (!retval)
>> + retval = put_user(pid, &infop->si_pid);
>> + if (!retval)
>> + retval = put_user(uid, &infop->si_uid);
>> + if (!retval)
>> + retval = put_user(status, &infop->si_status);
>> + }
>> if (!retval)
>> retval = pid;
>> return retval;
>> @@ -1727,35 +1729,12 @@ repeat:
>> end:
>> current->state = TASK_RUNNING;
>> remove_wait_queue(¤t->signal->wait_chldexit,&wait);
>> - if (infop) {
>> - if (retval > 0)
>> - retval = 0;
>> - else {
>> - /*
>> - * For a WNOHANG return, clear out all the fields
>> - * we would set so the user can easily tell the
>> - * difference.
>> - */
>> - if (!retval)
>> - retval = put_user(0, &infop->si_signo);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_errno);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_code);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_pid);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_uid);
>> - if (!retval)
>> - retval = put_user(0, &infop->si_status);
>> - }
>> - }
>> return retval;
>> }
>>
>> -asmlinkage long sys_waitid(int which, pid_t upid,
>> - struct siginfo __user *infop, int options,
>> - struct rusage __user *ru)
>> +long do_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru)
>> {
>> struct pid *pid = NULL;
>> enum pid_type type;
>> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
>> ret = do_wait(type, pid, options, infop, NULL, ru);
>> put_pid(pid);
>>
>> + return ret;
>> +}
>> +
>> +asmlinkage long sys_waitid(int which, pid_t upid,
>> + struct siginfo __user *infop, int options,
>> + struct rusage __user *ru)
>> +{
>> + long ret;
>> +
>> + ret = do_waitid(which, upid, infop, options, ru);
>> +
>> + if (ret > 0)
>> + ret = 0;
>> + else {
>> + /*
>> + * For a WNOHANG return, clear out all the fields
>> + * we would set so the user can easily tell the
>> + * difference.
>> + */
>> + if (!ret)
>> + ret = put_user(0, &infop->si_signo);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_errno);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_code);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_pid);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_uid);
>> + if (!ret)
>> + ret = put_user(0, &infop->si_status);
>
> even if this just moves existing code around, if we touch this, it would
> be far cleaner (and faster as well) to do what other bits of the signal
> code do:
>
>> + ret = put_user(0, &infop->si_signo);
>> + ret |= put_user(0, &infop->si_errno);
>> + ret |= put_user(0, &infop->si_code);
>> + ret |= put_user(0, &infop->si_pid);
>> + ret |= put_user(0, &infop->si_uid);
>> + ret |= put_user(0, &infop->si_status);
>
> Since put_user() can only return -EFAULT or zero.
>
> (same for wait_noreap_copyout())
>
I'll change this in both of those places. I'm hoping to find time at some point to submit some general cleanups to do_wait. There's a lot about it I don't like.
--CJD
* Casey Dahlin <[email protected]> wrote:
> >> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
> >> +{
> >> + int ufd;
> >> + struct waitfd_ctx *ctx;
> >> +
> >> + /* Just to make sure we don't end up with a sys_waitfd4 */
> >> + (void)unused;
> >
> > looks a bit silly ...
>
>
> Do you mean the principle of having an unused argument around for future
> use or the cast to void? The cast to void is there to suppress the
> "Waning: unused argument" messages and make gcc happy.
gcc will not warn about unused function arguments - only about unused
local variables. The 'unused' argument should either be removed
altogether, or replaced with a properly named parameter and a check
returning -ENOSYS if the argument is not zero (or something like that).
(It's generally better to keep such syscalls extensible via such trivial
means, if there's a remote chance for ever needing to extend that
syscall.)
Ingo
On 01/07, Ingo Molnar wrote:
>
> (Cc:-ed a few more folks who might be interested in this)
>
> * Casey Dahlin <[email protected]> wrote:
>
> > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
> > +{
> > + int ufd;
> > + struct waitfd_ctx *ctx;
> > +
> > + /* Just to make sure we don't end up with a sys_waitfd4 */
> > + (void)unused;
>
> looks a bit silly ...
>
> > +
> > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
> > + return -EINVAL;
> > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
> > + return -EINVAL;
> > +
> > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> > + if (!ctx)
> > + return -ENOMEM;
> > +
> > + ctx->ops = options;
> > + ctx->upid = upid;
> > + ctx->which = which;
> > +
> > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
> > + (options & WNOHANG) ? O_NONBLOCK : 0);
minor nit...
Please note that unlike other sys_...fd() syscalls, sys_waitfd()
doesn't allow to pass O_CLOEXEC. Looks like we need a separate
"flags" argument...
Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
waitfd, not very good.
I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
(->f_flags & O_NONBLOCK) as WNOHANG.
(can't resist, ->ops is not the best name ;)
Oleg.
* Oleg Nesterov <[email protected]> wrote:
> On 01/07, Ingo Molnar wrote:
> >
> > (Cc:-ed a few more folks who might be interested in this)
> >
> > * Casey Dahlin <[email protected]> wrote:
> >
> > > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
> > > +{
> > > + int ufd;
> > > + struct waitfd_ctx *ctx;
> > > +
> > > + /* Just to make sure we don't end up with a sys_waitfd4 */
> > > + (void)unused;
> >
> > looks a bit silly ...
> >
> > > +
> > > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
> > > + return -EINVAL;
> > > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
> > > + return -EINVAL;
> > > +
> > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> > > + if (!ctx)
> > > + return -ENOMEM;
> > > +
> > > + ctx->ops = options;
> > > + ctx->upid = upid;
> > > + ctx->which = which;
> > > +
> > > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
> > > + (options & WNOHANG) ? O_NONBLOCK : 0);
>
> minor nit...
>
> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
> "flags" argument...
>
> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
> waitfd, not very good.
>
> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
> (->f_flags & O_NONBLOCK) as WNOHANG.
>
> (can't resist, ->ops is not the best name ;)
yeah, ->ops should be ->options. The name ->ops is generally use for
method vectors and so.
Ingo
On Wed, 7 Jan 2009, Oleg Nesterov wrote:
> minor nit...
>
> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
> "flags" argument...
I said this even during the first post. Not a minor nit unless we want to
have a sys_waitfd2 soon after.
> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
> waitfd, not very good.
Ack.
> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
> (->f_flags & O_NONBLOCK) as WNOHANG.
Ack.
> (can't resist, ->ops is not the best name ;)
Ack. "flags" looks a pretty nice shot at it.
- Davide
Davide Libenzi wrote:
> On Wed, 7 Jan 2009, Oleg Nesterov wrote:
>
>> minor nit...
>>
>> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
>> "flags" argument...
>
> I said this even during the first post. Not a minor nit unless we want to
> have a sys_waitfd2 soon after.
>
I don't think I properly understood the complaint last time you made it.
Fix on the way
--CJD
New syscall should have gone to linux-api, I think.
Do we really need another one for this? How about using signalfd plus
setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
SIGCHLD? It's slightly more magical for the userland process to know to do
that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
really have to add, maybe better.
Thanks,
Roland
* Roland McGrath <[email protected]> wrote:
> New syscall should have gone to linux-api, I think.
>
> Do we really need another one for this? How about using signalfd plus
> setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead
> of SIGCHLD? It's slightly more magical for the userland process to know
> to do that (fork -> clone SIGRTMIN). But compared to adding a syscall
> we don't really have to add, maybe better.
hm, i think it's cleaner conceptually than trying to wrap this into
signalfd. Since we already have:
#define __NR_signalfd 321
#define __NR_timerfd_create 322
#define __NR_timerfd_settime 325
#define __NR_timerfd_gettime 326
#define __NR_signalfd4 327
is one more really such an issue?
Ingo
On Wed, Jan 7, 2009 at 12:53 PM, Roland McGrath <[email protected]> wrote:
> Do we really need another one for this? How about using signalfd plus
> setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> SIGCHLD? It's slightly more magical for the userland process to know to do
> that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> really have to add, maybe better.
Since waitfd shouldn't consume the child termination notification
waitfd should be more widely usable than the wait*() interfaces.
I.e., it's not necessary to restrict the use to parents. Any process
with the same UID should be allowed to call waitfd. This would allow
some new user cases.
On Wed, 7 Jan 2009, Ingo Molnar wrote:
>
> * Roland McGrath <[email protected]> wrote:
>
> > New syscall should have gone to linux-api, I think.
> >
> > Do we really need another one for this? How about using signalfd plus
> > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead
> > of SIGCHLD? It's slightly more magical for the userland process to know
> > to do that (fork -> clone SIGRTMIN). But compared to adding a syscall
> > we don't really have to add, maybe better.
>
> hm, i think it's cleaner conceptually than trying to wrap this into
> signalfd. Since we already have:
>
> #define __NR_signalfd 321
> #define __NR_timerfd_create 322
> #define __NR_timerfd_settime 325
> #define __NR_timerfd_gettime 326
> #define __NR_signalfd4 327
>
> is one more really such an issue?
And what did eventfd do to you? :)
I partially agree with Roland (and I stated this during Casey's first
post), this can be achieved in a not too troublesome way already.
A new dedicated interface is easier for the challenged userspace coder,
but I dunno if it's worth the new code (although it does have little
footprint). Both ways are fine from my POV.
- Davide
* Davide Libenzi <[email protected]> wrote:
> On Wed, 7 Jan 2009, Ingo Molnar wrote:
>
> >
> > * Roland McGrath <[email protected]> wrote:
> >
> > > New syscall should have gone to linux-api, I think.
> > >
> > > Do we really need another one for this? How about using signalfd plus
> > > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead
> > > of SIGCHLD? It's slightly more magical for the userland process to know
> > > to do that (fork -> clone SIGRTMIN). But compared to adding a syscall
> > > we don't really have to add, maybe better.
> >
> > hm, i think it's cleaner conceptually than trying to wrap this into
> > signalfd. Since we already have:
> >
> > #define __NR_signalfd 321
> > #define __NR_timerfd_create 322
> > #define __NR_timerfd_settime 325
> > #define __NR_timerfd_gettime 326
> > #define __NR_signalfd4 327
> >
> > is one more really such an issue?
>
> And what did eventfd do to you? :)
:)
> I partially agree with Roland (and I stated this during Casey's first
> post), this can be achieved in a not too troublesome way already. A new
> dedicated interface is easier for the challenged userspace coder, but I
> dunno if it's worth the new code (although it does have little
> footprint). Both ways are fine from my POV.
i think we should be defensive and generous and do a clean separate
syscall for this - we have a pretty bad track record in syscall interface
cleanliness, today's borderline hack is tomorrow's limitation causing
headaches. We never know what that extra flexibility that will buy us in
the future.
Ingo
On 01/07, Ulrich Drepper wrote:
>
> On Wed, Jan 7, 2009 at 12:53 PM, Roland McGrath <[email protected]> wrote:
> > Do we really need another one for this? How about using signalfd plus
> > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> > SIGCHLD? It's slightly more magical for the userland process to know to do
> > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> > really have to add, maybe better.
>
> Since waitfd shouldn't consume the child termination notification
> waitfd should be more widely usable than the wait*() interfaces.
yes, it doesn't eat the notification (SIGCHLD), but it reaps a
zombie, clears ->exit_code TASK_STOPPED/TASK_TRACED tasks, clears
SIGNAL_STOP_CONTINUED.
> I.e., it's not necessary to restrict the use to parents. Any process
> with the same UID should be allowed to call waitfd. This would allow
> some new user cases.
I don't see how it is possible to implement this...
The parent can sleep on ->wait_chldexit and it will be notified, but
how can we wait for the unrelated process with the same UID ?
Even if sys_waitfd() uses P_PID, we can't use task->parent->signal->wait_chldexit,
task->parent can exit before task exits.
Or I misunderstood you?
Oleg.
> > Since waitfd shouldn't consume the child termination notification
> > waitfd should be more widely usable than the wait*() interfaces.
waitid can be used that way with WNOWAIT.
Roland McGrath wrote:
>>> Since waitfd shouldn't consume the child termination notification
>>> waitfd should be more widely usable than the wait*() interfaces.
>
> waitid can be used that way with WNOWAIT.
Yes, but waitfd does not have this flag. The reason being waitfd just calls waitid internally, and there is no guarantee (afaik) that calling waitid with WNOWAIT multiple times in succession will yield different results each time. This breaks the streaming behavior of the descriptor.
--CJD
On 01/08, Casey Dahlin wrote:
>
> Roland McGrath wrote:
> >>> Since waitfd shouldn't consume the child termination notification
> >>> waitfd should be more widely usable than the wait*() interfaces.
> >
> > waitid can be used that way with WNOWAIT.
>
> Yes, but waitfd does not have this flag. The reason being waitfd just
> calls waitid internally, and there is no guarantee (afaik) that calling
> waitid with WNOWAIT multiple times in succession will yield different
> results each time. This breaks the streaming behavior of the descriptor.
Yes, do_wait(WNOWAIT) will return the same result on each call. Until
the next child dies, and this child is closer to the head of ->children
list.
But the reason is not that waitfd just calls waitid() internally. Let's
suppose we add another helper (or waitfd_read() can do this by hand) so
that read(waitfd_with_WNOWAIT_flag) correctly returns the info about all
childs. Now, what should the next read() do?
Btw. It is not that I am trying to argue against sys_waitfd(), but do
you have the "real life" example when it can be useful? Yes, poll().
But we have signalfd. SIGCHLD is not rt signal, but afaics this is not
the problem actually. Just curious.
Oleg.
On Thu, Jan 8, 2009 at 9:53 AM, Roland McGrath <[email protected]> wrote:
> New syscall should have gone to linux-api, I think.
Yes, precisely. This requirement has been documented in
SubmittingPatches for several months now. More details here:
http://thread.gmane.org/gmane.linux.ltp/5658
Casey, *please* don't submit a patch for a system call without also
providing a test program, and some attempt at userspace documentation.
(Andi already pointed this out. From my POV, I don't need you to
write a full blown man page -- if you send me the text, I'll do the
*roff stuff. But that text should accompany the patch that implements
the syscall.)
Cheers,
Michael
>
> Do we really need another one for this? How about using signalfd plus
> setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> SIGCHLD? It's slightly more magical for the userland process to know to do
> that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> really have to add, maybe better.
>
>
> Thanks,
> Roland
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/ Found a documentation bug?
http://www.kernel.org/doc/man-pages/reporting_bugs.html
On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote:
> New syscall should have gone to linux-api, I think.
>
> Do we really need another one for this? How about using signalfd plus
> setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> SIGCHLD? It's slightly more magical for the userland process to know to do
> that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> really have to add, maybe better.
>
Doesn't that also change the wait() options you need as well?
Scott
--
Scott James Remnant
[email protected]
On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote:
> New syscall should have gone to linux-api, I think.
>
> Do we really need another one for this? How about using signalfd plus
> setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> SIGCHLD? It's slightly more magical for the userland process to know to do
> that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> really have to add, maybe better.
>
This wouldn't help the init daemon case:
- the exit_signal is set on the child, not on the parent.
While the init daemon could clone() every new process and set
exit_signal, this would not be set for processes reparented to init.
Even if we had a new syscall to change the exit_signal of a given
process, *and* had the init reparent notification patch, this still
wouldn't be sufficient; you'd have a race condition between the time
you were notified of the reparent, and the time you set exit_signal,
in which the child could die.
Since exit_signal is always reset to SIGCHLD before reparenting, this
could be done by resetting it to a different signal; but at this point
we're getting into a rather twisty method full of traps.
- exit_signal is reset to SIGCHLD on exec().
Pretty much a plan-killer ;)
Scott
--
Scott James Remnant
[email protected]
On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote:
> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
> "flags" argument...
>
> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
> waitfd, not very good.
>
> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
> (->f_flags & O_NONBLOCK) as WNOHANG.
>
> (can't resist, ->ops is not the best name ;)
>
Definitely agree here, waitfd() doesn't need WNOHANG - we already have
ONONBLOCK.
That also solves one of the strangest behaves of waitid when you use
WNOHANG (it returns zero and you have to check whether it changed the
struct), now you just read() - if no child you get EAGAIN, if a child
you read a struct.
Scott
--
Scott James Remnant
[email protected]
On Thu, 2009-01-08 at 15:36 -0500, Casey Dahlin wrote:
> Roland McGrath wrote:
> >>> Since waitfd shouldn't consume the child termination notification
> >>> waitfd should be more widely usable than the wait*() interfaces.
> >
> > waitid can be used that way with WNOWAIT.
>
> Yes, but waitfd does not have this flag. The reason being waitfd just
> calls waitid internally, and there is no guarantee (afaik) that
> calling waitid with WNOWAIT multiple times in succession will yield
> different results each time. This breaks the streaming behavior of the
> descriptor.
>
This would definitely be a Nice To Have though!
Being able to use waitid() on another process of the same uid, with
WNOHANG, in a streaming fashion would be a *very* cool thing.
Scott
--
Scott James Remnant
[email protected]
On Thu, 2009-01-08 at 22:39 +0100, Oleg Nesterov wrote:
> Btw. It is not that I am trying to argue against sys_waitfd(), but do
> you have the "real life" example when it can be useful? Yes, poll().
> But we have signalfd. SIGCHLD is not rt signal, but afaics this is not
> the problem actually. Just curious.
>
signalfd() can't currently be made to work in the way you describe.
Scott
--
Scott James Remnant
[email protected]
On 01/10, Scott James Remnant wrote:
>
> On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote:
>
> > New syscall should have gone to linux-api, I think.
> >
> > Do we really need another one for this? How about using signalfd plus
> > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
> > SIGCHLD? It's slightly more magical for the userland process to know to do
> > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't
> > really have to add, maybe better.
> >
> This wouldn't help the init daemon case:
>
> - the exit_signal is set on the child, not on the parent.
>
> While the init daemon could clone() every new process and set
> exit_signal, this would not be set for processes reparented to init.
>
> Even if we had a new syscall to change the exit_signal of a given
> process, *and* had the init reparent notification patch, this still
> wouldn't be sufficient; you'd have a race condition between the time
> you were notified of the reparent, and the time you set exit_signal,
> in which the child could die.
>
> Since exit_signal is always reset to SIGCHLD before reparenting, this
> could be done by resetting it to a different signal; but at this point
> we're getting into a rather twisty method full of traps.
>
> - exit_signal is reset to SIGCHLD on exec().
>
> Pretty much a plan-killer ;)
I can't understand why should we change ->exit_signal if we want to
use signalfd. Yes, SIGCHLD is not rt. So what?
We do not need multiple signals in queue if we want to reap multiple
zombies. Once we have a single SIGCHLD (reported by signalfd or
whatever) we can do do_wait(WNOHANG) in a loop.
Confused.
Oleg.
On 01/10, Scott James Remnant wrote:
>
> On Thu, 2009-01-08 at 22:39 +0100, Oleg Nesterov wrote:
>
> > Btw. It is not that I am trying to argue against sys_waitfd(), but do
> > you have the "real life" example when it can be useful? Yes, poll().
> > But we have signalfd. SIGCHLD is not rt signal, but afaics this is not
> > the problem actually. Just curious.
> >
> signalfd() can't currently be made to work in the way you describe.
Hmm. Could you clarify?
I am not sure we are talking about the same thing, but afaics poll() +
signalfd can work to (say) reap the childs. Actually, ppoll() alone is
enough.
No?
Oleg.
On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote:
> I can't understand why should we change ->exit_signal if we want to
> use signalfd. Yes, SIGCHLD is not rt. So what?
>
> We do not need multiple signals in queue if we want to reap multiple
> zombies. Once we have a single SIGCHLD (reported by signalfd or
> whatever) we can do do_wait(WNOHANG) in a loop.
>
Well, a good reason why is that it makes things much easier to do in
userspace. The NOTES sections of many of the syscall manpages are
already too long with strange behaviours that you have to take into
account every time (and which most people fail to do).
You may as well ask why we have signalfd() at all, and what was wrong
with sigaction() and ordinary signal handlers? Well, lots of things
actually; for a start, you couldn't really do much in the signal
handler, so from userspace all we ended up doing was writing a byte to a
pipe so we could wake up our main loop.
You then had to remember to exhaust this pipe *before* checking what
signals were triggered, just in case a signal was delivered while you
were checking (so at least you'd wake the main loop up once more to
check).
signalfd() improves matters a lot; we don't have to worry about any
strange behaviour, we just read() to know a signal is pending. But
there were two competing implementations: one would just poll for read
if signals were pending, but have no other detail; the other (which is
what we have) gives us a siginfo_t for each pending signal.
Except or non-RT signals of course, in which it just gives us the
siginfo_t for the first pending signal of that type; future ones are
discarded.
I've suggested before that that's a bug in of itself, and that signalfd
should always queue signals even if they're non-RT. But since, other
than SIGCHLD, the only other signals with useful data are SIGILL,
SIGFPE, SIGSEGV and SIGBUS (which have si_addr) which you kinda need to
catch in a signal handler; and SIGPOLL (which has si_band and si_fd)
which is negated by being in poll anyway, that's kinda hard to argue.
So let's compare userspace code for trying to reap children using
signalfd(); to save space, I've omitted error handling and the
select()/poll()/etc. call - assuming that the top half is
initialisation, and the bottom half is inside the select() handler.
First, what we have today:
sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);
/* Block normal delivery and receive by sigfd instead */
sigprocmask (SIG_BLOCK, &mask, NULL);
sigfd = signalfd (-1, &mask, 0);
for (;;) {
read (sigfd, &fd_siginfo, sizeof siginfo);
/* throw away fd_siginfo, we're reading SIGCHLD and
* can't use it :-(
*/
/* SIGCHLD means _at_least_one_ child is pending, there
* may be more; so we have to loop AND expect to find
* nothing
*/
for (;;) {
/* ARGH! waitid returns 0 with WNOHANG if there
* are no children.
*
* AND the structure, despite being logically
* the same, isn't the same as the signalfd
* one :-/
*/
memset (&w_siginfo, 0, sizeof w_siginfo);
waitid (P_ALL, 0, &w_siginfo,
WEXITED | WNOHANG);
/* Did we find anything? */
if (! w_siginfo.si_pid)
break;
/* NOW we have the siginfo_t for a recently
* deceased process
*/
mourn (&w_siginfo);
}
/* Oh-HO!
* While we were looping in waitid(), other children may
* have died, and we probably already cleaned them up!
*
* But we'll still have a pending SIGCHLD, it might be
* tempting to clear that *BUT* the child could have
* died after the last waitid() call but before we clear
* it.
*
* We have no choice in this situation but to loop back
* through our entire main loop again, and do nothing.
*/
}
Pros:
- code exists today
Cons:
- having siginfo_t returned by read() is pointless, we can't use it
- double loop isn't pretty
- strange waitid() API in case of WNOHANG and no child
- incompatible structures for signalfd()'s read result and waitid(),
despite being logically the same structure! :-/
- can't simultaneously clear pending signal and wait, so we always have
to go back round the main loop if a child dies after the read()
In fact, that's not what userspace code does at all.
Since there's no point listening to SIGCHLD, it's a complete no-op, we
don't respond to it at all. We only need to use it to wake up the main
loop. The wait() loop tends to be at the bottom of the main loop
somewhere, completely outside of the fd processing.
Now, what if signalfd() would always queue pending signals even if
they're non-RT? This would also be the same code if we could change
SIGCHLD to SIGRTMIN for the *waiting* process's children:
sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);
/* Block normal delivery and receive by sigfd instead */
sigprocmask (SIG_BLOCK, &mask, NULL);
sigfd = signalfd (-1, &mask, 0);
for (;;) {
read (sigfd, &siginfo, sizeof siginfo);
/* siginfo is immediately useful!
*/
mourn (&siginfo);
/* we didn't clear off the wait queue, but that's easy
* since we have the pid from signalfd()
*/
waitid (P_PID, siginfo.si_pid, NULL, WEXITED);
}
Pros:
- single siginfo_t structure type returned by read()
- we get information for each signal, we don't need a signal loop to
find out what the signal is telling us
- exact match between the signal and the wait call
- no need to go round the main loop again just in case!
- child signal can now be processed just like anything else. This
makes "standard main loop functions" (g_main_loop, etc.) much easier
to write.
Cons:
- needs kernel patch
Personally, I think the fact this solves the case where you wait on a
process that wasn't part of the original set the signal was pending for,
so you have to process an extra SIGCHLD with nothing to do, is a good
enough reason in of itself.
The overhead of going back through a main loop of a userspace process
just to find out whether you already responded to a notification is a
waste of cycles.
That's pretty neat, much nicer than what we had before. So what about
waitfd() [I think I've slightly changed Casey's API here, or he changed
my proposed one <g>]?
wfd = waitfd (-1, P_ALL, 0, WEXITED);
for (;;) {
read (wfd, &siginfo, sizeof siginfo);
/* siginfo is immediately useful AND the process has
* been reaped!
*/
mourn (&siginfo);
}
Pros:
- we don't need to care about SIGCHLD anymore - why should we listen to
a notification to call wait, if we can just call wait?
- the absolute easiest for a generic main loop; signals, timers and
child death (as well as inotify, netlink, etc.) are now just cases of
"select an fd, read structs of known size, process them"
- possibility to allow waitfd (WNOWAIT) on children outside of your own
usual process tree - notification without reaping? Maybe too much?
Cons:
- needs kernel patch
- new API
Scott
--
Scott James Remnant
[email protected]
On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote:
> I am not sure we are talking about the same thing, but afaics poll() +
> signalfd can work to (say) reap the childs. Actually, ppoll() alone is
> enough.
>
Last time I checked, ppoll() was not actually implemented across all
architectures in a manner that solved the race it was intended to solve.
I'd be delighted to learn that this had been fixed? :-)
Soctt
--
Scott James Remnant
[email protected]
On 01/10, Scott James Remnant wrote:
>
> On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote:
>
> > I can't understand why should we change ->exit_signal if we want to
> > use signalfd. Yes, SIGCHLD is not rt. So what?
> >
> > We do not need multiple signals in queue if we want to reap multiple
> > zombies. Once we have a single SIGCHLD (reported by signalfd or
> > whatever) we can do do_wait(WNOHANG) in a loop.
> >
> Well, a good reason why is that it makes things much easier to do in
> userspace.
I never argued with this. And, let me repeat. I am not arguing against
waitfd! Actually, I always try to avoid the "do we need this feature"
discussions.
What I disagree with is that waitfd adds the functionality which does
not exists currently.
> You may as well ask why we have signalfd() at all, and what was wrong
> with sigaction() and ordinary signal handlers? Well, lots of things
Cough. You don't have to explain me why signalfd is nice ;) I participated
in discussion when it was created.
> So let's compare userspace code for trying to reap children using
> signalfd();
>
> First, what we have today:
>
> [...snip the code...]
>
> Pros:
> - code exists today
That is what I meant. Not more.
> Cons:
> - having siginfo_t returned by read() is pointless, we can't use it
Indeed. We use read() only to wait for the signal death.
> - double loop isn't pretty
Nice argument to add the new syscall ;)
> - strange waitid() API in case of WNOHANG and no child
Heh. I also don't like this ;) A reason for waitfd ?
> - incompatible structures for signalfd()'s read result and waitid(),
> despite being logically the same structure! :-/
I could blaim waitfd because it fills siginfo in the manner which
is not compatible with signalfd, despite logically the same structure.
> - can't simultaneously clear pending signal and wait, so we always have
> to go back round the main loop if a child dies after the read()
Can't understand... waitfd doesn't clear the signal too?
And you forget to mention another drwaback with the current code:
a lot of pathetic comments ;)
> Since there's no point listening to SIGCHLD, it's a complete no-op, we
> don't respond to it at all. We only need to use it to wake up the main
> loop.
Yes, sure, indeed, of course.
> The wait() loop tends to be at the bottom of the main loop
> somewhere, completely outside of the fd processing.
Huh.
> Now, what if signalfd() would always queue pending signals even if
> they're non-RT?
Well, I think this is off-topic, and more importantly I don't think
this change is possible.
> So what about
> waitfd()
Yes, the user-space code (for this particular artificial example)
becomes simpler. Following this logic, let's add sys_copyfile()
to kernel? From time to time I regret we don't have it...
(from another thread)
> > I am not sure we are talking about the same thing, but afaics poll() +
> > signalfd can work to (say) reap the childs. Actually, ppoll() alone is
> > enough.
> >
> Last time I checked, ppoll() was not actually implemented across all
> architectures in a manner that solved the race it was intended to solve.
>
> I'd be delighted to learn that this had been fixed? :-)
Scott, this is unfair. Yes, some arches do not implement restore_sigmask()
logic. So what? Let's suppose ppoll() has a bug. So, this means we should
add waitfd? No, let's fix ppol(), and waitfd is orthogonal. Imho.
Again, again, again. Please don't forget about "I am not arguing against".
But I don't buy your arguments.
Oleg.
On 01/10, Scott James Remnant wrote:
>
> On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote:
>
> > I am not sure we are talking about the same thing, but afaics poll() +
> > signalfd can work to (say) reap the childs. Actually, ppoll() alone is
> > enough.
> >
> Last time I checked, ppoll() was not actually implemented across all
> architectures in a manner that solved the race it was intended to solve.
>
As I said, this is imho unfair. But I mentioned ppol() "just in case".
My questiong was why do you think that "signalfd() can't currently be
made to work in the way you describe". You have dropped this part to
change the topic?
Oleg.
On Sat, 2009-01-10 at 19:21 +0100, Oleg Nesterov wrote:
> On 01/10, Scott James Remnant wrote:
> >
> > On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote:
> >
> > > I am not sure we are talking about the same thing, but afaics poll() +
> > > signalfd can work to (say) reap the childs. Actually, ppoll() alone is
> > > enough.
> > >
> > Last time I checked, ppoll() was not actually implemented across all
> > architectures in a manner that solved the race it was intended to solve.
> >
>
> As I said, this is imho unfair. But I mentioned ppol() "just in case".
>
> My questiong was why do you think that "signalfd() can't currently be
> made to work in the way you describe". You have dropped this part to
> change the topic?
>
Sorry, I may not be following LKML etiquette correctly. These couple of
recent threads (other than some bugs I found in wait last year) are my
first real attempt to participate here.
I wasn't intending to "change the topic" or dropping the parts about
changing signalfd() to somehow sweet it under the carpet.
Rather than posting repeatedly across the thread, I tried to consolidate
my responses into the other post you've replied to.
You made an interesting point about ppoll here, so I only responded to
that to find out whether the situation of that syscall had been
improved.
Not so much changing the topic, but asking a side-bar question ;)
Scott
--
Scott James Remnant
[email protected]
On Sat, 2009-01-10 at 19:13 +0100, Oleg Nesterov wrote:
> On 01/10, Scott James Remnant wrote:
> >
> > On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote:
> >
> > > I can't understand why should we change ->exit_signal if we want to
> > > use signalfd. Yes, SIGCHLD is not rt. So what?
> > >
> > > We do not need multiple signals in queue if we want to reap multiple
> > > zombies. Once we have a single SIGCHLD (reported by signalfd or
> > > whatever) we can do do_wait(WNOHANG) in a loop.
> > >
> > Well, a good reason why is that it makes things much easier to do in
> > userspace.
>
> I never argued with this. And, let me repeat. I am not arguing against
> waitfd! Actually, I always try to avoid the "do we need this feature"
> discussions.
>
Unless I'm misinterpreting you, you're saying that you don't understand
why we should change any current behaviour? My post is attempting to
illustrate why we should.
> What I disagree with is that waitfd adds the functionality which does
> not exists currently.
>
I'm not saying that it doesn't at all; in fact I gave an example of how
you implement the exact same functionality today.
Changing the behaviour of signalfd() or introducing a call like waitfd()
makes it easier to do things in userspace.
> > Cons:
> > - having siginfo_t returned by read() is pointless, we can't use it
>
> Indeed. We use read() only to wait for the signal death.
>
In fact, because main loops use select()/poll(), for the SIGCHLD case
you'd never use signalfd() at all!
Unless I'm missing something, the following two examples are identical
in behaviour:
using signalfd:
sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);
sigprocmask (SIG_BLOCK, &mask, NULL);
sigfd = signalfd (-1, &mask, 0);
/* prepare signal set */
FD_SET (sigfd, &readfds);
nfds = sigfd > nfds ? sigfd + 1 : nfds;
for (;;) {
select (nfds, &readfds, NULL, NULL, NULL);
if (FD_ISSET (sigfd, &readfds))
for (;;)
read (sigfd, &fdsi, sizeof fdsi);
/* SIGCHLD only notifies us that wait() can be called
* at least once without blocking.
*
* Since we have to call it multiple times anyway,
* and since it may block, all SIGCHLD really does it
* wakes up our main loop.
*
* Thus unconditionally loop on wait every time through
*/
while ((pid = waitpid (-1, &status, WNOHANG)) == 0)
mourn (pid, status);
}
using pselect:
sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);
sigprocmask (SIG_BLOCK, &mask, &origmask);
/* prepare signal set */
for (;;) {
pselect (nfds, &readnfds, NULL, NULL, NULL,
&origmask);
/* SIGCHLD only needs to wake up our main loop,
* unconditionally loop on wait every time through
*/
while ((pid = waitpid (-1, &status, WNOHANG)) == 0)
mourn (pid, status);
}
But the pselect() version is neater. Which is why I started the
previous reply off with "why have signalfd() at all?"
> > - can't simultaneously clear pending signal and wait, so we always have
> > to go back round the main loop if a child dies after the read()
>
> Can't understand... waitfd doesn't clear the signal too?
>
> And you forget to mention another drwaback with the current code:
> a lot of pathetic comments ;)
>
The comments were illustrating the sources of userspace frustration with
the current syscalls.
One of them was attempting to explain what you don't understand here,
I'll try and be more verbose...
We only want to be woken up in our main loop if we have something to do.
One of those things that we need to do is reap dead child processes.
The kernel provides a couple of methods of being woken up because of
that event; we can use signalfd() for SIGCHLD, or we could use pselect()
and otherwise mask SIGCHLD so that any pending signal wakes up the main
loop.
That notification only tells us that *at least one* process has died;
SIGCHLD may only be pending once at a time. If further children die
before we clear the signal, nothing will happen.
We clear the signal with read() on the signalfd, or it's inherently
cleared inside pselect() by allowing it to be delivered to ourselves.
It will not be cleared again until we read() again, or call pselect()
again.
Because it only tells us that *at least one* process has died, we have
to call waitpid() repeatedly until we have exhausted the wait queue.
~~Calling waitpid() does not clear the pending signal.~~
This is the important bit.
If a further process dies while we're inside the waitpid() loop, we will
most likely reap that straight away. But this does not clear the
pending signal. The main loop will be woken up again, even though it
does not need to be.
Thus:
- child process #1 dies
- main loop woken up by SIGCHLD
- pending status of signal cleared
- enter wait loop
- child process #2 dies
- SIGCHLD pending again
- waitpid() called first time, child process #1 reaped
- waitpid() called second time, child process #2 reaped
(SIGCHLD still pending)
- waitpid() called third time, no child processes remain
- exit wait loop
- back to top of main loop, immediately woken up by pending SIGCHLD
- pending status of signal cleared
- enter wait loop
- waitpid() called first time, but no child processes remain
(we reaped it last time round)
- exit wait loop
- back to top of main loop, sleep
A rookie mistake is to assume that you receive SIGCHLD every time a
child process dies. Nearly everyone writes this first time out:
signal (SIGCHLD, reaper);
void
reaper (int signum)
{
pid = waitpid (-1, &status, 0);
printf ("%d died! :-(\n", pid); // yes, I know
}
You may even be encouraged to believe that will work, because you've
read that SIGCHLD is masked while the signal handler is running. Of
course, we all know that many children may die between the time the
first child dies (that makes the signal pending) and the signal handler
actually executing.
W. Richard Stevens even makes this mistake in Advanced Programming in
the UNIX Environment (p. 605), he even justifies called waitpid() in the
signal handler because multiple children can die.
He doesn't help matters in the second edition of UNIX Network
Programming v1 where he defines exactly the same function (p. 122)
though he at least goes on to explain all the errors and define a
correct function that loops on waitpid() (p. 128)
Seasoned developers might smile wryly, but I've seen them make another
error. Presumably because they know about select()/poll(), they appear
to treat SIGCHLD accordingly.
"SIGCHLD is to waitpid() as select() is to read()"
or perhaps more verbosely:
"A pending SIGCHLD means that a call to waitpid() won't block"
We know that this is completely wrong.
SIGCHLD and the wait queue are entirely separate, the only connection is
that SIGCHLD will be made pending, if not already, when a new entry is
added to the wait queue.
However since the two are cleared by different means, the following
behaviours can all be exhibited:
- SIGCHLD not pending, but waitpid() will not block
This is true in all example usage; after you've called the read() on
the signalfd - or the pselect() has woken, SIGCHLD is probably no
longer pending but waitpid() will not block
Compare with select() behaviour; if you fail to read() from the fd,
select() wakes up yet again
- SIGCHLD pending, but waitpid() will block
This is true if you exhaust the wait queue in a loop,
All SIGCHLD is useful for is to get your main loop out of
select()/poll(); you must always exhaust the wait queue every time you
have woken up.
(I think we agreed on this, I'm trying to explain the next paragraph.)
Unfortunately because the wake up, and the wait queue, are *not*
connected; we can be woken up when we do not need to be.
> > The wait() loop tends to be at the bottom of the main loop
> > somewhere, completely outside of the fd processing.
>
> Huh.
>
From the above reasoning, SIGCHLD is only there to wake up your main
loop. If you fail to exhaust the wait queue, you will not be woken up
again until another child process dies, so you must exhaust the wait
queue.
Now, we can try and work out whether it was SIGCHLD that woke up our
main loop. If using signalfd(), we would read() and check ssi_pid; if
using pselect() we'd have to have a signal handler that sets a
sigatomic_t.
We could check that, and then only loop on waitpid() if set.
But why?
We've proved above that SIGCHLD being pending does not mean that
waitpid() will not block. We're not actually saving ourselves from the
extra iteration of the main loop, or even saving ourselves from the
first call to waitpid() that returns -1.
Arguably the extra effort to check whether SIGCHLD was pending is more
expensive (certainly in code readability) than just calling waitpid()
anyway.
Thus it's most common (certainly in the standard libraries I've read) to
just always loop on waitpid().
> > Now, what if signalfd() would always queue pending signals even if
> > they're non-RT?
>
> Well, I think this is off-topic, and more importantly I don't think
> this change is possible.
>
From the argument above, it should be reasonably clear that what this
patch is attempting to do is join up the "waking up the main loop" from
the "clearing of the wait queue".
In simpler words, make wait behave more like select().
waitfd() does that absolutely directly. It allows you to select() on
the wait queue itself. Clearing the wait queue, and the main loop
wake-up, are directly connected.
We eliminate the problem of the extra main loop wake-up and iteration,
because if we exhaust the wait queue then the waitfd socket will not
poll for reading.
We also eliminate the problem where if we fail to exhaust the wait
queue, we will not be woken up, because the waitfd socket will still
poll for reading.
If you're using waitfd, SIGCHLD is unnecessary.
I'm not personally arguing for waitfd() *itself*, just that there be
some way of coupling the main loop wake up with the actual contents of
the wait queue.
If signalfd() can be used for this, I'd be happy as a clam.
But today, signalfd() doesn't help at all because it still has all the
same drawbacks.
We have an extra main loop wake-up and iteration if a process dies while
we're exhausting the wait queue because SIGCHLD is pending again.
If we fail to exhaust the wait queue, the main loop will not be woken up
because SIGCHLD is not pending - despite the wait queue not being empty.
It's actually completely possible, in fact, it's almost a one-line
patch.
--- kernel/signal.c~ 2009-01-10 20:04:50.000000000 +0000
+++ kernel/signal.c 2009-01-10 20:05:24.000000000 +0000
@@ -816,8 +816,10 @@
* exactly one non-rt signal, so that we can get more
* detailed information about the cause of the signal.
*/
- if (legacy_queue(pending, sig))
+ if (legacy_queue(pending, sig)) {
+ signalfd_notify(t, sig);
return 0;
+ }
/*
* fast-pathed signals for kernel-internal things like SIGSTOP
* or SIGKILL.
There might be considerations here I haven't thought of, but it worked
pretty well for me.
This isn't waitfd(), you're not actually selecting on the wait queue.
But it means you'll have a queue of SIGCHLDs for each entry in the wait
queue.
You still have to remember that for each SIGCHLD you read from the
signalfd, you must call waitpid on that pid, but if you don't do that -
it'll still poll and you can go round again.
Personally I think waitfd() is more elegant, and has a rather nice
symmetry with the other system calls.
> > So what about
> > waitfd()
>
> Yes, the user-space code (for this particular artificial example)
> becomes simpler. Following this logic, let's add sys_copyfile()
> to kernel? From time to time I regret we don't have it...
>
Well, I don't think that argument is quite orthogonal because copyfile()
can be implemented in userspace with identical behaviour to a kernel
syscall (open, fstat, sendfile, chown/etc.).
A more orthogonal example would be pselect(). That implemented, in the
kernel, a syscall that it actually wasn't possible to implement in
userspace in _quite_ the same way. The procmask/select sequence was
known to be racy, so we all worked around it with an interrupt pipe.
The argument for waitfd() or similar in the kernel is because there are
races in userspace that we can't solve.
Hopefully you can see my argument now? :-)
> (from another thread)
>
As noted there, I replied separately and cut the discussion about this
because I didn't want to confuse the matter.
Scott
--
Scott James Remnant
[email protected]
Scott James Remnant wrote:
> On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote:
>
>
>> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
>> "flags" argument...
>>
>> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
>> waitfd, not very good.
>>
>> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
>> (->f_flags & O_NONBLOCK) as WNOHANG.
>>
>> (can't resist, ->ops is not the best name ;)
>>
>>
> Definitely agree here, waitfd() doesn't need WNOHANG - we already have
> ONONBLOCK.
>
> That also solves one of the strangest behaves of waitid when you use
> WNOHANG (it returns zero and you have to check whether it changed the
> struct), now you just read() - if no child you get EAGAIN, if a child
> you read a struct.
>
> Scott
>
From the perspective of waitfd, the only difference between WNOHANG and
O_NONBLOCK is which argument you put the flags in. The API should only
support one or the other, but internally they would imply the same thing.
--CJD
On Sat, 2009-01-10 at 16:14 -0500, Casey Dahlin wrote:
> From the perspective of waitfd, the only difference between WNOHANG and
> O_NONBLOCK is which argument you put the flags in. The API should only
> support one or the other, but internally they would imply the same thing.
>
Well, you get O_NONBLOCK for free by having a file descriptor; and you
can't turn off people trying to turn it on/off with fcntl() - so you may
as well just use that, no? :-)
Scott
--
Scott James Remnant
[email protected]
Scott James Remnant wrote:
> On Thu, 2009-01-08 at 15:36 -0500, Casey Dahlin wrote:
>
>
>> Roland McGrath wrote:
>>
>>>>> Since waitfd shouldn't consume the child termination notification
>>>>> waitfd should be more widely usable than the wait*() interfaces.
>>>>>
>>> waitid can be used that way with WNOWAIT.
>>>
>> Yes, but waitfd does not have this flag. The reason being waitfd just
>> calls waitid internally, and there is no guarantee (afaik) that
>> calling waitid with WNOWAIT multiple times in succession will yield
>> different results each time. This breaks the streaming behavior of the
>> descriptor.
>>
>>
> This would definitely be a Nice To Have though!
>
> Being able to use waitid() on another process of the same uid, with
> WNOHANG, in a streaming fashion would be a *very* cool thing.
>
> Scott
>
That could be done in a separate patch for waitid itself, and its a
simple change to propagate it through waitfd (just one less EINVAL
case). In terms of waitfd we'd be supersetting the interface, so its no
big deal to add. In terms of waitid I'd have to check and make sure that
we wouldn't be breaking compliance by allowing it to return something
different on every call (this would likely affect cases where
waitid(WNOWAIT) was used even in a normal fashion as well).
--CJD
Scott James Remnant wrote:
> On Sat, 2009-01-10 at 16:14 -0500, Casey Dahlin wrote:
>
>
>> From the perspective of waitfd, the only difference between WNOHANG and
>> O_NONBLOCK is which argument you put the flags in. The API should only
>> support one or the other, but internally they would imply the same thing.
>>
>>
> Well, you get O_NONBLOCK for free by having a file descriptor; and you
> can't turn off people trying to turn it on/off with fcntl() - so you may
> as well just use that, no? :-)
>
> Scott
>
Its purely an api question. We could easily take the WNOHANG flag and
just unset it when we get it and set O_NONBLOCK instead. We need
O_CLOEXEC anyway though, and the only reason to do it would be to get
rid of the O_ options and take only one type of flag (that and just a
little more waitid consistency).
--CJD
Scott James Remnant wrote:
> That's pretty neat, much nicer than what we had before. So what about
> waitfd() [I think I've slightly changed Casey's API here, or he changed
> my proposed one <g>]?
>
I think you might have changed the argument order slighly, but other
than that, counting the corrections I've made from feedback here, its
about the same.
--CJD
On 01/10, Scott James Remnant wrote:
>
> On Sat, 2009-01-10 at 19:13 +0100, Oleg Nesterov wrote:
>
> > I never argued with this. And, let me repeat. I am not arguing against
> > waitfd! Actually, I always try to avoid the "do we need this feature"
> > discussions.
> >
> Unless I'm misinterpreting you, you're saying that you don't understand
> why we should change any current behaviour? My post is attempting to
> illustrate why we should.
Scott. How many times should I repeat: I am _not_ arguing against
waitfd.
But to clarify, neither I vote for it. I don't really care. Except
I do care about the code if it will be merged, that is why I entered
this thread.
> > What I disagree with is that waitfd adds the functionality which does
> > not exists currently.
> >
> I'm not saying that it doesn't at all; in fact I gave an example of how
> you implement the exact same functionality today.
This means I was confused. Because I thought you point is we can't poll
for childs without signalfd. And all I asked was: why do you think so.
I do understand that waitfd can be handy.
> In fact, because main loops use select()/poll(), for the SIGCHLD case
> you'd never use signalfd() at all!
>
> Unless I'm missing something, the following two examples are identical
> in behaviour:
>
> using signalfd:
> ...
> using pselect:
Yes, and that is why I mentioned that ppoll() alone is enough.
> But the pselect() version is neater. Which is why I started the
> previous reply off with "why have signalfd() at all?"
Unlike waitfd, there are things which we just can not do without signalfd,
even if we have ppol/pselect. For example: wait for the signal, but not
dequeue it.
> One of them was attempting to explain what you don't understand here,
> I'll try and be more verbose...
> ...
> ~~Calling waitpid() does not clear the pending signal.~~
>
> This is the important bit.
>
> If a further process dies while we're inside the waitpid() loop, we will
> most likely reap that straight away. But this does not clear the
> pending signal. The main loop will be woken up again, even though it
> does not need to be.
>
> Thus:
>
> - child process #1 dies
> - main loop woken up by SIGCHLD
> - pending status of signal cleared
> - enter wait loop
> - child process #2 dies
> - SIGCHLD pending again
> - waitpid() called first time, child process #1 reaped
> - waitpid() called second time, child process #2 reaped
> (SIGCHLD still pending)
> - waitpid() called third time, no child processes remain
> - exit wait loop
> - back to top of main loop, immediately woken up by pending SIGCHLD
> - pending status of signal cleared
> - enter wait loop
> - waitpid() called first time, but no child processes remain
> (we reaped it last time round)
> - exit wait loop
> - back to top of main loop, sleep
Scott, I don't really understand why are you trying to explain this
all to me. I do understand this. At least I hope ;)
Yes this is possible, and I see no problems here.
> - SIGCHLD not pending, but waitpid() will not block
>
> This is true in all example usage; after you've called the read() on
> the signalfd - or the pselect() has woken, SIGCHLD is probably no
> longer pending but waitpid() will not block
>
> Compare with select() behaviour; if you fail to read() from the fd,
> select() wakes up yet again
>
> - SIGCHLD pending, but waitpid() will block
>
> This is true if you exhaust the wait queue in a loop,
... and this too.
> All SIGCHLD is useful for is to get your main loop out of
> select()/poll(); you must always exhaust the wait queue every time you
> have woken up.
Yes, and yes, and yes. Scott, I am sorry, I failed to read to the end
so perhaps I missed something ;)
> --- kernel/signal.c~ 2009-01-10 20:04:50.000000000 +0000
> +++ kernel/signal.c 2009-01-10 20:05:24.000000000 +0000
> @@ -816,8 +816,10 @@
> * exactly one non-rt signal, so that we can get more
> * detailed information about the cause of the signal.
> */
> - if (legacy_queue(pending, sig))
> + if (legacy_queue(pending, sig)) {
> + signalfd_notify(t, sig);
> return 0;
> + }
I'd prefer to not discuss this here, but I am not sure I understand.
There should not be no threads which need the wakeup from here, and
I can't see how this change can help.
> A more orthogonal example would be pselect(). That implemented, in the
> kernel, a syscall that it actually wasn't possible to implement in
> userspace
Yes, exactly,
> The argument for waitfd() or similar in the kernel is because there are
> races in userspace that we can't solve.
And now I don't understand you again. Please show me which races we
_can not_ solve in userspace without waitfd?
Yes we can race with the exiting childs while doing waitpid() in a loop,
so we can make the unnecessary syscall. But please do not tell me _this_
is the race we can't solve. This is _harmless_. Unlike the problems
with the poor user-space implementations of pselect/ppol.
Oleg.
On 01/10, Casey Dahlin wrote:
>
> Scott James Remnant wrote:
>> On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote:
>>
>>> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
>>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
>>> "flags" argument...
>>>
>>> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
>>> waitfd, not very good.
>>>
>>> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
>>> (->f_flags & O_NONBLOCK) as WNOHANG.
>>>
>>> (can't resist, ->ops is not the best name ;)
>>>
>>>
>> Definitely agree here, waitfd() doesn't need WNOHANG - we already have
>> ONONBLOCK.
>>
>> That also solves one of the strangest behaves of waitid when you use
>> WNOHANG (it returns zero and you have to check whether it changed the
>> struct), now you just read() - if no child you get EAGAIN, if a child
>> you read a struct.
>>
> From the perspective of waitfd, the only difference between WNOHANG and
> O_NONBLOCK is which argument you put the flags in.
No. Please see the note about ioctl/fcntl above.
Oleg.
Oleg Nesterov wrote:
> On 01/10, Casey Dahlin wrote:
>
>> Scott James Remnant wrote:
>>
>>> On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote:
>>>
>>>
>>>> Please note that unlike other sys_...fd() syscalls, sys_waitfd()
>>>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate
>>>> "flags" argument...
>>>>
>>>> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on
>>>> waitfd, not very good.
>>>>
>>>> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat
>>>> (->f_flags & O_NONBLOCK) as WNOHANG.
>>>>
>>>> (can't resist, ->ops is not the best name ;)
>>>>
>>>>
>>>>
>>> Definitely agree here, waitfd() doesn't need WNOHANG - we already have
>>> ONONBLOCK.
>>>
>>> That also solves one of the strangest behaves of waitid when you use
>>> WNOHANG (it returns zero and you have to check whether it changed the
>>> struct), now you just read() - if no child you get EAGAIN, if a child
>>> you read a struct.
>>>
>>>
>> From the perspective of waitfd, the only difference between WNOHANG and
>> O_NONBLOCK is which argument you put the flags in.
>>
>
> No. Please see the note about ioctl/fcntl above.
>
> Oleg.
>
Yes but the actual waitfd call could simply set O_NONBLOCK on the
descriptor when it receive WNOHANG in the flags, and read the descriptor
flags going forward.
--CJD
On 01/10, Casey Dahlin wrote:
>
> Oleg Nesterov wrote:
>>> From the perspective of waitfd, the only difference between WNOHANG and
>>> O_NONBLOCK is which argument you put the flags in.
>>
>> No. Please see the note about ioctl/fcntl above.
>>
>> Oleg.
>>
> Yes but the actual waitfd call could simply set O_NONBLOCK on the
> descriptor when it receive WNOHANG in the flags, and read the descriptor
> flags going forward.
Ah, I misunderstood your message as if we shouldn't check f_flags at all.
Yes sure.
Oleg.
On Sat, 10 Jan 2009, Scott James Remnant wrote:
> First, what we have today:
>
> sigemptyset (&mask);
> sigaddset (&mask, SIGCHLD);
>
> /* Block normal delivery and receive by sigfd instead */
> sigprocmask (SIG_BLOCK, &mask, NULL);
> sigfd = signalfd (-1, &mask, 0);
>
> for (;;) {
> read (sigfd, &fd_siginfo, sizeof siginfo);
>
> /* throw away fd_siginfo, we're reading SIGCHLD and
> * can't use it :-(
> */
>
> /* SIGCHLD means _at_least_one_ child is pending, there
> * may be more; so we have to loop AND expect to find
> * nothing
> */
> for (;;) {
> /* ARGH! waitid returns 0 with WNOHANG if there
> * are no children.
> *
> * AND the structure, despite being logically
> * the same, isn't the same as the signalfd
> * one :-/
> */
> memset (&w_siginfo, 0, sizeof w_siginfo);
>
> waitid (P_ALL, 0, &w_siginfo,
> WEXITED | WNOHANG);
>
> /* Did we find anything? */
> if (! w_siginfo.si_pid)
> break;
>
> /* NOW we have the siginfo_t for a recently
> * deceased process
> */
>
> mourn (&w_siginfo);
> }
>
That, once all the glamorous comments are removed, can be solved pretty
much with this much userspace code:
int waitfd(int flags) {
sigemptyset(&mask);
sigaddset(&mask, SIGCHLD);
sigprocmask(SIG_BLOCK, &mask, NULL);
return signalfd(-1, &mask, flags);
}
int waitfd_read(int fd, siginfo_t *si) {
for (;;) {
if (read(sigfd, &fd_siginfo,
sizeof fd_siginfo) != sizeof(fd_siginfo)
retrun 0;
memset(si, 0, sizeof *si);
waitid(P_ALL, 0, si, WEXITED | WNOHANG);
if (si->si_pid)
break;
}
return sizeof *si;
}
About the exit_signal, that would have made probably more sense for
it to be a parent property, instead of childs one. And be:
do_notify_parent(p, p->parent->exit_signal);
instead of:
do_notify_parent(p, p->exit_signal);
After all, is the parent that is going to make use of the notification,
not the child.
- Davide
On Sat, 10 Jan 2009, Oleg Nesterov wrote:
> > Thus:
> >
> > - child process #1 dies
> > - main loop woken up by SIGCHLD
> > - pending status of signal cleared
> > - enter wait loop
> > - child process #2 dies
> > - SIGCHLD pending again
> > - waitpid() called first time, child process #1 reaped
> > - waitpid() called second time, child process #2 reaped
> > (SIGCHLD still pending)
> > - waitpid() called third time, no child processes remain
> > - exit wait loop
> > - back to top of main loop, immediately woken up by pending SIGCHLD
> > - pending status of signal cleared
> > - enter wait loop
> > - waitpid() called first time, but no child processes remain
> > (we reaped it last time round)
> > - exit wait loop
> > - back to top of main loop, sleep
>
> Scott, I don't really understand why are you trying to explain this
> all to me. I do understand this. At least I hope ;)
Indeed :)
Scott, you're teaching Linux signals to the guy that is likely the
de-facto mantainer of the subsystem. See the irony?
- Davide
On Sat, Jan 10, 2009 at 4:57 PM, Oleg Nesterov <[email protected]> wrote:
> On 01/10, Scott James Remnant wrote:
>> On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote:
>> > Do we really need another one for this? ?How about using signalfd plus
>> > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of
>> > SIGCHLD? ?It's slightly more magical for the userland process to know to do
>> > that (fork -> clone SIGRTMIN). ?But compared to adding a syscall we don't
>> > really have to add, maybe better.
>> >
>> This wouldn't help the init daemon case:
>>
>> - the exit_signal is set on the child, not on the parent.
>>
>> ? While the init daemon could clone() every new process and set
>> ? exit_signal, this would not be set for processes reparented to init.
>>
>> ? Even if we had a new syscall to change the exit_signal of a given
>> ? process, *and* had the init reparent notification patch, this still
>> ? wouldn't be sufficient; you'd have a race condition between the time
>> ? you were notified of the reparent, and the time you set exit_signal,
>> ? in which the child could die.
>>
>> ? Since exit_signal is always reset to SIGCHLD before reparenting, this
>> ? could be done by resetting it to a different signal; but at this point
>> ? we're getting into a rather twisty method full of traps.
>>
>> - exit_signal is reset to SIGCHLD on exec().
>>
>> ? Pretty much a plan-killer ;)
>
> I can't understand why should we change ->exit_signal if we want to
> use signalfd. Yes, SIGCHLD is not rt. So what?
>
> We do not need multiple signals in queue if we want to reap multiple
> zombies. Once we have a single SIGCHLD (reported by signalfd or
> whatever) we can do do_wait(WNOHANG) in a loop.
>
> Confused.
I know I am terribly late for the party :)
"do_wait(WNOHANG) in a loop" is a performance problem.
Oleg, do you remember that strace bug when it was swamped
with gazillions of stop notifications from a multithreaded
task, then by dealing with them one-by-one it was causing
unfairness and ultimately "this program never finishes
when run under strace" bug?
And another typical nuisance that running multithreaded
stuff under strace is much slower, even with -e option
which limits the set of decoded syscalls?
Having waitfd would help both cases: strace can gulp
a lot of waitpid notifications in one go, and
batch process them.
--
vda
On 03/02, Denys Vlasenko wrote:
>
> On Sat, Jan 10, 2009 at 4:57 PM, Oleg Nesterov <[email protected]> wrote:
> >
> > We do not need multiple signals in queue if we want to reap multiple
> > zombies. Once we have a single SIGCHLD (reported by signalfd or
> > whatever) we can do do_wait(WNOHANG) in a loop.
> >
> > Confused.
>
> I know I am terribly late for the party :)
>
> "do_wait(WNOHANG) in a loop" is a performance problem.
Yes.
> Oleg, do you remember that strace bug when it was swamped
> with gazillions of stop notifications from a multithreaded
> task, then by dealing with them one-by-one it was causing
> unfairness and ultimately "this program never finishes
> when run under strace" bug?
Yes. But, iirc, this was not connected to the performance problems
with do_wait(). The problem was, strace did a single do_wait()
instead of wait-them-all.
> And another typical nuisance that running multithreaded
> stuff under strace is much slower, even with -e option
> which limits the set of decoded syscalls?
IIUC, this is also because strace is single-threaded, I mean it
doesn't scale well.
> Having waitfd would help both cases: strace can gulp
> a lot of waitpid notifications in one go, and
> batch process them.
Perhaps.
I do not know how much do_wait() contributes to the slowness
though. And it is not exactly clear how we can implement the
"fast" waitfd.
For example, this patch (iirc!) just calls do_wait() in a loop.
I doubt very much it can really help to improve the performance.
Oh. Can't resist. The real problem is that ptrace API should
not be per-thread, and it should not use wait() at all. But
this is offtopic.
Oleg.