2008-12-09 17:00:43

by Casey Dahlin

[permalink] [raw]
Subject: [RFC PATCH] waitfd: file descriptor to wait on child processes

This is essentially my first kernel patch, so be nice :)

Linux already has signalfd, timerfd, and eventfd to expose signals,
timers, and events via a file descriptor. This patch is a working
prototype for a fourth: waitfd. It pretty much does what the name
suggests: reading from it yields a series of status ints (as would be
written into the second argument of waitpid) for child processes that
have changed state. It takes essentially the same arguments as waitpid
(for now) and supports the same set of features.

This is far from ready for merge. Some things that are wrong with it:
* Waitpid's argument scheme probably isn't the best for this. By default
it makes it easiest to wait on a single child, which is not often useful
in this case. Waiting on all children or children in a particular
process group is possible, but not a particular, explicit set of
children, which we probably want (and which will complicate the
implementation significantly).
* The prototype for peek_waitpid is obviously in the wrong place, but I
haven't found a good home for it.
* Waitid's semantics have slightly altered: passing NULL as the pointer
to siginfo_t with WNOWAIT will now return successfully instead of
throwing EFAULT. I don't know if that means I broke it or fixed it :)
* peek_waitpid may not be required at all now. I can probably trick
sys_wait4 or sys_waitid into giving me what I want (or I could always
just make do_wait non-static).

Please provide thoughts.

--CJD

arch/x86/include/asm/unistd_32.h | 1 +
arch/x86/include/asm/unistd_64.h | 2 +
arch/x86/kernel/syscall_table_32.S | 1 +
fs/Makefile | 1 +
fs/waitfd.c | 113
++++++++++++++++++++++++++++++++++++
init/Kconfig | 10 +++
kernel/exit.c | 59 +++++++++++++++----
kernel/sys_ni.c | 1 +
8 files changed, 176 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/unistd_32.h
b/arch/x86/include/asm/unistd_32.h
index f2bba78..134d83c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_waitfd 333

#ifdef __KERNEL__

diff --git a/arch/x86/include/asm/unistd_64.h
b/arch/x86/include/asm/unistd_64.h
index d2e415e..b28eb07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_waitfd 295
+__SYSCALL(__NR_waitfd, sys_waitfd)


#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S
b/arch/x86/kernel/syscall_table_32.S
index d44395f..c796a8b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_waitfd
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..74c31fb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
+obj-$(CONFIG_WAITFD) += waitfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644
index 0000000..25b54d1
--- /dev/null
+++ b/fs/waitfd.c
@@ -0,0 +1,113 @@
+/*
+ * fs/waitfd.c
+ *
+ * Copyright (C) 2008 Red Hat, Casey Dahlin <[email protected]>
+ *
+ * Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+long peek_waitpid(pid_t upid, int options);
+
+struct waitfd_ctx {
+ int ops;
+ pid_t upid;
+};
+
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ long value;
+
+ poll_wait(file, &current->signal->wait_chldexit, wait);
+
+ value = peek_waitpid(ctx->upid, ctx->ops);
+ if (value > 0) {
+ return POLLIN;
+ } if (value == -ECHILD) {
+ return POLLIN;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns a multiple of the size of a "struct waitfd_siginfo", or a
negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct waitfd_siginfo".
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t
count,
+ loff_t *ppos)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ int __user *stat_addr = (int *)buf;
+ int nonblock = file->f_flags & O_NONBLOCK ? WNOHANG: 0;
+ ssize_t ret, total = 0;
+
+ count /= sizeof(int);
+ if (!count)
+ return -EINVAL;
+
+ do {
+ ret = sys_wait4(ctx->upid, stat_addr, ctx->ops | nonblock,
+ NULL);
+ if (ret == 0)
+ ret = -EAGAIN;
+ if (ret == -ECHILD)
+ ret = 0;
+ if (ret <= 0)
+ break;
+
+ stat_addr++;
+ total += sizeof(struct siginfo);
+ nonblock = WNOHANG;
+ } while (--count);
+
+ return total ? total: ret;
+}
+
+static const struct file_operations waitfd_fops = {
+ .release = waitfd_release,
+ .poll = waitfd_poll,
+ .read = waitfd_read,
+};
+
+asmlinkage long sys_waitfd(pid_t upid, int ops)
+{
+ int ufd;
+ struct waitfd_ctx *ctx;
+
+ if (ops & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = ops;
+ ctx->upid = upid;
+
+ ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
+ (ops & WNOHANG) ? O_NONBLOCK : 0);
+ if (ufd < 0)
+ kfree(ctx);
+
+ return ufd;
+}
diff --git a/init/Kconfig b/init/Kconfig
index f763762..bc34871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config EPOLL
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.

+config WAITFD
+ bool "Enable waitfd() system call" if EMBEDDED
+ select ANON_INODES
+ default y
+ help
+ Enable the waitfd() system call that allows receving child state
+ changes from a file descriptor.
+
+ If unsure, say Y.
+
config SIGNALFD
bool "Enable signalfd() system call" if EMBEDDED
select ANON_INODES
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..e69044b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct
task_struct *p, pid_t pid, uid_t uid,
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;

put_task_struct(p);
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
+ if (infop) {
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
if (!retval)
retval = pid;
return retval;
@@ -1794,6 +1796,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
return ret;
}

+long peek_waitpid(pid_t upid, int options)
+{
+ struct pid *pid = NULL;
+ enum pid_type type;
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ options |= WNOHANG | WNOWAIT;
+
+ if (upid == -1)
+ type = PIDTYPE_MAX;
+ else if (upid < 0) {
+ type = PIDTYPE_PGID;
+ pid = find_get_pid(-upid);
+ } else if (upid == 0) {
+ type = PIDTYPE_PGID;
+ pid = get_pid(task_pgrp(current));
+ } else /* upid > 0 */ {
+ type = PIDTYPE_PID;
+ pid = find_get_pid(upid);
+ }
+
+ ret = do_wait(type, pid, options | WEXITED, NULL, NULL, NULL);
+ put_pid(pid);
+
+ /* avoid REGPARM breakage on x86: */
+ asmlinkage_protect(4, ret, upid, options);
+ return ret;
+}
+
asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
int options, struct rusage __user *ru)
{
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..e8d4da6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
cond_syscall(sys_ioprio_get);

/* New file descriptors */
+cond_syscall(sys_waitfd);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);


2008-12-09 17:06:03

by Alan

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

> prototype for a fourth: waitfd. It pretty much does what the name
> suggests: reading from it yields a series of status ints (as would be
> written into the second argument of waitpid) for child processes that
> have changed state. It takes essentially the same arguments as waitpid
> (for now) and supports the same set of features.

This propogates the fundamental braindamage of waitpid - the fact the
notification only works on child process trees.

Here is a more elegant suggestion - use epoll, inotify and friends fully
on /proc process nodes.

Alan

2008-12-09 17:50:27

by Scott James Remnant

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Tue, 2008-12-09 at 17:05 +0000, Alan Cox wrote:

> Here is a more elegant suggestion - use epoll, inotify and friends fully
> on /proc process nodes.
>
Could you give an example?

What is supported?

Scott
--
Have you ever, ever felt like this?
Had strange things happen? Are you going round the twist?


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2008-12-09 18:46:54

by Casey Dahlin

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

Alan Cox wrote:
>> prototype for a fourth: waitfd. It pretty much does what the name
>> suggests: reading from it yields a series of status ints (as would be
>> written into the second argument of waitpid) for child processes that
>> have changed state. It takes essentially the same arguments as waitpid
>> (for now) and supports the same set of features.
>>
>
> This propogates the fundamental braindamage of waitpid - the fact the
> notification only works on child process trees.
>
> Here is a more elegant suggestion - use epoll, inotify and friends fully
> on /proc process nodes.
>
> Alan
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Last I checked inotify was not supported in /proc, or at least most of
it. What kind of work load is it to change that?

--CJD

2008-12-09 19:05:13

by Alan

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

> > This propogates the fundamental braindamage of waitpid - the fact the
> > notification only works on child process trees.
> >
> > Here is a more elegant suggestion - use epoll, inotify and friends fully
> > on /proc process nodes.

> Last I checked inotify was not supported in /proc, or at least most of
> it. What kind of work load is it to change that?

I don't know but I think it would be the better approach to find it. That
also separates notification of state to parents from the general problem
of wanting to know when a service has died, which seems to be an ever
more common point of interest on the desktop in particular.

File content change notification for /proc is hard because the contents
don't exist in the normal way and get updated but can be done if there is
a wait queue for the job. Actual changes to structure (new directories
etc) is in part a similar problem but there are clear points already in
existence when the proc nodes are created/destroyed and thus notification
can occur.

2008-12-09 19:21:30

by Casey Dahlin

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

Alan Cox wrote:
>>> This propogates the fundamental braindamage of waitpid - the fact the
>>> notification only works on child process trees.
>>>
>>> Here is a more elegant suggestion - use epoll, inotify and friends fully
>>> on /proc process nodes.
>>>
>
>
>> Last I checked inotify was not supported in /proc, or at least most of
>> it. What kind of work load is it to change that?
>>
>
> I don't know but I think it would be the better approach to find it. That
> also separates notification of state to parents from the general problem
> of wanting to know when a service has died, which seems to be an ever
> more common point of interest on the desktop in particular.
>
>
Of course, using inotify on proc will not (and should not) actually reap
dead processes, meaning waitpid() isn't obviated by the change (though
now it is always called on a specific pid and is never expected to block
or EAGAIN). We also introduce a new gotcha for userspace programs: this
mechanism works identically for child and non-child processes, so a
process may or may not be waitable when returned. A simple "do not shoot
self in foot" should suffice for this though.

Also, it doesn't work if /proc hasn't been mounted, which just so
happens to matter for my particular use cases :)
> File content change notification for /proc is hard because the contents
> don't exist in the normal way and get updated but can be done if there is
> a wait queue for the job. Actual changes to structure (new directories
> etc) is in part a similar problem but there are clear points already in
> existence when the proc nodes are created/destroyed and thus notification
> can occur.
>
>
Changes to structure are more interesting in terms of this particular
problem anyway, and definitely simpler to capture.

--CJD

2008-12-09 19:41:54

by Davide Libenzi

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Tue, 9 Dec 2008, Casey Dahlin wrote:

> This is essentially my first kernel patch, so be nice :)
>
> Linux already has signalfd, timerfd, and eventfd to expose signals, timers,
> and events via a file descriptor. This patch is a working prototype for a
> fourth: waitfd. It pretty much does what the name suggests: reading from it
> yields a series of status ints (as would be written into the second argument
> of waitpid) for child processes that have changed state. It takes essentially
> the same arguments as waitpid (for now) and supports the same set of features.
>
> This is far from ready for merge. Some things that are wrong with it:
> * Waitpid's argument scheme probably isn't the best for this. By default it
> makes it easiest to wait on a single child, which is not often useful in this
> case. Waiting on all children or children in a particular process group is
> possible, but not a particular, explicit set of children, which we probably
> want (and which will complicate the implementation significantly).
> * The prototype for peek_waitpid is obviously in the wrong place, but I
> haven't found a good home for it.
> * Waitid's semantics have slightly altered: passing NULL as the pointer to
> siginfo_t with WNOWAIT will now return successfully instead of throwing
> EFAULT. I don't know if that means I broke it or fixed it :)
> * peek_waitpid may not be required at all now. I can probably trick sys_wait4
> or sys_waitid into giving me what I want (or I could always just make do_wait
> non-static).
>
> Please provide thoughts.

What's wrong in having a signalfd on SIGCHLD, than doing waitpid() once
you get the signal? Do you have cases where this wouldn't be OK?
About the code, eventually, you really want to report the PID of the
exited child, together with the status. So maybe a non-compat-requiring
struct would be better to be returned by read(2).
Also ...




> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + long value;
> +
> + poll_wait(file, &current->signal->wait_chldexit, wait);
> +
> + value = peek_waitpid(ctx->upid, ctx->ops);
> + if (value > 0) {
> + return POLLIN;
> + } if (value == -ECHILD) {
> + return POLLIN;
> + }

Trust the compiler, it's pretty good in not having you to add Perl-like
extra brackets ;)
This also looks wierd:

} if (value == -ECHILD) {

So maybe:

return value > 0 || value == -ECHILD ? POLLIN: 0;





> +/*
> + * Returns a multiple of the size of a "struct waitfd_siginfo", or a negative
> + * error code. The "count" parameter must be at least the size of a
> + * "struct waitfd_siginfo".
> + */

Really? ...



> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
> + loff_t *ppos)
> +{
> + struct waitfd_ctx *ctx = file->private_data;
> + int __user *stat_addr = (int *)buf;
> + int nonblock = file->f_flags & O_NONBLOCK ? WNOHANG: 0;
> + ssize_t ret, total = 0;
> +
> + count /= sizeof(int);
> + if (!count)
> + return -EINVAL;
> +
> + do {
> + ret = sys_wait4(ctx->upid, stat_addr, ctx->ops | nonblock,
> + NULL);
> + if (ret == 0)
> + ret = -EAGAIN;
> + if (ret == -ECHILD)
> + ret = 0;
> + if (ret <= 0)
> + break;
> +
> + stat_addr++;
> + total += sizeof(struct siginfo);
> + nonblock = WNOHANG;
> + } while (--count);
> +
> + return total ? total: ret;
> +}

... looks like you're returning a sequence of status ints, with wrong data
size returned by read(2).




> +
> +static const struct file_operations waitfd_fops = {
> + .release = waitfd_release,
> + .poll = waitfd_poll,
> + .read = waitfd_read,
> +};
> +
> +asmlinkage long sys_waitfd(pid_t upid, int ops)

Please leave space for extra flags for fds, otherwise Uli will have to
make another sys_waitfd3().



> +long peek_waitpid(pid_t upid, int options)
> +{
> + struct pid *pid = NULL;
> + enum pid_type type;
> + long ret;
> +
> + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
> + __WNOTHREAD|__WCLONE|__WALL))
> + return -EINVAL;
> +
> + options |= WNOHANG | WNOWAIT;
> +
> + if (upid == -1)
> + type = PIDTYPE_MAX;
> + else if (upid < 0) {
> + type = PIDTYPE_PGID;
> + pid = find_get_pid(-upid);
> + } else if (upid == 0) {
> + type = PIDTYPE_PGID;
> + pid = get_pid(task_pgrp(current));
> + } else /* upid > 0 */ {
> + type = PIDTYPE_PID;
> + pid = find_get_pid(upid);
> + }
> +
> + ret = do_wait(type, pid, options | WEXITED, NULL, NULL, NULL);
> + put_pid(pid);
> +
> + /* avoid REGPARM breakage on x86: */
> + asmlinkage_protect(4, ret, upid, options);
> + return ret;
> +}

Given that you blinded copied this from sys_wait4(), you may want to at
least try to make sys_wait4() to re-use the new function.
Also, your patch does not apply to latest Linus tree. Which one was the
base?



- Davide

2008-12-09 20:09:27

by Casey Dahlin

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

Davide Libenzi wrote:
> On Tue, 9 Dec 2008, Casey Dahlin wrote:
>
>
>> This is essentially my first kernel patch, so be nice :)
>>
>> Linux already has signalfd, timerfd, and eventfd to expose signals, timers,
>> and events via a file descriptor. This patch is a working prototype for a
>> fourth: waitfd. It pretty much does what the name suggests: reading from it
>> yields a series of status ints (as would be written into the second argument
>> of waitpid) for child processes that have changed state. It takes essentially
>> the same arguments as waitpid (for now) and supports the same set of features.
>>
>> This is far from ready for merge. Some things that are wrong with it:
>> * Waitpid's argument scheme probably isn't the best for this. By default it
>> makes it easiest to wait on a single child, which is not often useful in this
>> case. Waiting on all children or children in a particular process group is
>> possible, but not a particular, explicit set of children, which we probably
>> want (and which will complicate the implementation significantly).
>> * The prototype for peek_waitpid is obviously in the wrong place, but I
>> haven't found a good home for it.
>> * Waitid's semantics have slightly altered: passing NULL as the pointer to
>> siginfo_t with WNOWAIT will now return successfully instead of throwing
>> EFAULT. I don't know if that means I broke it or fixed it :)
>> * peek_waitpid may not be required at all now. I can probably trick sys_wait4
>> or sys_waitid into giving me what I want (or I could always just make do_wait
>> non-static).
>>
>> Please provide thoughts.
>>
>
> What's wrong in having a signalfd on SIGCHLD, than doing waitpid() once
> you get the signal? Do you have cases where this wouldn't be OK?
>

When the child doesn't send SIGCHLD (clone() lets you do evil things :)

Seriously though, that may be an option. Scott (CC'd ) is primarily the
consumer of this, so he can better comment.

> About the code, eventually, you really want to report the PID of the
> exited child, together with the status. So maybe a non-compat-requiring
> struct would be better to be returned by read(2).
> Also ...
>
>
>
>
>
>> +static unsigned int waitfd_poll(struct file *file, poll_table *wait)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + long value;
>> +
>> + poll_wait(file, &current->signal->wait_chldexit, wait);
>> +
>> + value = peek_waitpid(ctx->upid, ctx->ops);
>> + if (value > 0) {
>> + return POLLIN;
>> + } if (value == -ECHILD) {
>> + return POLLIN;
>> + }
>>
>
> Trust the compiler, it's pretty good in not having you to add Perl-like
> extra brackets ;)
> This also looks wierd:
>
> } if (value == -ECHILD) {
>
> So maybe:
>
> return value > 0 || value == -ECHILD ? POLLIN: 0;
>
>
>

Yeah, I ripped this up because during debugging poll was behaving oddly,
and I had been instrumenting the code with printks. It can go back to
being 1 line now.

>
>
>
>> +/*
>> + * Returns a multiple of the size of a "struct waitfd_siginfo", or a negative
>> + * error code. The "count" parameter must be at least the size of a
>> + * "struct waitfd_siginfo".
>> + */
>>
>
> Really? ...
>
>
>

Wow... and I read this patch before I sent it, too *facepalm*

The function did briefly return a siginfo_t (I implemented on top of
waitid first).

>
>> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
>> + loff_t *ppos)
>> +{
>> + struct waitfd_ctx *ctx = file->private_data;
>> + int __user *stat_addr = (int *)buf;
>> + int nonblock = file->f_flags & O_NONBLOCK ? WNOHANG: 0;
>> + ssize_t ret, total = 0;
>> +
>> + count /= sizeof(int);
>> + if (!count)
>> + return -EINVAL;
>> +
>> + do {
>> + ret = sys_wait4(ctx->upid, stat_addr, ctx->ops | nonblock,
>> + NULL);
>> + if (ret == 0)
>> + ret = -EAGAIN;
>> + if (ret == -ECHILD)
>> + ret = 0;
>> + if (ret <= 0)
>> + break;
>> +
>> + stat_addr++;
>> + total += sizeof(struct siginfo);
>> + nonblock = WNOHANG;
>> + } while (--count);
>> +
>> + return total ? total: ret;
>> +}
>>
>
> ... looks like you're returning a sequence of status ints, with wrong data
> size returned by read(2).
>
>
>
More leftovers from above.
>
>
>> +
>> +static const struct file_operations waitfd_fops = {
>> + .release = waitfd_release,
>> + .poll = waitfd_poll,
>> + .read = waitfd_read,
>> +};
>> +
>> +asmlinkage long sys_waitfd(pid_t upid, int ops)
>>
>
> Please leave space for extra flags for fds, otherwise Uli will have to
> make another sys_waitfd3().
>
>
>
ack'd.
>
>> +long peek_waitpid(pid_t upid, int options)
>> +{
>> + struct pid *pid = NULL;
>> + enum pid_type type;
>> + long ret;
>> +
>> + if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
>> + __WNOTHREAD|__WCLONE|__WALL))
>> + return -EINVAL;
>> +
>> + options |= WNOHANG | WNOWAIT;
>> +
>> + if (upid == -1)
>> + type = PIDTYPE_MAX;
>> + else if (upid < 0) {
>> + type = PIDTYPE_PGID;
>> + pid = find_get_pid(-upid);
>> + } else if (upid == 0) {
>> + type = PIDTYPE_PGID;
>> + pid = get_pid(task_pgrp(current));
>> + } else /* upid > 0 */ {
>> + type = PIDTYPE_PID;
>> + pid = find_get_pid(upid);
>> + }
>> +
>> + ret = do_wait(type, pid, options | WEXITED, NULL, NULL, NULL);
>> + put_pid(pid);
>> +
>> + /* avoid REGPARM breakage on x86: */
>> + asmlinkage_protect(4, ret, upid, options);
>> + return ret;
>> +}
>>
>
> Given that you blinded copied this from sys_wait4(), you may want to at
> least try to make sys_wait4() to re-use the new function.
>

Good idea.

> Also, your patch does not apply to latest Linus tree. Which one was the
> base?
>
>
This is the last commit before mine in my git repo:

commit f7a8db89c1f42e504bb12d2ae399cd96f755a7db
Merge: 6f84b4d... c49b9f2...
Author: Linus Torvalds <[email protected]>
Date: Mon Dec 8 19:52:43 2008 -0800

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
tproxy: fixe a possible read from an invalid location in the
socket match
zd1211rw: use unaligned safe memcmp() in-place of compare_ether_addr()
(...lots and lots of changes snipped...)

--CJD

2008-12-13 02:58:10

by Scott James Remnant

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Tue, 2008-12-09 at 11:41 -0800, Davide Libenzi wrote:

> On Tue, 9 Dec 2008, Casey Dahlin wrote:
>
> > Linux already has signalfd, timerfd, and eventfd to expose signals, timers,
> > and events via a file descriptor. This patch is a working prototype for a
> > fourth: waitfd. It pretty much does what the name suggests: reading from it
> > yields a series of status ints (as would be written into the second argument
> > of waitpid) for child processes that have changed state. It takes essentially
> > the same arguments as waitpid (for now) and supports the same set of features.
> >
> What's wrong in having a signalfd on SIGCHLD, than doing waitpid() once
> you get the signal?
>
Because SIGCHLD isn't a POSIX realtime signal, only one copy of it will
be queued at any one time -- even with signalfd(), and even though they
have different (useful) siginfo_t.

So if you have three children die in rapid succession, you only get the
siginfo for the first one. Thus you still have to call
waitid()/waitpid() in a loop, and wait on everything.

Could the fact that you don't get signalfd notification of the
additional signals be considered a bug? Or possibly a useful additional
feature?

If we were able to read all the queued SIGCHLD signals with signalfd
(preserving the one pending only behaviour of ordinary delivery), then a
loop like the following would be possible:

sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);

sfd = signalfd (-1, &mask, 0);

for (;;) {
read (sfd, &fdsi, sizeof (struct signalfd_siginfo));

waitpid (fdsi.ssi_pid, 0, 0);
}

So you only need to wait for each one individually.

Scott
--
Have you ever, ever felt like this?
Had strange things happen? Are you going round the twist?


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2008-12-13 04:29:55

by Davide Libenzi

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Fri, 12 Dec 2008, Scott James Remnant wrote:

> On Tue, 2008-12-09 at 11:41 -0800, Davide Libenzi wrote:
>
> > On Tue, 9 Dec 2008, Casey Dahlin wrote:
> >
> > > Linux already has signalfd, timerfd, and eventfd to expose signals, timers,
> > > and events via a file descriptor. This patch is a working prototype for a
> > > fourth: waitfd. It pretty much does what the name suggests: reading from it
> > > yields a series of status ints (as would be written into the second argument
> > > of waitpid) for child processes that have changed state. It takes essentially
> > > the same arguments as waitpid (for now) and supports the same set of features.
> > >
> > What's wrong in having a signalfd on SIGCHLD, than doing waitpid() once
> > you get the signal?
> >
> Because SIGCHLD isn't a POSIX realtime signal, only one copy of it will
> be queued at any one time -- even with signalfd(), and even though they
> have different (useful) siginfo_t.
>
> So if you have three children die in rapid succession, you only get the
> siginfo for the first one. Thus you still have to call
> waitid()/waitpid() in a loop, and wait on everything.
>
> Could the fact that you don't get signalfd notification of the
> additional signals be considered a bug? Or possibly a useful additional
> feature?
>
> If we were able to read all the queued SIGCHLD signals with signalfd
> (preserving the one pending only behaviour of ordinary delivery), then a
> loop like the following would be possible:
>
> sigemptyset (&mask);
> sigaddset (&mask, SIGCHLD);
>
> sfd = signalfd (-1, &mask, 0);
>
> for (;;) {
> read (sfd, &fdsi, sizeof (struct signalfd_siginfo));
>
> waitpid (fdsi.ssi_pid, 0, 0);
> }

And how about this?

sfd = signalfd(SIGCHLD);
for (;;) {
poll(sfd, POLLIN);
while ((pid = waitpid(0, &status, WNOHANG)) != -1)
process_child_death(pid);
}



- Davide

2008-12-13 08:43:28

by Scott James Remnant

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Fri, 2008-12-12 at 20:29 -0800, Davide Libenzi wrote:

> And how about this?
>
> sfd = signalfd(SIGCHLD);
>
> for (;;) {
> poll(sfd, POLLIN);
> while ((pid = waitpid(0, &status, WNOHANG)) != -1)
> process_child_death(pid);
> }
>
At this point, why have signalfd()'s read() return siginfo_t at all?

You have to discard the entire information since it's only ever the
first signal that matched, all subsequent ones are thrown away.

Scott
--
Have you ever, ever felt like this?
Had strange things happen? Are you going round the twist?


Attachments:
signature.asc (197.00 B)
This is a digitally signed message part

2008-12-13 18:39:27

by Davide Libenzi

[permalink] [raw]
Subject: Re: [RFC PATCH] waitfd: file descriptor to wait on child processes

On Sat, 13 Dec 2008, Scott James Remnant wrote:

> On Fri, 2008-12-12 at 20:29 -0800, Davide Libenzi wrote:
>
> > And how about this?
> >
> > sfd = signalfd(SIGCHLD);
> >
> > for (;;) {
> > poll(sfd, POLLIN);
> > while ((pid = waitpid(0, &status, WNOHANG)) != -1)
> > process_child_death(pid);
> > }
> >
> At this point, why have signalfd()'s read() return siginfo_t at all?
>
> You have to discard the entire information since it's only ever the
> first signal that matched, all subsequent ones are thrown away.

Signalfd is a wakeup gate available to the poll/select/epoll subsystems.
Signals that are queued inside the kernel, get de-queueing behavior, while
signals that are overlapping, get overlapping behavior.
The code above enable you to wait for signals inside a poll/select/epoll
dispatch loop, and get all the info waitpid() can return to you - w/out
missing anything.
In a real app case, you're probably waiting for more than simple SIGCHLD,
so you likely will have one (or more) read(2) of the signalfd to find out
which signal you got, and do the waitpid(WNOHANG) loop in case of SIGCHLD.



- Davide