This patch adds a new file under /proc/pid, /proc/pid/exithand.
Attempting to read from an exithand file will block until the
corresponding process exits, at which point the read will successfully
complete with EOF. The file descriptor supports both blocking
operations and poll(2). It's intended to be a minimal interface for
allowing a program to wait for the exit of a process that is not one
of its children.
Why might we want this interface? Android's lmkd kills processes in
order to free memory in response to various memory pressure
signals. It's desirable to wait until a killed process actually exits
before moving on (if needed) to killing the next process. Since the
processes that lmkd kills are not lmkd's children, lmkd currently
lacks a way to wait for a process to actually die after being sent
SIGKILL; today, lmkd resorts to polling the proc filesystem pid
entry. This interface allows lmkd to give up polling and instead block
and wait for process death.
Signed-off-by: Daniel Colascione <[email protected]>
---
fs/proc/Makefile | 1 +
fs/proc/base.c | 1 +
fs/proc/exithand.c | 117 +++++++++++++++++++++++++++++++++++
fs/proc/internal.h | 4 ++
include/linux/sched/signal.h | 7 +++
kernel/exit.c | 2 +
kernel/signal.c | 3 +
7 files changed, 135 insertions(+)
create mode 100644 fs/proc/exithand.c
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e80510..21322280a2c1 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,7 @@ proc-y += softirqs.o
proc-y += namespaces.o
proc-y += self.o
proc-y += thread_self.o
+proc-y += exithand.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf260d..31bc6bbb6dc4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3006,6 +3006,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+ REG("exithand", S_IRUGO, proc_tgid_exithand_operations),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/exithand.c b/fs/proc/exithand.c
new file mode 100644
index 000000000000..358b08da6a08
--- /dev/null
+++ b/fs/proc/exithand.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Synchronous exit notification of non-child processes
+ *
+ * Simple file descriptor /proc/pid/exithand. Read blocks (and poll
+ * reports non-readable) until process either dies or becomes
+ * a zombie.
+ */
+#include <linux/printk.h>
+#include <linux/sched/signal.h>
+#include <linux/poll.h>
+#include "internal.h"
+
+static int proc_tgid_exithand_open(struct inode *inode, struct file *file)
+{
+ struct task_struct* task = get_proc_task(inode);
+ /* If get_proc_task failed, it means the task is dead, which
+ * is fine, since a subsequent read will return
+ * immediately. */
+ if (task && !thread_group_leader(task))
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t proc_tgid_exithand_read(struct file * file,
+ char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct* task = NULL;
+ wait_queue_entry_t wait;
+ ssize_t res = 0;
+ bool locked = false;
+
+ for (;;) {
+ /* Retrieve the task from the struct pid each time
+ * through the loop in case the exact struct task
+ * changes underneath us (e.g., if in exec.c, the
+ * execing process kills the group leader and starts
+ * using its PID). The struct signal should be the
+ * same though even in this case.
+ */
+ task = get_proc_task(file_inode(file));
+ res = 0;
+ if (!task)
+ goto out; /* No task? Must have died. */
+
+ BUG_ON(!thread_group_leader(task));
+
+ /* Synchronizes with exit.c machinery. */
+ read_lock(&tasklist_lock);
+ locked = true;
+
+ res = 0;
+ if (task->exit_state)
+ goto out;
+
+ res = -EAGAIN;
+ if (file->f_flags & O_NONBLOCK)
+ goto out;
+
+ /* Tell exit.c to go to the trouble of waking our
+ * runqueue when this process gets around to
+ * exiting. */
+ task->signal->exithand_is_interested = true;
+
+ /* Even if the task identity changes, task->signal
+ * should be invariant across the wait, making it safe
+ * to go remove our wait record from the wait queue
+ * after we come back from schedule. */
+
+ init_waitqueue_entry(&wait, current);
+ add_wait_queue(&wait_exithand, &wait);
+
+ read_unlock(&tasklist_lock);
+ locked = false;
+
+ put_task_struct(task);
+ task = NULL;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&wait_exithand, &wait);
+
+ res = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto out;
+ }
+out:
+ if (locked)
+ read_unlock(&tasklist_lock);
+ if (task)
+ put_task_struct(task);
+ return res;
+}
+
+static __poll_t proc_tgid_exithand_poll(struct file *file, poll_table *wait)
+{
+ __poll_t mask = 0;
+ struct task_struct* task = get_proc_task(file_inode(file));
+ if (!task) {
+ mask |= POLLIN;
+ } else if (READ_ONCE(task->exit_state)) {
+ mask |= POLLIN;
+ } else {
+ read_lock(&tasklist_lock);
+ task->signal->exithand_is_interested = true;
+ read_unlock(&tasklist_lock);
+ poll_wait(file, &wait_exithand, wait);
+ }
+ return mask;
+}
+
+const struct file_operations proc_tgid_exithand_operations = {
+ .open = proc_tgid_exithand_open,
+ .read = proc_tgid_exithand_read,
+ .poll = proc_tgid_exithand_poll,
+};
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..1009d20475bc 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -304,3 +304,7 @@ extern unsigned long task_statm(struct mm_struct *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
+
+/* exithand.c */
+
+extern const struct file_operations proc_tgid_exithand_operations;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d10a50e..44131cb6c7f4 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -74,6 +74,10 @@ struct multiprocess_signals {
struct hlist_node node;
};
+/* Need to stick the waitq for exithand outside process structures in
+ * case a process disappears across a poll. */
+extern wait_queue_head_t wait_exithand;
+
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
@@ -87,6 +91,9 @@ struct signal_struct {
int nr_threads;
struct list_head thread_head;
+ /* Protected with tasklist_lock. */
+ bool exithand_is_interested;
+
wait_queue_head_t wait_chldexit; /* for wait4() */
/* current thread group signal load-balancing target: */
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..44a4e3796f8b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1485,6 +1485,8 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, 1, p);
+ if (p->signal->exithand_is_interested)
+ __wake_up_sync(&wait_exithand, TASK_INTERRUPTIBLE, 0);
}
static long do_wait(struct wait_opts *wo)
diff --git a/kernel/signal.c b/kernel/signal.c
index 17565240b1c6..e156d48da70a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -454,6 +454,9 @@ void flush_sigqueue(struct sigpending *queue)
}
}
+wait_queue_head_t wait_exithand =
+ __WAIT_QUEUE_HEAD_INITIALIZER(wait_exithand);
+
/*
* Flush all pending signals for this kthread.
*/
--
2.19.1.568.g152ad8e336-goog
This patch adds a new file under /proc/pid, /proc/pid/exithand.
Attempting to read from an exithand file will block until the
corresponding process exits, at which point the read will successfully
complete with EOF. The file descriptor supports both blocking
operations and poll(2). It's intended to be a minimal interface for
allowing a program to wait for the exit of a process that is not one
of its children.
Why might we want this interface? Android's lmkd kills processes in
order to free memory in response to various memory pressure
signals. It's desirable to wait until a killed process actually exits
before moving on (if needed) to killing the next process. Since the
processes that lmkd kills are not lmkd's children, lmkd currently
lacks a way to wait for a process to actually die after being sent
SIGKILL; today, lmkd resorts to polling the proc filesystem pid
entry. This interface allows lmkd to give up polling and instead block
and wait for process death.
Signed-off-by: Daniel Colascione <[email protected]>
---
fs/proc/Makefile | 1 +
fs/proc/base.c | 1 +
fs/proc/exithand.c | 128 +++++++++++++++++++++++++++++++++++
fs/proc/internal.h | 4 ++
include/linux/sched/signal.h | 8 +++
kernel/exit.c | 2 +
kernel/signal.c | 3 +
7 files changed, 147 insertions(+)
create mode 100644 fs/proc/exithand.c
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e80510..21322280a2c1 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,7 @@ proc-y += softirqs.o
proc-y += namespaces.o
proc-y += self.o
proc-y += thread_self.o
+proc-y += exithand.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf260d..31bc6bbb6dc4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3006,6 +3006,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+ REG("exithand", S_IRUGO, proc_tgid_exithand_operations),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/exithand.c b/fs/proc/exithand.c
new file mode 100644
index 000000000000..c2ac31c52ff9
--- /dev/null
+++ b/fs/proc/exithand.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Synchronous exit notification of non-child processes
+ *
+ * Simple file descriptor /proc/pid/exithand. Read blocks (and poll
+ * reports non-readable) until process either dies or becomes
+ * a zombie.
+ */
+#include <linux/printk.h>
+#include <linux/sched/signal.h>
+#include <linux/poll.h>
+#include "internal.h"
+
+static int proc_tgid_exithand_open(struct inode *inode, struct file *file)
+{
+ /* If get_proc_task fails, it means the task is dead, which is
+ * fine, since a subsequent read will return immediately and
+ * indicate that, yes, the indicated process is dead.
+ */
+ int res = 0;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (task) {
+ if (!thread_group_leader(task))
+ res = -EINVAL;
+ put_task_struct(task);
+ }
+ return res;
+}
+
+static ssize_t proc_tgid_exithand_read(struct file *file,
+ char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task = NULL;
+ wait_queue_entry_t wait;
+ ssize_t res = 0;
+ bool locked = false;
+
+ for (;;) {
+ /* Retrieve the task from the struct pid each time
+ * through the loop in case the exact struct task
+ * changes underneath us (e.g., if in exec.c, the
+ * execing process kills the group leader and starts
+ * using its PID). The struct signal should be the
+ * same though even in this case.
+ */
+ task = get_proc_task(file_inode(file));
+ res = 0;
+ if (!task)
+ goto out; /* No task? Must have died. */
+
+ BUG_ON(!thread_group_leader(task));
+
+ /* Synchronizes with exit.c machinery. */
+ read_lock(&tasklist_lock);
+ locked = true;
+
+ res = 0;
+ if (task->exit_state)
+ goto out;
+
+ res = -EAGAIN;
+ if (file->f_flags & O_NONBLOCK)
+ goto out;
+
+ /* Tell exit.c to go to the trouble of waking our
+ * runqueue when this process gets around to
+ * exiting.
+ */
+ task->signal->exithand_is_interested = true;
+
+ /* Even if the task identity changes, task->signal
+ * should be invariant across the wait, making it safe
+ * to go remove our wait record from the wait queue
+ * after we come back from schedule.
+ */
+
+ init_waitqueue_entry(&wait, current);
+ add_wait_queue(&wait_exithand, &wait);
+
+ read_unlock(&tasklist_lock);
+ locked = false;
+
+ put_task_struct(task);
+ task = NULL;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&wait_exithand, &wait);
+
+ res = -ERESTARTSYS;
+ if (signal_pending(current))
+ goto out;
+ }
+out:
+ if (locked)
+ read_unlock(&tasklist_lock);
+ if (task)
+ put_task_struct(task);
+ return res;
+}
+
+static __poll_t proc_tgid_exithand_poll(struct file *file, poll_table *wait)
+{
+ __poll_t mask = 0;
+ struct task_struct *task = get_proc_task(file_inode(file));
+
+ if (!task) {
+ mask |= POLLIN;
+ } else if (READ_ONCE(task->exit_state)) {
+ mask |= POLLIN;
+ } else {
+ read_lock(&tasklist_lock);
+ task->signal->exithand_is_interested = true;
+ read_unlock(&tasklist_lock);
+ poll_wait(file, &wait_exithand, wait);
+ }
+ if (task)
+ put_task_struct(task);
+ return mask;
+}
+
+const struct file_operations proc_tgid_exithand_operations = {
+ .open = proc_tgid_exithand_open,
+ .read = proc_tgid_exithand_read,
+ .poll = proc_tgid_exithand_poll,
+};
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..1009d20475bc 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -304,3 +304,7 @@ extern unsigned long task_statm(struct mm_struct *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
+
+/* exithand.c */
+
+extern const struct file_operations proc_tgid_exithand_operations;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d10a50e..f44397055429 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -74,6 +74,11 @@ struct multiprocess_signals {
struct hlist_node node;
};
+/* Need to stick the waitq for exithand outside process structures in
+ * case a process disappears across a poll.
+ */
+extern wait_queue_head_t wait_exithand;
+
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
@@ -87,6 +92,9 @@ struct signal_struct {
int nr_threads;
struct list_head thread_head;
+ /* Protected with tasklist_lock. */
+ bool exithand_is_interested;
+
wait_queue_head_t wait_chldexit; /* for wait4() */
/* current thread group signal load-balancing target: */
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d21f35..44a4e3796f8b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1485,6 +1485,8 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, 1, p);
+ if (p->signal->exithand_is_interested)
+ __wake_up_sync(&wait_exithand, TASK_INTERRUPTIBLE, 0);
}
static long do_wait(struct wait_opts *wo)
diff --git a/kernel/signal.c b/kernel/signal.c
index 17565240b1c6..e156d48da70a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -454,6 +454,9 @@ void flush_sigqueue(struct sigpending *queue)
}
}
+wait_queue_head_t wait_exithand =
+ __WAIT_QUEUE_HEAD_INITIALIZER(wait_exithand);
+
/*
* Flush all pending signals for this kthread.
*/
--
2.19.1.568.g152ad8e336-goog
On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
>
> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> Attempting to read from an exithand file will block until the
> corresponding process exits, at which point the read will successfully
> complete with EOF. The file descriptor supports both blocking
> operations and poll(2). It's intended to be a minimal interface for
> allowing a program to wait for the exit of a process that is not one
> of its children.
>
> Why might we want this interface? Android's lmkd kills processes in
> order to free memory in response to various memory pressure
> signals. It's desirable to wait until a killed process actually exits
> before moving on (if needed) to killing the next process. Since the
> processes that lmkd kills are not lmkd's children, lmkd currently
> lacks a way to wait for a proces to actually die after being sent
> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
Any idea why it needs to wait and then send SIGKILL? Why not do
SIGKILL and look for errno == ESRCH in a loop with a delay.
> entry. This interface allow lmkd to give up polling and instead block
> and wait for process death.
Can we use ptrace(2) for the exit notifications? I am assuming you
already though about it but I'm curious what is the reason this is
better.
thanks,
-Joel
On Mon, Oct 29, 2018 at 12:45 PM Joel Fernandes <[email protected]> wrote:
>
> On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
> >
> > This patch adds a new file under /proc/pid, /proc/pid/exithand.
> > Attempting to read from an exithand file will block until the
> > corresponding process exits, at which point the read will successfully
> > complete with EOF. The file descriptor supports both blocking
> > operations and poll(2). It's intended to be a minimal interface for
> > allowing a program to wait for the exit of a process that is not one
> > of its children.
> >
> > Why might we want this interface? Android's lmkd kills processes in
> > order to free memory in response to various memory pressure
> > signals. It's desirable to wait until a killed process actually exits
> > before moving on (if needed) to killing the next process. Since the
> > processes that lmkd kills are not lmkd's children, lmkd currently
> > lacks a way to wait for a proces to actually die after being sent
> > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>
> Any idea why it needs to wait and then send SIGKILL? Why not do
> SIGKILL and look for errno == ESRCH in a loop with a delay.
>
Sorry I take that back, I see it needs to wait after sending the kill,
not before (duh). Anyway if the polling is ever rewritten, another way
could be to do kill(pid, 0) and then check for return of -1 and errno
== ESRCH; instead of looking at /proc/
Thanks for taking a look.
On Mon, Oct 29, 2018 at 7:45 PM, Joel Fernandes <[email protected]> wrote:
>
> On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
> >
> > This patch adds a new file under /proc/pid, /proc/pid/exithand.
> > Attempting to read from an exithand file will block until the
> > corresponding process exits, at which point the read will successfully
> > complete with EOF. The file descriptor supports both blocking
> > operations and poll(2). It's intended to be a minimal interface for
> > allowing a program to wait for the exit of a process that is not one
> > of its children.
> >
> > Why might we want this interface? Android's lmkd kills processes in
> > order to free memory in response to various memory pressure
> > signals. It's desirable to wait until a killed process actually exits
> > before moving on (if needed) to killing the next process. Since the
> > processes that lmkd kills are not lmkd's children, lmkd currently
> > lacks a way to wait for a proces to actually die after being sent
> > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>
> Any idea why it needs to wait and then send SIGKILL? Why not do
> SIGKILL and look for errno == ESRCH in a loop with a delay.
I want to get polling loops out of the system. Polling loops are bad
for wakeup attribution, bad for power, bad for priority inheritance,
and bad for latency. There's no right answer to the question "How long
should I wait before checking $CONDITION again?". If we can have an
explicit waitqueue interface to something, we should. Besides, PID
polling is vulnerable to PID reuse, whereas this mechanism (just like
anything based on struct pid) is immune to it.
> > entry. This interface allow lmkd to give up polling and instead block
> > and wait for process death.
>
> Can we use ptrace(2) for the exit notifications? I am assuming you
> already though about it but I'm curious what is the reason this is
> better.
Only one process can ptrace a given process at a time, so I don't like
ptrace as a mechanism for anything except debugging.
Relying on ptrace for exit notification would interfere with things
like debuggers and crash dump collection systems. Besides, ptrace can
do too much (like read and write process memory) and so requires very
strong privileges not necessary for this mechanism. Besides: ptrace's
interface is complicated and relies on repeated calls to various wait
functions, whereas the interface in this patch is simple enough to use
from the shell.
On Mon, Oct 29, 2018 at 1:01 PM Daniel Colascione <[email protected]> wrote:
>
> Thanks for taking a look.
>
> On Mon, Oct 29, 2018 at 7:45 PM, Joel Fernandes <[email protected]> wrote:
> >
> > On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
> > >
> > > This patch adds a new file under /proc/pid, /proc/pid/exithand.
> > > Attempting to read from an exithand file will block until the
> > > corresponding process exits, at which point the read will successfully
> > > complete with EOF. The file descriptor supports both blocking
> > > operations and poll(2). It's intended to be a minimal interface for
> > > allowing a program to wait for the exit of a process that is not one
> > > of its children.
> > >
> > > Why might we want this interface? Android's lmkd kills processes in
> > > order to free memory in response to various memory pressure
> > > signals. It's desirable to wait until a killed process actually exits
> > > before moving on (if needed) to killing the next process. Since the
> > > processes that lmkd kills are not lmkd's children, lmkd currently
> > > lacks a way to wait for a proces to actually die after being sent
> > > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> >
> > Any idea why it needs to wait and then send SIGKILL? Why not do
> > SIGKILL and look for errno == ESRCH in a loop with a delay.
>
> I want to get polling loops out of the system. Polling loops are bad
> for wakeup attribution, bad for power, bad for priority inheritance,
> and bad for latency. There's no right answer to the question "How long
> should I wait before checking $CONDITION again?". If we can have an
> explicit waitqueue interface to something, we should. Besides, PID
> polling is vulnerable to PID reuse, whereas this mechanism (just like
> anything based on struct pid) is immune to it.
The argument sounds Ok to me. I would also add more details in the commit
message about the alternate methods to do this (such as kill polling
or ptrace) and why they don't work well etc so no one asks any
questions. Like maybe under a "other ways to do this" section. A bit
of googling also showed a netlink way of doing it without polling
(though I didn't look into that much and wouldn't be surprised if it's
more complicated)
Also I guess when you send a patch, it'd be good to pass
"--cc-cmd='./scripts/get_maintainer.pl" to git-send-email so it
automatically CCs the maintainers who maintain this.
thanks,
- Joel
On Tue, Oct 30, 2018 at 3:06 AM, Joel Fernandes <[email protected]> wrote:
> On Mon, Oct 29, 2018 at 1:01 PM Daniel Colascione <[email protected]> wrote:
>>
>> Thanks for taking a look.
>>
>> On Mon, Oct 29, 2018 at 7:45 PM, Joel Fernandes <[email protected]> wrote:
>> >
>> > On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
>> > >
>> > > This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> > > Attempting to read from an exithand file will block until the
>> > > corresponding process exits, at which point the read will successfully
>> > > complete with EOF. The file descriptor supports both blocking
>> > > operations and poll(2). It's intended to be a minimal interface for
>> > > allowing a program to wait for the exit of a process that is not one
>> > > of its children.
>> > >
>> > > Why might we want this interface? Android's lmkd kills processes in
>> > > order to free memory in response to various memory pressure
>> > > signals. It's desirable to wait until a killed process actually exits
>> > > before moving on (if needed) to killing the next process. Since the
>> > > processes that lmkd kills are not lmkd's children, lmkd currently
>> > > lacks a way to wait for a proces to actually die after being sent
>> > > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>> >
>> > Any idea why it needs to wait and then send SIGKILL? Why not do
>> > SIGKILL and look for errno == ESRCH in a loop with a delay.
>>
>> I want to get polling loops out of the system. Polling loops are bad
>> for wakeup attribution, bad for power, bad for priority inheritance,
>> and bad for latency. There's no right answer to the question "How long
>> should I wait before checking $CONDITION again?". If we can have an
>> explicit waitqueue interface to something, we should. Besides, PID
>> polling is vulnerable to PID reuse, whereas this mechanism (just like
>> anything based on struct pid) is immune to it.
>
> The argument sounds Ok to me. I would also more details in the commit
> message about the alternate methods to do this (such as kill polling
> or ptrace) and why they don't work well etc so no one asks any
> questions. Like maybe under a "other ways to do this" section. A bit
> of googling also showed a netlink way of doing it without polling
> (though I don't look into that much and wouldn't be surprised if its
> more complicated)
Thanks for taking a look. I'll add to the commit message.
Re: netlink isn't enabled everywhere and is subject to lossy buffer
overruns, AIUI. You could also monitor process exit by setting up
ftrace and watching events, or by installing BPF that watched for
process exit and sent a perf event. :-) All of these interfaces feel
like abusing a "monitoring" API for controlling system operations, and
this kind of abuse tends to have ugly failure modes. I'm looking for
something a bit more explicit and robust.
>
> Also I guess when you send a patch, it'd be good to pass
> "--cc-cmd='./scripts/get_maintainer.pl" to git-send-email so it
> automatically CCs the maintainers who maintain this.
Thanks for the tip!
On Tue, Oct 30, 2018 at 08:59:25AM +0000, Daniel Colascione wrote:
> On Tue, Oct 30, 2018 at 3:06 AM, Joel Fernandes <[email protected]> wrote:
> > On Mon, Oct 29, 2018 at 1:01 PM Daniel Colascione <[email protected]> wrote:
> >>
> >> Thanks for taking a look.
> >>
> >> On Mon, Oct 29, 2018 at 7:45 PM, Joel Fernandes <[email protected]> wrote:
> >> >
> >> > On Mon, Oct 29, 2018 at 10:53 AM Daniel Colascione <[email protected]> wrote:
> >> > >
> >> > > This patch adds a new file under /proc/pid, /proc/pid/exithand.
> >> > > Attempting to read from an exithand file will block until the
> >> > > corresponding process exits, at which point the read will successfully
> >> > > complete with EOF. The file descriptor supports both blocking
> >> > > operations and poll(2). It's intended to be a minimal interface for
> >> > > allowing a program to wait for the exit of a process that is not one
> >> > > of its children.
> >> > >
> >> > > Why might we want this interface? Android's lmkd kills processes in
> >> > > order to free memory in response to various memory pressure
> >> > > signals. It's desirable to wait until a killed process actually exits
> >> > > before moving on (if needed) to killing the next process. Since the
> >> > > processes that lmkd kills are not lmkd's children, lmkd currently
> >> > > lacks a way to wait for a proces to actually die after being sent
> >> > > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> >> >
> >> > Any idea why it needs to wait and then send SIGKILL? Why not do
> >> > SIGKILL and look for errno == ESRCH in a loop with a delay.
> >>
> >> I want to get polling loops out of the system. Polling loops are bad
> >> for wakeup attribution, bad for power, bad for priority inheritance,
> >> and bad for latency. There's no right answer to the question "How long
> >> should I wait before checking $CONDITION again?". If we can have an
> >> explicit waitqueue interface to something, we should. Besides, PID
> >> polling is vulnerable to PID reuse, whereas this mechanism (just like
> >> anything based on struct pid) is immune to it.
> >
> > The argument sounds Ok to me. I would also more details in the commit
> > message about the alternate methods to do this (such as kill polling
> > or ptrace) and why they don't work well etc so no one asks any
> > questions. Like maybe under a "other ways to do this" section. A bit
> > of googling also showed a netlink way of doing it without polling
> > (though I don't look into that much and wouldn't be surprised if its
> > more complicated)
>
> Thanks for taking a look. I'll add to the commit message.
>
> Re: netlink isn't enabled everywhere and is subject to lossy buffy
> overruns, AIUI. You could also monitor process exit by setting up
> ftrace and watching events, or by installing BPF that watched for
> process exit and sent a perf event. :-) All of these interfaces feel
> like abusing a "monitoring" API for controlling system operations, and
> this kind of abuse tends to have ugly failure modes. I'm looking for
> something a bit more explicit and robust.
Sounds good to me!
- Joel
From: Daniel Colascione
> Sent: 29 October 2018 17:53
>
> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> Attempting to read from an exithand file will block until the
> corresponding process exits, at which point the read will successfully
> complete with EOF. The file descriptor supports both blocking
> operations and poll(2). It's intended to be a minimal interface for
> allowing a program to wait for the exit of a process that is not one
> of its children.
Why do you need an extra file?
It ought to be possible to use poll() to wait for POLLERR having set
'events' to zero on any of the nodes in /proc/pid - or even on
the directory itself.
Indeed, to avoid killing the wrong process you need to have opened
some node of /proc/pid/* (maybe cmdline) before sending the kill
signal.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Wed, Oct 31, 2018 at 12:27 PM, David Laight <[email protected]> wrote:
> From: Daniel Colascione
>> Sent: 29 October 2018 17:53
>>
>> This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> Attempting to read from an exithand file will block until the
>> corresponding process exits, at which point the read will successfully
>> complete with EOF. The file descriptor supports both blocking
>> operations and poll(2). It's intended to be a minimal interface for
>> allowing a program to wait for the exit of a process that is not one
>> of its children.
>
> Why do you need an extra file?
Because no current file suffices.
> It ought to be possible to use poll() to wait for POLLERR having set
> 'events' to zero on any of the nodes in /proc/pid - or even on
> the directory itself.
That doesn't actually work today. And waiting on a directory with
POLLERR would be very weird, since directories in general don't do
things like blocking reads or poll support. A separate file with
self-contained, well-defined semantics is cleaner.
> Indeed, to avoid killing the wrong process you need to have opened
> some node of /proc/pid/* (maybe cmdline) before sending the kill
> signal.
The kernel really needs better documentation of the semantics of
procfs file descriptors. You're not the only person to think,
mistakenly, that keeping a reference to a /proc/$PID/something FD
reserves $PID and prevents it being used for another process. Procfs
FDs do no such thing. kill(2) is unsafe whether or not
/proc/pid/cmdline or any other /proc file is open.
From: Daniel Colascione
> Sent: 31 October 2018 12:56
> On Wed, Oct 31, 2018 at 12:27 PM, David Laight <[email protected]> wrote:
> > From: Daniel Colascione
> >> Sent: 29 October 2018 17:53
> >>
> >> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> >> Attempting to read from an exithand file will block until the
> >> corresponding process exits, at which point the read will successfully
> >> complete with EOF. The file descriptor supports both blocking
> >> operations and poll(2). It's intended to be a minimal interface for
> >> allowing a program to wait for the exit of a process that is not one
> >> of its children.
> >
> > Why do you need an extra file?
>
> Because no current file suffices.
That doesn't stop you making something work on any/all of the existing files.
> > It ought to be possible to use poll() to wait for POLLERR having set
> > 'events' to zero on any of the nodes in /proc/pid - or even on
> > the directory itself.
>
> That doesn't actually work today. And waiting on a directory with
> POLLERR would be very weird, since directories in general don't do
> things like blocking reads or poll support. A separate file with
> self-contained, well-defined semantics is cleaner.
Device drivers will (well ought to) return POLLERR when a device
is removed.
Making procfs behave the same way wouldn't be too stupid.
> > Indeed, to avoid killing the wrong process you need to have opened
> > some node of /proc/pid/* (maybe cmdline) before sending the kill
> > signal.
>
> The kernel really needs better documentation of the semantics of
> procfs file descriptors. You're not the only person to think,
> mistakenly, that keeping a reference to a /proc/$PID/something FD
> reserves $PID and prevents it being used for another process. Procfs
> FDs do no such thing. kill(2) is unsafe whether or not
> /proc/pid/cmdline or any other /proc file is open.
Interesting.
Linux 'fixed' the problem of pid reuse in the kernel by adding (IIRC)
'struct pid' that reference counts the pid stopping reuse.
But since the pids are still allocated sequentially userspace can
still reference a pid that is freed and immediately reused.
I'd have thought that procfs nodes held a reference count on the 'struct pid'.
There's probably no reason why it shouldn't.
Annoyingly non-GPL drivers can't release references to 'struct pid' so
are very constrained about which processes they can signal.
I also managed to use a stale 'struct pid' and kill the wrong process
- much more likely than the pid number being reused.
If you look at the NetBSD pid allocator you'll see that it uses the
low pid bits to index an array and the high bits as a sequence number.
The array slots are also reused LIFO, so you always need a significant
number of pid allocate/free before a number is reused.
The non-sequential allocation also makes it significantly more difficult
to predict when a pid will be reused.
The table size is doubled when it gets nearly full.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Wed, Oct 31, 2018 at 7:25 AM, David Laight <[email protected]> wrote:
> From: Daniel Colascione
>> Sent: 31 October 2018 12:56
>> On Wed, Oct 31, 2018 at 12:27 PM, David Laight <[email protected]> wrote:
>> > From: Daniel Colascione
>> >> Sent: 29 October 2018 17:53
>> >>
>> >> This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> >> Attempting to read from an exithand file will block until the
>> >> corresponding process exits, at which point the read will successfully
>> >> complete with EOF. The file descriptor supports both blocking
>> >> operations and poll(2). It's intended to be a minimal interface for
>> >> allowing a program to wait for the exit of a process that is not one
>> >> of its children.
>> >
>> > Why do you need an extra file?
>>
>> Because no current file suffices.
>
> That doesn't stop you making something work on any/all of the existing files.
>
>> > It ought to be possible to use poll() to wait for POLLERR having set
>> > 'events' to zero on any of the nodes in /proc/pid - or even on
>> > the directory itself.
>>
>> That doesn't actually work today. And waiting on a directory with
>> POLLERR would be very weird, since directories in general don't do
>> things like blocking reads or poll support. A separate file with
>> self-contained, well-defined semantics is cleaner.
>
> Device drivers will (well ought to) return POLLERR when a device
> is removed.
> Making procfs behave the same way wouldn't be too stupid.
>
>> > Indeed, to avoid killing the wrong process you need to have opened
>> > some node of /proc/pid/* (maybe cmdline) before sending the kill
>> > signal.
>>
>> The kernel really needs better documentation of the semantics of
>> procfs file descriptors. You're not the only person to think,
>> mistakenly, that keeping a reference to a /proc/$PID/something FD
>> reserves $PID and prevents it being used for another process. Procfs
>> FDs do no such thing. kill(2) is unsafe whether or not
>> /proc/pid/cmdline or any other /proc file is open.
>
> Interesting.
> Linux 'fixed' the problem of pid reuse in the kernel by adding (IIRC)
> 'struct pid' that reference counts the pid stopping reuse.
This is incorrect if you mean numeric pids. See the end of these
comments in include/linux/pid.h . A pid value can be reused, it just
works Ok because it causes a new struct pid allocation. That doesn't
mean there isn't a numeric reuse. There's also nowhere in pid_alloc()
where we prevent the numeric reuse AFAICT.
/*
* What is struct pid?
*
* A struct pid is the kernel's internal notion of a process identifier.
* It refers to individual tasks, process groups, and sessions. While
* there are processes attached to it the struct pid lives in a hash
* table, so it and then the processes that it refers to can be found
* quickly from the numeric pid value. The attached processes may be
* quickly accessed by following pointers from struct pid.
*
* Storing pid_t values in the kernel and referring to them later has a
* problem. The process originally with that pid may have exited and the
* pid allocator wrapped, and another process could have come along
* and been assigned that pid.
*
* Referring to user space processes by holding a reference to struct
* task_struct has a problem. When the user space process exits
* the now useless task_struct is still kept. A task_struct plus a
* stack consumes around 10K of low kernel memory. More precisely
* this is THREAD_SIZE + sizeof(struct task_struct). By comparison
* a struct pid is about 64 bytes.
*
* Holding a reference to struct pid solves both of these problems.
* It is small so holding a reference does not consume a lot of
* resources, and since a new struct pid is allocated when the numeric pid
* value is reused (when pids wrap around) we don't mistakenly refer to new
* processes.
*/
On Wed, Oct 31, 2018 at 7:41 AM, Joel Fernandes <[email protected]> wrote:
[...]
>>> > Indeed, to avoid killing the wrong process you need to have opened
>>> > some node of /proc/pid/* (maybe cmdline) before sending the kill
>>> > signal.
>>>
>>> The kernel really needs better documentation of the semantics of
>>> procfs file descriptors. You're not the only person to think,
>>> mistakenly, that keeping a reference to a /proc/$PID/something FD
>>> reserves $PID and prevents it being used for another process. Procfs
>>> FDs do no such thing. kill(2) is unsafe whether or not
>>> /proc/pid/cmdline or any other /proc file is open.
>>
>> Interesting.
>> Linux 'fixed' the problem of pid reuse in the kernel by adding (IIRC)
>> 'struct pid' that reference counts the pid stopping reuse.
>
> This is incorrect if you mean numeric pids. See the end of these
> comments in include/linux/pid.h . A pid value can be reused, it just
> works Ok because it causes a new struct pid allocation. That doesn't
> mean there isn't a numeric reuse. There's also no where in pid_alloc()
> where we prevent the numeric reuse AFAICT.
Bleh, I mean alloc_pid().
On Wed, Oct 31, 2018 at 2:25 PM, David Laight <[email protected]> wrote:
> From: Daniel Colascione
>> Sent: 31 October 2018 12:56
>> On Wed, Oct 31, 2018 at 12:27 PM, David Laight <[email protected]> wrote:
>> > From: Daniel Colascione
>> >> Sent: 29 October 2018 17:53
>> >>
>> >> This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> >> Attempting to read from an exithand file will block until the
>> >> corresponding process exits, at which point the read will successfully
>> >> complete with EOF. The file descriptor supports both blocking
>> >> operations and poll(2). It's intended to be a minimal interface for
>> >> allowing a program to wait for the exit of a process that is not one
>> >> of its children.
>> >
>> > Why do you need an extra file?
>>
>> Because no current file suffices.
>
> That doesn't stop you making something work on any/all of the existing files.
Why overload?
>> > It ought to be possible to use poll() to wait for POLLERR having set
>> > 'events' to zero on any of the nodes in /proc/pid - or even on
>> > the directory itself.
>>
>> That doesn't actually work today. And waiting on a directory with
>> POLLERR would be very weird, since directories in general don't do
>> things like blocking reads or poll support. A separate file with
>> self-contained, well-defined semantics is cleaner.
>
> Device drivers will (well ought to) return POLLERR when a device
> is removed.
> Making procfs behave the same way wouldn't be too stupid.
That's overkill. You'd need to add poll support to files throughout
/proc/pid, and I don't think doing so would add any new capabilities
over keeping the changes localized to one new place.
>> > Indeed, to avoid killing the wrong process you need to have opened
>> > some node of /proc/pid/* (maybe cmdline) before sending the kill
>> > signal.
>>
>> The kernel really needs better documentation of the semantics of
>> procfs file descriptors. You're not the only person to think,
>> mistakenly, that keeping a reference to a /proc/$PID/something FD
>> reserves $PID and prevents it being used for another process. Procfs
>> FDs do no such thing. kill(2) is unsafe whether or not
>> /proc/pid/cmdline or any other /proc file is open.
>
> Interesting.
> Linux 'fixed' the problem of pid reuse in the kernel by adding (IIRC)
> 'struct pid' that reference counts the pid stopping reuse.
Struct pid doesn't stop PID reuse. It just allows the kernel to
distinguish between a stale and current reference to a given PID. In a
sense, the "struct pid*" is the "real" name of a process and the
numeric PID is just a convenient way to find a struct pid.
> But since the pids are still allocated sequentially userspace can
> still reference a pid that is freed and immediately reused.
> I'd have thought that procfs nodes held a reference count on the 'struct pid'.
> There's probably no reason why it shouldn't.
>
> Annoyingly non-GPL drivers can't release references to 'struct pid' so
> are very constrained about which processes they can signal.
If I had my way, I'd s/EXPORT_SYMBOL_GPL/EXPORT_SYMBOL/ in pid.c.
These routines seem generally useful. As an alternative, I suppose
drivers could just use the new /proc/pid/kill interface, with
sufficient contortions, just as userspace can.
> I also managed to use a stale 'struct pid' and kill the wrong process
> - much more likely that the pid number being reused.
That shouldn't be possible, unless by "stale 'struct pid'" you mean a
struct pid* referring to a struct pid that's been released and reused
for some other object (possibly a different struct pid instance), and
that's UB.
> If you look at the NetBSD pid allocator you'll see that it uses the
> low pid bits to index an array and the high bits as a sequence number.
> The array slots are also reused LIFO, so you always need a significant
> number of pid allocate/free before a number is reused.
> The non-sequential allocation also makes it significantly more difficult
> to predict when a pid will be reused.
> The table size is doubled when it gets nearly full.
NetBSD is still just papering over the problem. The real issue is that
the whole PID-based process API model is unsafe, and a clever PID
allocator doesn't address the fundamental race condition. As long as
PID reuse is possible at all, there's a potential race condition, and
correctness depends on hope. The only way you could address the PID
race problem while not changing the Unix process API is by making
pid_t ridiculously wide so that it never wraps around.
+ linux-api
On Mon, Oct 29, 2018 at 5:53 PM, Daniel Colascione <[email protected]> wrote:
> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> Attempting to read from an exithand file will block until the
> corresponding process exits, at which point the read will successfully
> complete with EOF. The file descriptor supports both blocking
> operations and poll(2). It's intended to be a minimal interface for
> allowing a program to wait for the exit of a process that is not one
> of its children.
>
> Why might we want this interface? Android's lmkd kills processes in
> order to free memory in response to various memory pressure
> signals. It's desirable to wait until a killed process actually exits
> before moving on (if needed) to killing the next process. Since the
> processes that lmkd kills are not lmkd's children, lmkd currently
> lacks a way to wait for a proces to actually die after being sent
> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> entry. This interface allow lmkd to give up polling and instead block
> and wait for process death.
>
> Signed-off-by: Daniel Colascione <[email protected]>
> ---
> fs/proc/Makefile | 1 +
> fs/proc/base.c | 1 +
> fs/proc/exithand.c | 117 +++++++++++++++++++++++++++++++++++
> fs/proc/internal.h | 4 ++
> include/linux/sched/signal.h | 7 +++
> kernel/exit.c | 2 +
> kernel/signal.c | 3 +
> 7 files changed, 135 insertions(+)
> create mode 100644 fs/proc/exithand.c
>
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index ead487e80510..21322280a2c1 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -27,6 +27,7 @@ proc-y += softirqs.o
> proc-y += namespaces.o
> proc-y += self.o
> proc-y += thread_self.o
> +proc-y += exithand.o
> proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
> proc-$(CONFIG_NET) += proc_net.o
> proc-$(CONFIG_PROC_KCORE) += kcore.o
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 7e9f07bf260d..31bc6bbb6dc4 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3006,6 +3006,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> #ifdef CONFIG_LIVEPATCH
> ONE("patch_state", S_IRUSR, proc_pid_patch_state),
> #endif
> + REG("exithand", S_IRUGO, proc_tgid_exithand_operations),
> };
>
> static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
> diff --git a/fs/proc/exithand.c b/fs/proc/exithand.c
> new file mode 100644
> index 000000000000..358b08da6a08
> --- /dev/null
> +++ b/fs/proc/exithand.c
> @@ -0,0 +1,117 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Synchronous exit notification of non-child processes
> + *
> + * Simple file descriptor /proc/pid/exithand. Read blocks (and poll
> + * reports non-readable) until process either dies or becomes
> + * a zombie.
> + */
> +#include <linux/printk.h>
> +#include <linux/sched/signal.h>
> +#include <linux/poll.h>
> +#include "internal.h"
> +
> +static int proc_tgid_exithand_open(struct inode *inode, struct file *file)
> +{
> + struct task_struct* task = get_proc_task(inode);
> + /* If get_proc_task failed, it means the task is dead, which
> + * is fine, since a subsequent read will return
> + * immediately. */
> + if (task && !thread_group_leader(task))
> + return -EINVAL;
> + return 0;
> +}
> +
> +static ssize_t proc_tgid_exithand_read(struct file * file,
> + char __user * buf,
> + size_t count, loff_t *ppos)
> +{
> + struct task_struct* task = NULL;
> + wait_queue_entry_t wait;
> + ssize_t res = 0;
> + bool locked = false;
> +
> + for (;;) {
> + /* Retrieve the task from the struct pid each time
> + * through the loop in case the exact struct task
> + * changes underneath us (e.g., if in exec.c, the
> + * execing process kills the group leader and starts
> + * using its PID). The struct signal should be the
> + * same though even in this case.
> + */
> + task = get_proc_task(file_inode(file));
> + res = 0;
> + if (!task)
> + goto out; /* No task? Must have died. */
> +
> + BUG_ON(!thread_group_leader(task));
> +
> + /* Synchronizes with exit.c machinery. */
> + read_lock(&tasklist_lock);
> + locked = true;
> +
> + res = 0;
> + if (task->exit_state)
> + goto out;
> +
> + res = -EAGAIN;
> + if (file->f_flags & O_NONBLOCK)
> + goto out;
> +
> + /* Tell exit.c to go to the trouble of waking our
> + * runqueue when this process gets around to
> + * exiting. */
> + task->signal->exithand_is_interested = true;
> +
> + /* Even if the task identity changes, task->signal
> + * should be invariant across the wait, making it safe
> + * to go remove our wait record from the wait queue
> + * after we come back from schedule. */
> +
> + init_waitqueue_entry(&wait, current);
> + add_wait_queue(&wait_exithand, &wait);
> +
> + read_unlock(&tasklist_lock);
> + locked = false;
> +
> + put_task_struct(task);
> + task = NULL;
> +
> + set_current_state(TASK_INTERRUPTIBLE);
> + schedule();
> + set_current_state(TASK_RUNNING);
> + remove_wait_queue(&wait_exithand, &wait);
> +
> + res = -ERESTARTSYS;
> + if (signal_pending(current))
> + goto out;
> + }
> +out:
> + if (locked)
> + read_unlock(&tasklist_lock);
> + if (task)
> + put_task_struct(task);
> + return res;
> +}
> +
> +static __poll_t proc_tgid_exithand_poll(struct file *file, poll_table *wait)
> +{
> + __poll_t mask = 0;
> + struct task_struct* task = get_proc_task(file_inode(file));
> + if (!task) {
> + mask |= POLLIN;
> + } else if (READ_ONCE(task->exit_state)) {
> + mask |= POLLIN;
> + } else {
> + read_lock(&tasklist_lock);
> + task->signal->exithand_is_interested = true;
> + read_unlock(&tasklist_lock);
> + poll_wait(file, &wait_exithand, wait);
> + }
> + return mask;
> +}
> +
> +const struct file_operations proc_tgid_exithand_operations = {
> + .open = proc_tgid_exithand_open,
> + .read = proc_tgid_exithand_read,
> + .poll = proc_tgid_exithand_poll,
> +};
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 5185d7f6a51e..1009d20475bc 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -304,3 +304,7 @@ extern unsigned long task_statm(struct mm_struct *,
> unsigned long *, unsigned long *,
> unsigned long *, unsigned long *);
> extern void task_mem(struct seq_file *, struct mm_struct *);
> +
> +/* exithand.c */
> +
> +extern const struct file_operations proc_tgid_exithand_operations;
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 13789d10a50e..44131cb6c7f4 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -74,6 +74,10 @@ struct multiprocess_signals {
> struct hlist_node node;
> };
>
> +/* Need to stick the waitq for exithand outside process structures in
> + * case a process disappears across a poll. */
> +extern wait_queue_head_t wait_exithand;
> +
> /*
> * NOTE! "signal_struct" does not have its own
> * locking, because a shared signal_struct always
> @@ -87,6 +91,9 @@ struct signal_struct {
> int nr_threads;
> struct list_head thread_head;
>
> + /* Protected with tasklist_lock. */
> + bool exithand_is_interested;
> +
> wait_queue_head_t wait_chldexit; /* for wait4() */
>
> /* current thread group signal load-balancing target: */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 0e21e6d21f35..44a4e3796f8b 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1485,6 +1485,8 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
> {
> __wake_up_sync_key(&parent->signal->wait_chldexit,
> TASK_INTERRUPTIBLE, 1, p);
> + if (p->signal->exithand_is_interested)
> + __wake_up_sync(&wait_exithand, TASK_INTERRUPTIBLE, 0);
> }
>
> static long do_wait(struct wait_opts *wo)
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 17565240b1c6..e156d48da70a 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -454,6 +454,9 @@ void flush_sigqueue(struct sigpending *queue)
> }
> }
>
> +wait_queue_head_t wait_exithand =
> + __WAIT_QUEUE_HEAD_INITIALIZER(wait_exithand);
> +
> /*
> * Flush all pending signals for this kthread.
> */
> --
> 2.19.1.568.g152ad8e336-goog
>
On Wed, Oct 31, 2018 at 5:25 PM, Andy Lutomirski <[email protected]> wrote:
> I had an old patch to do much the same thing:
It's a perennial idea. :-)
> https://lore.kernel.org/patchwork/patch/345098/
>
> Can you comment as to how your API compares to my old patch?
Sure. Basically, my approach is sort-of eventfd-esque, whereas your
approach involves adding a very unusual operation (poll support) to a
type of file (a directory) that normally doesn't support it. My
approach feels a bit more "conventional" than poll on a dfd.
Additionally, my approach is usable from the shell. In your model,
poll(2) returning *is* the notification, whereas in my approach, the
canonical notification is read() yielding EOF, with poll(2) acting
like a wakeup hint, just like for eventfd. (You can set O_NONBLOCK on
the exithand FD just like you would any other FD.)
The use of read() for notification of exit also allows for a simple
extension in which we return a siginfo_t with exit information to the
waiter, without changing the API model. My initial patch doesn't
include this feature because I wanted to keep the initial version as
simple as possible.
> You’re using
> some fairly gnarly global synchronization
The global synchronization only kicks in for a particular process exit
if somebody has used an exithand FD to wait on that process. (Or more
precisely, that process's struct signal.) Since most process exits
don't require global synchronization, I don't think the global
waitqueue for exithand is a big problem, but if it is, there are
options for fixing it.
> , and that seems unnecessary
It is necessary, and I don't see how your patch is correct. In your
proc_task_base_poll, you call poll_wait() with &task->detach_wqh. What
prevents that waitqueue disappearing (and the poll table waitqueue
pointer dangling) immediately after proc_task_base_poll returns? The
proc_inode maintains a reference to a struct pid, not a task_struct,
but your waitqueue lives in task_struct.
The waitqueue living in task_struct is also wrong in the case that a
multithreaded program execs from a non-main thread; in this case (if
I'm reading the code in exec.c right) we destroy the old main thread
task_struct and have the caller-of-exec's task_struct adopt the old
main thread's struct pid. That is, identity-continuity of struct task
is not the same as identity-continuity of the logical thread group.
On 2018-10-29, Daniel Colascione <[email protected]> wrote:
> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> Attempting to read from an exithand file will block until the
> corresponding process exits, at which point the read will successfully
> complete with EOF. The file descriptor supports both blocking
> operations and poll(2). It's intended to be a minimal interface for
> allowing a program to wait for the exit of a process that is not one
> of its children.
>
> Why might we want this interface? Android's lmkd kills processes in
> order to free memory in response to various memory pressure
> signals. It's desirable to wait until a killed process actually exits
> before moving on (if needed) to killing the next process. Since the
> processes that lmkd kills are not lmkd's children, lmkd currently
> lacks a way to wait for a process to actually die after being sent
> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> entry. This interface allow lmkd to give up polling and instead block
> and wait for process death.
I agree with the need for this interface (with a few caveats), but there
are a few points I'd like to make:
* I don't think that making a new procfile is necessary. When you open
/proc/$pid you already have a handle for the underlying process, and
you can already poll to check whether the process has died (fstatat
fails for instance). What if we just used an inotify event to tell
userspace that the process has died -- to avoid userspace doing a
poll loop?
* There is a fairly old interface called the proc_connector which gives
you global fork+exec+exit events (similar to kevents from FreeBSD
though much less full-featured). I was working on some patches to
extend proc_connector so that it could be used inside containers as
well as unprivileged users. This would be another way we could
implement this.
I'm really not a huge fan of the "blocking read" semantic (though if we
have to have it, can we at least provide as much information as you get
from proc_connector -- such as the exit status?). Also maybe we should
integrate this into the exit machinery instead of this loop...
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On 2018-11-01, Aleksa Sarai <[email protected]> wrote:
> On 2018-10-29, Daniel Colascione <[email protected]> wrote:
> > This patch adds a new file under /proc/pid, /proc/pid/exithand.
> > Attempting to read from an exithand file will block until the
> > corresponding process exits, at which point the read will successfully
> > complete with EOF. The file descriptor supports both blocking
> > operations and poll(2). It's intended to be a minimal interface for
> > allowing a program to wait for the exit of a process that is not one
> > of its children.
> >
> > Why might we want this interface? Android's lmkd kills processes in
> > order to free memory in response to various memory pressure
> > signals. It's desirable to wait until a killed process actually exits
> > before moving on (if needed) to killing the next process. Since the
> > processes that lmkd kills are not lmkd's children, lmkd currently
> > lacks a way to wait for a process to actually die after being sent
> > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> > entry. This interface allow lmkd to give up polling and instead block
> > and wait for process death.
>
> I agree with the need for this interface (with a few caveats), but there
> are a few points I'd like to make:
>
> * I don't think that making a new procfile is necessary. When you open
> /proc/$pid you already have a handle for the underlying process, and
> you can already poll to check whether the process has died (fstatat
> fails for instance). What if we just used an inotify event to tell
> userspace that the process has died -- to avoid userspace doing a
> poll loop?
>
> * There is a fairly old interface called the proc_connector which gives
> you global fork+exec+exit events (similar to kevents from FreeBSD
> though much less full-featured). I was working on some patches to
> extend proc_connector so that it could be used inside containers as
> well as unprivileged users. This would be another way we could
> implement this.
>
> I'm really not a huge fan of the "blocking read" semantic (though if we
> have to have it, can we at least provide as much information as you get
> from proc_connector -- such as the exit status?). Also maybe we should
> integrate this into the exit machinery instead of this loop...
In addition, given that you've posted two patches in the similar vein
but as separate patchsets -- would you mind re-sending them as a single
patchset (with all the relevant folks added to Cc)?
If the idea is to extend /proc/$pid to allow for various
fd-as-process-handle operations (which I agree with in principle), then
they should be a single patchset. I'm also a bit cautious about how
many procfiles the eventual goal is to add.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On November 1, 2018 8:06:52 AM GMT+01:00, Aleksa Sarai <[email protected]> wrote:
>On 2018-11-01, Aleksa Sarai <[email protected]> wrote:
>> On 2018-10-29, Daniel Colascione <[email protected]> wrote:
>> > This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> > Attempting to read from an exithand file will block until the
>> > corresponding process exits, at which point the read will
>successfully
>> > complete with EOF. The file descriptor supports both blocking
>> > operations and poll(2). It's intended to be a minimal interface for
>> > allowing a program to wait for the exit of a process that is not
>one
>> > of its children.
>> >
>> > Why might we want this interface? Android's lmkd kills processes in
>> > order to free memory in response to various memory pressure
>> > signals. It's desirable to wait until a killed process actually
>exits
>> > before moving on (if needed) to killing the next process. Since the
>> > processes that lmkd kills are not lmkd's children, lmkd currently
>> > lacks a way to wait for a process to actually die after being sent
>> > SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>> > entry. This interface allow lmkd to give up polling and instead
>block
>> > and wait for process death.
>>
>> I agree with the need for this interface (with a few caveats), but
>there
>> are a few points I'd like to make:
>>
>> * I don't think that making a new procfile is necessary. When you
>open
>> /proc/$pid you already have a handle for the underlying process,
>and
>> you can already poll to check whether the process has died
>(fstatat
>> fails for instance). What if we just used an inotify event to tell
>> userspace that the process has died -- to avoid userspace doing a
>> poll loop?
>>
>> * There is a fairly old interface called the proc_connector which
>gives
>> you global fork+exec+exit events (similar to kevents from FreeBSD
>> though much less full-featured). I was working on some patches to
>> extend proc_connector so that it could be used inside containers
>as
>> well as unprivileged users. This would be another way we could
>> implement this.
>>
>> I'm really not a huge fan of the "blocking read" semantic (though if
>we
>> have to have it, can we at least provide as much information as you
>get
>> from proc_connector -- such as the exit status?). Also maybe we
>should
>> integrate this into the exit machinery instead of this loop...
>
>In addition, given that you've posted two patches in the similar vein
>but as separate patchsets -- would you mind re-sending them as a single
>patchset (with all the relevant folks added to Cc)?
Please make sure to run get_maintainers.pl against your patches
if you haven't already done so to make sure that the right people are
Cc'ed.
I would suggest to a least Cc Eric, Serge, Andy, Kees, and Oleg.
>
>If the idea is to extend /proc/$pid to allow for various
>fd-as-process-handle operations (which I agree with in principle), then
>they should be a single patchset. I'm also a bit cautious about how
>many procfiles the eventual goal is to add.
On Thu, Nov 1, 2018 at 7:00 AM, Aleksa Sarai <[email protected]> wrote:
> On 2018-10-29, Daniel Colascione <[email protected]> wrote:
>> This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> Attempting to read from an exithand file will block until the
>> corresponding process exits, at which point the read will successfully
>> complete with EOF. The file descriptor supports both blocking
>> operations and poll(2). It's intended to be a minimal interface for
>> allowing a program to wait for the exit of a process that is not one
>> of its children.
>>
>> Why might we want this interface? Android's lmkd kills processes in
>> order to free memory in response to various memory pressure
>> signals. It's desirable to wait until a killed process actually exits
>> before moving on (if needed) to killing the next process. Since the
>> processes that lmkd kills are not lmkd's children, lmkd currently
>> lacks a way to wait for a process to actually die after being sent
>> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>> entry. This interface allow lmkd to give up polling and instead block
>> and wait for process death.
>
> I agree with the need for this interface (with a few caveats), but there
> are a few points I'd like to make:
>
> * I don't think that making a new procfile is necessary. When you open
> /proc/$pid you already have a handle for the underlying process, and
> you can already poll to check whether the process has died (fstatat
> fails for instance). What if we just used an inotify event to tell
> userspace that the process has died -- to avoid userspace doing a
> poll loop?
I'm trying to make a simple interface. The basic unix data access
model is that a userspace application wants information (e.g., next
bunch of bytes in a file, next packet from a socket, next signal from
a signal FD, etc.), and tells the kernel so by making a system call on
a file descriptor. Ordinarily, the kernel returns to userspace with
the requested information when it's available, potentially after
blocking until the information is available. Sometimes userspace
doesn't want to block, so it adds O_NONBLOCK to the open file mode,
and in this mode, the kernel can tell the userspace requestor "try
again later", but the source of truth is still that
ordinarily-blocking system call. How does userspace know when to try
again in the "try again later" case? By using
select/poll/epoll/whatever, which suggests a good time for that "try
again later" retry, but is not dispositive about it, since that
ordinarily-blocking system call is still the sole source of truth, and
that poll is allowed to report spurious readability.
This model works fine and has a ton of mental and technical
infrastructure built around it. It's the one the system uses for
almost every bit of information useful to an application. I feel very
strongly that process exit should also adhere to this model. It's
consistent, robust, and simple to use. That's why I added a procfile
that adheres to this model. It lets processes deal with exits in
exactly the same way they do any other event, with a bit of userspace
code that works with possibly-blocking file descriptors generally
(e.g. libevent) without special logic in any polling loop. The event
file I'm proposing is so ordinary, in fact, that it works from the
shell. Without some specific technical reason to do something
different, we shouldn't do something unusual.
Given that we *can*, cheaply, provide a clean and consistent API to
userspace, why would we instead want to inflict some exotic and
hard-to-use interface on userspace instead? Asking that userspace poll
on a directory file descriptor and, when poll returns, check by
looking for certain errors (we'd have to spec which ones) from fstatat
is awkward. /proc/pid is a directory. In what other context does the
kernel ask userspace to use a directory this way?
I don't want to get bogged down in a discussion of a thousand exotic
ways we could provide this feature when there's one clear approach
that works everywhere else and that we should just copy.
> * There is a fairly old interface called the proc_connector which gives
> you global fork+exec+exit events (similar to kevents from FreeBSD
> though much less full-featured). I was working on some patches to
> extend proc_connector so that it could be used inside containers as
> well as unprivileged users. This would be another way we could
> implement this.
Both netlink and the *notify APIs are intended for broad monitoring of
system activity, not for waiting for some specific event. They require
a substantial amount of setup code, and since both are event-streaming
APIs with buffers that can overflow, both need some logic for
userspace to detect buffer overrun and fall back to explicit scanning
if that happens. They're also optional parts of the kernel, and as you
note, there's a lot of work needed before either is usable with /proc,
most of that work being unrelated to the race-free process management
operations I'd like to support. We don't need work on either to fix
these longstanding problems with the process API.
Ideally it wouldn't require /proc at all, but /proc is how we ask
about non-child processes generally, so we're stuck with it.
> I'm really not a huge fan of the "blocking read" semantic (though if we
> have to have it, can we at least provide as much information as you get
> from proc_connector -- such as the exit status?).
That a process *exists* is one bit of information. You can get it
today by hammering /proc, so the current exithand patch doesn't leak
information.
You're asking for the kernel to communicate additional information to
userspace when a process dies. Who should have access to that
information? If the answer is "everyone", then yes, we can just have
the read() yield a siginfo_t, just like for waitid. That's simple and
useful. But if the answer is "some users", then who? The exit status
in /proc/pid/stat is zeroed out for readers that fail do_task_stat's
ptrace_may_access call. (Falsifying the exit status in stat when a
privilege check fails seems like a bad idea from a correctness POV.)
Should open() on exithand perform the same ptrace_may_access privilege
check? What if the process *becomes* untraceable during its lifetime
(e.g., with setuid). Should that read() on the exithand FD still yield
a siginfo_t? Just having exithand yield EOF all the time punts the
privilege problem to a later discussion because this approach doesn't
leak information. We can always add an "exithand_full" or something
that actually yields a siginfo_t.
Another option would be to make exithand's read() always yield a
siginfo_t, but have the open() just fail if the caller couldn't
ptrace_may_access it. But why shouldn't you be able to wait on other
processes? If you can see it in /proc, you should be able to wait on
it exiting.
> Also maybe we should
> integrate this into the exit machinery instead of this loop...
I don't know what you mean. It's already integrated into the exit
machinery: it's what runs the waitqueue.
On 2018-11-01, Daniel Colascione <[email protected]> wrote:
> On Thu, Nov 1, 2018 at 7:00 AM, Aleksa Sarai <[email protected]> wrote:
> > On 2018-10-29, Daniel Colascione <[email protected]> wrote:
> >> This patch adds a new file under /proc/pid, /proc/pid/exithand.
> >> Attempting to read from an exithand file will block until the
> >> corresponding process exits, at which point the read will successfully
> >> complete with EOF. The file descriptor supports both blocking
> >> operations and poll(2). It's intended to be a minimal interface for
> >> allowing a program to wait for the exit of a process that is not one
> >> of its children.
> >>
> >> Why might we want this interface? Android's lmkd kills processes in
> >> order to free memory in response to various memory pressure
> >> signals. It's desirable to wait until a killed process actually exits
> >> before moving on (if needed) to killing the next process. Since the
> >> processes that lmkd kills are not lmkd's children, lmkd currently
> >> lacks a way to wait for a process to actually die after being sent
> >> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
> >> entry. This interface allow lmkd to give up polling and instead block
> >> and wait for process death.
> >
> > I agree with the need for this interface (with a few caveats), but there
> > are a few points I'd like to make:
> >
> > * I don't think that making a new procfile is necessary. When you open
> > /proc/$pid you already have a handle for the underlying process, and
> > you can already poll to check whether the process has died (fstatat
> > fails for instance). What if we just used an inotify event to tell
> > userspace that the process has died -- to avoid userspace doing a
> > poll loop?
>
> I'm trying to make a simple interface. The basic unix data access
> model is that a userspace application wants information (e.g., next
> bunch of bytes in a file, next packet from a socket, next signal from
> a signal FD, etc.), and tells the kernel so by making a system call on
> a file descriptor. Ordinarily, the kernel returns to userspace with
> the requested information when it's available, potentially after
> blocking until the information is available. Sometimes userspace
> doesn't want to block, so it adds O_NONBLOCK to the open file mode,
> and in this mode, the kernel can tell the userspace requestor "try
> again later", but the source of truth is still that
> ordinarily-blocking system call. How does userspace know when to try
> again in the "try again later" case? By using
> select/poll/epoll/whatever, which suggests a good time for that "try
> again later" retry, but is not dispositive about it, since that
> ordinarily-blocking system call is still the sole source of truth, and
> that poll is allowed to report spurious readabilty.
inotify gives you an event if a file or directory is deleted. A pid
dying semantically is similar to the idea of a /proc/$pid being deleted.
I don't see how a blocking read on a new procfile is simpler than using
the existing notification-on-file-events infrastructure -- not to
mention that the idea of "this file blocks until the thing we are
indirectly referencing by this file is gone" seems to me to be a really
strange interface.
Sure, it uses read(2) -- but is that the only constraint on designing
simple interfaces?
> The event file I'm proposing is so ordinary, in fact, that it works
> from the shell. Without some specific technical reason to do something
> different, we shouldn't do something unusual.
inotify-tools are available on effectively every distribution.
> Given that we *can*, cheaply, provide a clean and consistent API to
> userspace, why would we instead want to inflict some exotic and
> hard-to-use interface on userspace instead? Asking that userspace poll
> on a directory file descriptor and, when poll returns, check by
> looking for certain errors (we'd have to spec which ones) from fstatat
> is awkward. /proc/pid is a directory. In what other context does the
> kernel ask userspace to use a directory this way?
I'm not sure you understood my proposal. I said that we need an
interface to do this, and I was trying to explain (by noting what the
current way of doing it would be) what I think the interface should be.
To reiterate, I believe that having an inotify event (IN_DELETE_SELF on
/proc/$pid) would be in keeping with the current way of doing things but
allowing userspace to avoid all of the annoyances you just mentioned and
I was alluding to.
I *don't* think that the current scheme of looping on fstatat is the way
it should be left. And there is an argument that inotify is not
sufficient to
> > I'm really not a huge fan of the "blocking read" semantic (though if we
> > have to have it, can we at least provide as much information as you get
> > from proc_connector -- such as the exit status?).
> [...]
> The exit status in /proc/pid/stat is zeroed out for readers that fail
> do_task_stat's ptrace_may_access call. (Falsifying the exit status in
> stat seems a privilege check fails seems like a bad idea from a
> correctness POV.)
It's not clear to me what the purpose of that field is within procfs for
*dead* processes -- which is what we're discussing here. As far as I can
tell, you will get an ESRCH when you try to read it. When testing this
it also looked like you didn't even get the exit_status as a zombie but
I might be mistaken.
So while it is masked for !ptrace_may_access, it's also zero (or
unreadable) for almost every case outside of stopped processes (AFAICS).
Am I missing something?
> Should open() on exithand perform the same ptrace_may_access privilege
> check? What if the process *becomes* untraceable during its lifetime
> (e.g., with setuid). Should that read() on the exithand FD still yield
> a siginfo_t? Just having exithand yield EOF all the time punts the
> privilege problem to a later discussion because this approach doesn't
> leak information. We can always add an "exithand_full" or something
> that actually yields a siginfo_t.
I agree that read(2) makes this hard. I don't think we should use it.
But if we have to use it, I would like us to have feature parity with
features that FreeBSD had 18 years ago.
> Another option would be to make exithand's read() always yield a
> siginfo_t, but have the open() just fail if the caller couldn't
> ptrace_may_access it. But why shouldn't you be able to wait on other
> processes? If you can see it in /proc, you should be able to wait on
> it exiting.
I would suggest looking at FreeBSD's kevent semantics for inspiration
(or at least to see an alternative way of doing things). In particular,
EVFILT_PROC+NOTE_EXIT -- which is attached to a particular process. I
wonder what their view is on these sorts of questions.
> > Also maybe we should
> > integrate this into the exit machinery instead of this loop...
>
> I don't know what you mean. It's already integrated into the exit
> machinery: it's what runs the waitqueue.
My mistake, I missed the last hunk of the patch.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
On Thu, Nov 1, 2018 at 10:47 AM, Aleksa Sarai <[email protected]> wrote:
> On 2018-11-01, Daniel Colascione <[email protected]> wrote:
>> On Thu, Nov 1, 2018 at 7:00 AM, Aleksa Sarai <[email protected]> wrote:
>> > On 2018-10-29, Daniel Colascione <[email protected]> wrote:
>> >> This patch adds a new file under /proc/pid, /proc/pid/exithand.
>> >> Attempting to read from an exithand file will block until the
>> >> corresponding process exits, at which point the read will successfully
>> >> complete with EOF. The file descriptor supports both blocking
>> >> operations and poll(2). It's intended to be a minimal interface for
>> >> allowing a program to wait for the exit of a process that is not one
>> >> of its children.
>> >>
>> >> Why might we want this interface? Android's lmkd kills processes in
>> >> order to free memory in response to various memory pressure
>> >> signals. It's desirable to wait until a killed process actually exits
>> >> before moving on (if needed) to killing the next process. Since the
>> >> processes that lmkd kills are not lmkd's children, lmkd currently
>> >> lacks a way to wait for a process to actually die after being sent
>> >> SIGKILL; today, lmkd resorts to polling the proc filesystem pid
>> >> entry. This interface allow lmkd to give up polling and instead block
>> >> and wait for process death.
>> >
>> > I agree with the need for this interface (with a few caveats), but there
>> > are a few points I'd like to make:
>> >
>> > * I don't think that making a new procfile is necessary. When you open
>> > /proc/$pid you already have a handle for the underlying process, and
>> > you can already poll to check whether the process has died (fstatat
>> > fails for instance). What if we just used an inotify event to tell
>> > userspace that the process has died -- to avoid userspace doing a
>> > poll loop?
>>
>> I'm trying to make a simple interface. The basic unix data access
>> model is that a userspace application wants information (e.g., next
>> bunch of bytes in a file, next packet from a socket, next signal from
>> a signal FD, etc.), and tells the kernel so by making a system call on
>> a file descriptor. Ordinarily, the kernel returns to userspace with
>> the requested information when it's available, potentially after
>> blocking until the information is available. Sometimes userspace
>> doesn't want to block, so it adds O_NONBLOCK to the open file mode,
>> and in this mode, the kernel can tell the userspace requestor "try
>> again later", but the source of truth is still that
>> ordinarily-blocking system call. How does userspace know when to try
>> again in the "try again later" case? By using
>> select/poll/epoll/whatever, which suggests a good time for that "try
>> again later" retry, but is not dispositive about it, since that
>> ordinarily-blocking system call is still the sole source of truth, and
>> that poll is allowed to report spurious readabilty.
>
> inotify gives you an event if a file or directory is deleted. A pid
> dying semantically is similar to the idea of a /proc/$pid being deleted.
> I don't see how a blocking read on a new procfile is simpler than using
> the existing notification-on-file-events infrastructure -- not to
> mention that the idea of "this file blocks until the thing we are
> indirectly referencing by this file is gone" seems to me to be a really
> strange interface.
There's another subtlety: we don't want to wait until a process's proc
entry is *gone*. We want to wait until the process is *dead* (since
that's when its resources are released). If a process becomes a zombie
instead of going TASK_DEAD immediately, then it dies before its
/proc/pid directory disappears, so any API that talks about /proc/pid
directory presence will do the wrong thing. inotify gets this subtlety
wrong because a process is an object, not a file, and not a directory.
Using filesystem monitoring APIs to monitor process state is simply
the wrong approach, because you're mixing up some interface label for
an object with the object itself. Confusing objects and labels is how
we got the F_SETLK mess, among other things.
In other words, a process has an entire lifecycle in its own right
independent of whatever procfs is doing. Procfs is just an interface
for learning things about processes and doesn't control process
lifetime, so basing the "source of truth" for process notifications on
whatever is happening over in procfs is going to cause problems sooner
or later. We care about the process. (That's not the same as struct
task for various reasons, but logically, a process is a coherent
entity.)
> Sure, it uses read(2) -- but is that the only constraint on designing
> simple interfaces?
No, but if an interface requires some kind of setup procedure,
listener registration, event queue draining, switching on event queue
notification types, and scan-on-queue-overflow behavior, chances are
that it's not a simple interface.
>> The event file I'm proposing is so ordinary, in fact, that it works
>> from the shell. Without some specific technical reason to do something
>> different, we shouldn't do something unusual.
>
> inotify-tools are available on effectively every distribution.
Linux is more than the big distributions. What about embedded systems?
What about Android? What about minimal containers? We're talking about
a fundamental improvement in the process management system, and that
shouldn't depend on inotify.
>> Given that we *can*, cheaply, provide a clean and consistent API to
>> userspace, why would we instead want to inflict some exotic and
>> hard-to-use interface on userspace instead? Asking that userspace poll
>> on a directory file descriptor and, when poll returns, check by
>> looking for certain errors (we'd have to spec which ones) from fstatat
>> is awkward. /proc/pid is a directory. In what other context does the
>> kernel ask userspace to use a directory this way?
>
> I'm not sure you understood my proposal. I said that we need an
> interface to do this, and I was trying to explain (by noting what the
> current way of doing it would be) what I think the interface should be.
In what way is inotify *better* than a waitable FD? It's worse in a lot of ways:
1) inotify's presence is optional
2) inotify requires a lot of code to set up and use
3) inotify events fire at the wrong time, because they're tied to
/proc filesystem entries and not to underlying process state
4) inotify can't provide process exit status information, even in
principle, because inotify_event isn't big enough
I don't understand _why_ you want to use this worse interface instead
of a conventional blocking interface that works like eventfd and that
integrates into every event processing library out there without any
special tricks. What's so bad about a blocking read() model that
justifies the use of some kind of monitoring API?
>> > I'm really not a huge fan of the "blocking read" semantic (though if we
>> > have to have it, can we at least provide as much information as you get
>> > from proc_connector -- such as the exit status?).
>> [...]
>> The exit status in /proc/pid/stat is zeroed out for readers that fail
>> do_task_stat's ptrace_may_access call. (Falsifying the exit status in
>> stat seems a privilege check fails seems like a bad idea from a
>> correctness POV.)
>
> It's not clear to me what the purpose of that field is within procfs for
> *dead* proceses -- which is what we're discussing here. As far as I can
> tell, you will get an ESRCH when you try to read it. When testing this
> it also looked like you didn't even get the exit_status as a zombie but
> I might be mistaken.
exit_status becomes nonzero on transition to either a zombie or a
fully dead process.
> So while it is masked for !ptrace_may_access, it's also zero (or
> unreadable) for almost every case outside of stopped processes (AFAICS).
> Am I missing something?
The exit field in /proc/pid/stat is largely useless for exactly this
reason, but that's not my point.
My point is that nobody's ever made it clear who should have access to
a process's exit status. A process's exit status might communicate
privileged information, after all.
Should a process's parent be able to learn its child's exit status?
Certainly. Should a fully privileged unconstrained root be able to
learn any process's exit status? Certainly. Should a different process
be able to learn the exit status of an unrelated process running under
the same user? I think so, but I suspect the YAMA people might
disagree. Should "nobody" be able to learn the exit status of a
process running as root? Eh, maybe? *My* preference is that we just
declare the exit status public information, like comm, but that may
not be feasible.
But we don't need to figure out who should be able to learn a
process's exit status to add an API that blocks and waits for a
process to exit. That's why my initial patch doesn't provide exit
status. What I want to do is this:
1) provide a non-status-providing /proc/pid/exithand now
2) when we figure out who should have access to exit status
information, provide a /proc/pid/exithand_full whose read() returns a
siginfo_t with exit information
Having both /proc/pid/exithand and /proc/pid/exithand_full is useful
because we can make the former available to more processes than the
latter.
If everyone is comfortable with process exit status being globally
visible (as it apparently is on FreeBSD --- see below), then let's
skip #1 above and just have exithand's read() unconditionally return a
siginfo_t.
>> Should open() on exithand perform the same ptrace_may_access privilege
>> check? What if the process *becomes* untraceable during its lifetime
>> (e.g., with setuid). Should that read() on the exithand FD still yield
>> a siginfo_t? Just having exithand yield EOF all the time punts the
>> privilege problem to a later discussion because this approach doesn't
>> leak information. We can always add an "exithand_full" or something
>> that actually yields a siginfo_t.
>
> I agree that read(2) makes this hard. I don't think we should use it.
It's not the use of read that makes this hard. It's that nobody's
figured out what the right access model should be. The same problem
arises if you want to make process_connector unprivileged. And your
inotify proposal sidesteps the problem because it doesn't provide exit
status either.
> But if we have to use it, I would like us to have feature parity with
> features that FreeBSD had 18 years ago.
And I want parity with a feature Windows NT had in 1989: opening a
process and blocking until it exits. It shouldn't be hard to code, it
shouldn't require optional kernel features, and it shouldn't require
exotic wait operations.
>> Another option would be to make exithand's read() always yield a
>> siginfo_t, but have the open() just fail if the caller couldn't
>> ptrace_may_access it. But why shouldn't you be able to wait on other
>> processes? If you can see it in /proc, you should be able to wait on
>> it exiting.
>
> I would suggest looking at FreeBSD's kevent semantics for inspiration
> (or at least to see an alternative way of doing things). In particular,
> EVFILT_PROC+NOTE_EXIT -- which is attached to a particular process. I
> wonder what their view is on these sorts of questions.
What does FreeBSD do about privileged exit status communicated with
NOTE_EXIT? The FreeBSD 11 man page for kqueue states that "If a
process can normally see another process, it can attach an event to
it.". Does this mean that a process running as "nobody" can
EVFILT_PROC a process running as root and learn its exit status? If
so, that's an argument, I think, for just making exit status public
knowledge.