In the last round of cleaning up the pid hash table a more
general struct pid was introduced, that can be referenced
counted.
With the more general struct pid most if not all places where
we store a pid_t we can now store a struct pid * and remove
the need for a hash table lookup, and avoid any possible problems
with pid roll over.
Looking forward to the pid namespaces struct pid * gives us
an absolute form a pid so we can compare and use them without
caring which pid namespace we are in.
This patchset introduces the infrastructure needed to use
struct pid instead of pid_t, and then it goes on to convert
two different kernel users that currently store a pid_t value.
There are a lot more places to go but this is enough to get the
basic idea.
Before we can merge a pid namespace patch all of the kernel pid_t
users need to be examined. Those that deal with user space processes
need to be converted to using a struct pid *. Those that deal with
kernel processes need to converted to using the kthread api. A rare
few that only use their current processes pid values get to be left
alone.
Eric
Currently the signal functions all either take a task or
a pid_t argument. This patch implements variants that take
a struct pid *. After all of the users have been update
it is my intention to remove the variants that take a pid_t
as using pid_t can be more work (an extra hash table lookup)
and difficult to get right in the presence of multiple pid
namespaces.
There are two kinds of functions introduced in this patch. The
are the general use functions kill_pgrp and kill_pid which take
a priv argument that is ultimately used to create the appropriate
siginfo information, Then there are _kill_pgrp_info, kill_pgrp_info,
kill_pid_info the internal implementation helpers that take an
explicit siginfo.
The distinction is made because filling out an explcit siginfo is
tricky, and will be even more tricky when pid namespaces are introduced.
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/sched.h | 5 ++++
kernel/signal.c | 57 ++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 52 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6560dd3..e7d7bad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1290,6 +1290,11 @@ extern int send_sig_info(int, struct sig
extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
extern int force_sigsegv(int, struct task_struct *);
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
+extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
+extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
+extern int kill_pgrp(struct pid *pid, int sig, int priv);
+extern int kill_pid(struct pid *pid, int sig, int priv);
extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
extern int kill_pg_info(int, struct siginfo *, pid_t);
extern int kill_proc_info(int, struct siginfo *, pid_t);
diff --git a/kernel/signal.c b/kernel/signal.c
index c476195..9eecb2f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1181,28 +1181,44 @@ int group_send_sig_info(int sig, struct
}
/*
- * kill_pg_info() sends a signal to a process group: this is what the tty
+ * kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
*/
-int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
int retval, success;
- if (pgrp <= 0)
- return -EINVAL;
-
success = 0;
retval = -ESRCH;
- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p);
success |= !err;
retval = err;
- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return success ? 0 : retval;
}
+int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = __kill_pgrp_info(sig, info, pgrp);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
+{
+ if (pgrp <= 0)
+ return -EINVAL;
+
+ return __kill_pgrp_info(sig, info, find_pid(pgrp));
+}
+
int
kill_pg_info(int sig, struct siginfo *info, pid_t pgrp)
{
@@ -1215,8 +1231,7 @@ kill_pg_info(int sig, struct siginfo *in
return retval;
}
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
{
int error;
int acquired_tasklist_lock = 0;
@@ -1227,7 +1242,7 @@ kill_proc_info(int sig, struct siginfo *
read_lock(&tasklist_lock);
acquired_tasklist_lock = 1;
}
- p = find_task_by_pid(pid);
+ p = pid_task(pid, PIDTYPE_PID);
error = -ESRCH;
if (p)
error = group_send_sig_info(sig, info, p);
@@ -1237,6 +1252,16 @@ kill_proc_info(int sig, struct siginfo *
return error;
}
+int
+kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int error;
+ rcu_read_lock();
+ error = kill_pid_info(sig, info, find_pid(pid));
+ rcu_read_unlock();
+ return error;
+}
+
/* like kill_proc_info(), but doesn't use uid/euid of "current" */
int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
uid_t uid, uid_t euid, u32 secid)
@@ -1390,6 +1415,18 @@ force_sigsegv(int sig, struct task_struc
return 0;
}
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ return kill_pgrp_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+ return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
int
kill_pg(pid_t pgrp, int sig, int priv)
{
--
1.4.2.rc3.g7e18e-dirty
pids aren't something that drivers should care about.
However there are a lot of helper layers in the kernel that do
care, and are built as modules. Before I can convert them
to using struct pid instead of pid_t I need to export the
appropriate symbols so they can continue to be built.
Signed-off-by: Eric W. Biederman <[email protected]>
---
kernel/pid.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index a7ca901..40e8e4d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -153,6 +153,7 @@ fastcall void put_pid(struct pid *pid)
atomic_dec_and_test(&pid->count))
kmem_cache_free(pid_cachep, pid);
}
+EXPORT_SYMBOL_GPL(put_pid);
static void delayed_put_pid(struct rcu_head *rhp)
{
@@ -218,6 +219,7 @@ struct pid * fastcall find_pid(int nr)
}
return NULL;
}
+EXPORT_SYMBOL_GPL(find_pid);
int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
@@ -302,6 +304,7 @@ struct pid *find_get_pid(pid_t nr)
return pid;
}
+EXPORT_SYMBOL_GPL(find_get_pid);
/*
* The pid hash table is scaled according to the amount of memory in the
--
1.4.2.rc3.g7e18e-dirty
To avoid pid rollover confusion the kernel needs to work with
struct pid * instead of pid_t. Currently there is not an iterator
that walks through all of the tasks of a given pid type starting
with a struct pid. This prevents us replacing some pid_t instances
with struct pid. So this patch adds do_each_pid_task which walks
through the set of task for a given pid type starting with a struct pid.
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/pid.h | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 93da7e2..4007114 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
1; }) ); \
}
+#define do_each_pid_task(pid, type, task) \
+ if ((task = pid_task(pid, type))) { \
+ prefetch(pid_next(task, type)); \
+ do {
+
+#define while_each_pid_task(pid, type, task) \
+ } while (pid_next(task, type) && ({ \
+ task = pid_next_task(task, type); \
+ rcu_dereference(task); \
+ prefetch(pid_next(task, type)); \
+ 1; }) ); \
+ }
+
#endif /* _LINUX_PID_H */
--
1.4.2.rc3.g7e18e-dirty
As we stop storing pid_t's and move to storing struct pid *. We
need a way to get the pid_t from the struct pid to report to user
space what we have stored.
Having a clean well defined way to do this is especially important
as we move to multiple pid spaces as may need to report a different
value to the caller depending on which pid space the caller is in.
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/pid.h | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 4007114..9fd547f 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -93,6 +93,14 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *alloc_pid(void);
extern void FASTCALL(free_pid(struct pid *pid));
+static inline pid_t pid_nr(struct pid *pid)
+{
+ pid_t nr = 0;
+ if (pid)
+ nr = pid->nr;
+ return nr;
+}
+
#define pid_next(task, type) \
((task)->pids[(type)].node.next)
--
1.4.2.rc3.g7e18e-dirty
task_session returns the struct pid of a tasks session.
task_pgrp returns the struct pid of a tasks process group.
task_tgid returns the struct pid of a tasks thread group.
task_pid returns the struct pid of a tasks process id.
These can be used to avoid unnecessary hash table lookups,
and to implement safe pid comparisions in the face of a
pid namespace.
Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/sched.h | 20 ++++++++++++++++++++
1 files changed, 20 insertions(+), 0 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03d96b9..6560dd3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1042,6 +1042,26 @@ static inline pid_t process_group(struct
return tsk->signal->pgrp;
}
+static inline struct pid *task_pid(struct task_struct *task)
+{
+ return task->pids[PIDTYPE_PID].pid;
+}
+
+static inline struct pid *task_tgid(struct task_struct *task)
+{
+ return task->group_leader->pids[PIDTYPE_PID].pid;
+}
+
+static inline struct pid *task_pgrp(struct task_struct *task)
+{
+ return task->group_leader->pids[PIDTYPE_PGID].pid;
+}
+
+static inline struct pid *task_session(struct task_struct *task)
+{
+ return task->group_leader->pids[PIDTYPE_SID].pid;
+}
+
/**
* pid_alive - check that a task structure is not stale
* @p: Task structure to be checked.
--
1.4.2.rc3.g7e18e-dirty
File handles can be requested to send sigio and sigurg
to processes. By tracking the destination processes
using struct pid instead of pid_t we make the interface
safe from all potential pid wrap around problems.
Signed-off-by: Eric W. Biederman <[email protected]>
---
drivers/net/tun.c | 2 +
fs/dnotify.c | 2 +
fs/fcntl.c | 78 +++++++++++++++++++++++++++++++++-------------------
fs/file_table.c | 1 +
fs/locks.c | 2 +
include/linux/fs.h | 5 +++
kernel/futex.c | 2 +
net/socket.c | 2 +
8 files changed, 60 insertions(+), 34 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9ab8ca6..c78d79c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -697,7 +697,7 @@ static int tun_chr_fasync(int fd, struct
return ret;
if (on) {
- ret = f_setown(file, current->pid, 0);
+ ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
if (ret)
return ret;
tun->flags |= TUN_FASYNC;
diff --git a/fs/dnotify.c b/fs/dnotify.c
index f932591..2b0442d 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -92,7 +92,7 @@ int fcntl_dirnotify(int fd, struct file
prev = &odn->dn_next;
}
- error = f_setown(filp, current->pid, 0);
+ error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
if (error)
goto out_free;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index b43d821..55d387c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -250,19 +250,21 @@ static int setfl(int fd, struct file * f
return error;
}
-static void f_modown(struct file *filp, unsigned long pid,
+static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
uid_t uid, uid_t euid, int force)
{
write_lock_irq(&filp->f_owner.lock);
if (force || !filp->f_owner.pid) {
- filp->f_owner.pid = pid;
+ put_pid(filp->f_owner.pid);
+ filp->f_owner.pid = get_pid(pid);
+ filp->f_owner.pid_type = type;
filp->f_owner.uid = uid;
filp->f_owner.euid = euid;
}
write_unlock_irq(&filp->f_owner.lock);
}
-int f_setown(struct file *filp, unsigned long arg, int force)
+int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force)
{
int err;
@@ -270,15 +272,44 @@ int f_setown(struct file *filp, unsigned
if (err)
return err;
- f_modown(filp, arg, current->uid, current->euid, force);
+ f_modown(filp, pid, type, current->uid, current->euid, force);
return 0;
}
+EXPORT_SYMBOL(__f_setown);
+
+int f_setown(struct file *filp, unsigned long arg, int force)
+{
+ enum pid_type type;
+ struct pid *pid;
+ int who = arg;
+ int result;
+ type = PIDTYPE_PID;
+ if (who < 0) {
+ type = PIDTYPE_PGID;
+ who = -who;
+ }
+ rcu_read_lock();
+ pid = find_pid(who);
+ result = __f_setown(filp, pid, type, force);
+ rcu_read_unlock();
+ return result;
+}
+
EXPORT_SYMBOL(f_setown);
void f_delown(struct file *filp)
{
- f_modown(filp, 0, 0, 0, 1);
+ f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1);
+}
+
+pid_t f_getown(struct file *filp)
+{
+ pid_t pid;
+ pid = pid_nr(filp->f_owner.pid);
+ if (sock->file->f_owner.pid_type == PIDTYPE_PGID)
+ pid = -pid;
+ return pid;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -319,7 +350,7 @@ static long do_fcntl(int fd, unsigned in
* current syscall conventions, the only way
* to fix this will be in libc.
*/
- err = filp->f_owner.pid;
+ err = f_getown(filp);
force_successful_syscall_return();
break;
case F_SETOWN:
@@ -470,24 +501,19 @@ static void send_sigio_to_task(struct ta
void send_sigio(struct fown_struct *fown, int fd, int band)
{
struct task_struct *p;
- int pid;
+ enum pid_type type;
+ struct pid *pid;
read_lock(&fown->lock);
+ type = fown->pid_type;
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
read_lock(&tasklist_lock);
- if (pid > 0) {
- p = find_task_by_pid(pid);
- if (p) {
- send_sigio_to_task(p, fown, fd, band);
- }
- } else {
- do_each_task_pid(-pid, PIDTYPE_PGID, p) {
- send_sigio_to_task(p, fown, fd, band);
- } while_each_task_pid(-pid, PIDTYPE_PGID, p);
- }
+ do_each_pid_task(pid, type, p) {
+ send_sigio_to_task(p, fown, fd, band);
+ } while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
read_unlock(&fown->lock);
@@ -503,9 +529,12 @@ static void send_sigurg_to_task(struct t
int send_sigurg(struct fown_struct *fown)
{
struct task_struct *p;
- int pid, ret = 0;
+ enum pid_type type;
+ struct pid *pid;
+ int ret = 0;
read_lock(&fown->lock);
+ type = fown->pid_type;
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
@@ -513,16 +542,9 @@ int send_sigurg(struct fown_struct *fown
ret = 1;
read_lock(&tasklist_lock);
- if (pid > 0) {
- p = find_task_by_pid(pid);
- if (p) {
- send_sigurg_to_task(p, fown);
- }
- } else {
- do_each_task_pid(-pid, PIDTYPE_PGID, p) {
- send_sigurg_to_task(p, fown);
- } while_each_task_pid(-pid, PIDTYPE_PGID, p);
- }
+ do_each_pid_task(pid, type, p) {
+ send_sigurg_to_task(p, fown);
+ } while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
out_unlock_fown:
read_unlock(&fown->lock);
diff --git a/fs/file_table.c b/fs/file_table.c
index 2d9069c..c64764a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -181,6 +181,7 @@ #endif
fops_put(file->f_op);
if (file->f_mode & FMODE_WRITE)
put_write_access(inode);
+ put_pid(file->f_owner.pid);
file_kill(file);
file->f_dentry = NULL;
file->f_vfsmnt = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index d7c5339..b12e819 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1514,7 +1514,7 @@ int fcntl_setlease(unsigned int fd, stru
goto out_unlock;
}
- error = f_setown(filp, current->pid, 0);
+ error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
out_unlock:
unlock_kernel();
return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 50e6eb2..b74d39b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -667,7 +667,8 @@ extern struct block_device *I_BDEV(struc
struct fown_struct {
rwlock_t lock; /* protects pid, uid, euid fields */
- int pid; /* pid or -pgrp where SIGIO should be sent */
+ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
+ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
uid_t uid, euid; /* uid/euid of process setting the owner */
int signum; /* posix.1b rt signal to be delivered on IO */
};
@@ -919,8 +920,10 @@ extern void kill_fasync(struct fasync_st
/* only for net: no internal synchronization */
extern void __kill_fasync(struct fasync_struct *, int, int);
+extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
extern void f_delown(struct file *filp);
+extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);
/*
diff --git a/kernel/futex.c b/kernel/futex.c
index d4633c5..d78d898 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1589,7 +1589,7 @@ static int futex_fd(u32 __user *uaddr, i
filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
if (signal) {
- err = f_setown(filp, current->pid, 1);
+ err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
if (err < 0) {
goto error;
}
diff --git a/net/socket.c b/net/socket.c
index 2bfc60a..a66bd4c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -825,7 +825,7 @@ #endif /* CONFIG_WIRELESS_EXT */
break;
case FIOGETOWN:
case SIOCGPGRP:
- err = put_user(sock->file->f_owner.pid,
+ err = put_user(f_getown(sock->file),
(int __user *)argp);
break;
case SIOCGIFBR:
--
1.4.2.rc3.g7e18e-dirty
This keeps the wrong process from being notified if the
daemon to spawn a new console dies.
Signed-off-by: Eric W. Biederman <[email protected]>
---
drivers/char/keyboard.c | 9 ++++++---
drivers/char/vt_ioctl.c | 5 +++--
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 3e90aac..9acd142 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -108,7 +108,8 @@ const int NR_TYPES = ARRAY_SIZE(max_vals
struct kbd_struct kbd_table[MAX_NR_CONSOLES];
static struct kbd_struct *kbd = kbd_table;
-int spawnpid, spawnsig;
+struct pid *spawnpid;
+int spawnsig;
/*
* Variables exported for vt.c
@@ -579,8 +580,10 @@ static void fn_compose(struct vc_data *v
static void fn_spawn_con(struct vc_data *vc, struct pt_regs *regs)
{
if (spawnpid)
- if (kill_proc(spawnpid, spawnsig, 1))
- spawnpid = 0;
+ if (kill_pid(spawnpid, spawnsig, 1)) {
+ put_pid(spawnpid);
+ spawnpid = NULL;
+ }
}
static void fn_SAK(struct vc_data *vc, struct pt_regs *regs)
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 28eff1a..d7e0187 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -645,12 +645,13 @@ #endif
*/
case KDSIGACCEPT:
{
- extern int spawnpid, spawnsig;
+ struct pid *spawnpid;
+ extern int spawnsig;
if (!perm || !capable(CAP_KILL))
return -EPERM;
if (!valid_signal(arg) || arg < 1 || arg == SIGKILL)
return -EINVAL;
- spawnpid = current->pid;
+ spawnpid = get_pid(task_pid(current));
spawnsig = arg;
return 0;
}
--
1.4.2.rc3.g7e18e-dirty
Ar Maw, 2006-08-15 am 12:23 -0600, ysgrifennodd Eric W. Biederman:
> This keeps the wrong process from being notified if the
> daemon to spawn a new console dies.
Not sure why we count pids not task structs but within the proposed
implementation this appears correct so
Acked-by: Alan Cox <[email protected]>
On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
> +static inline pid_t pid_nr(struct pid *pid)
> +{
> + pid_t nr = 0;
> + if (pid)
> + nr = pid->nr;
> + return nr;
> +}
When is it valid to be passing around a NULL 'struct pid *'?
-- Dave
On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
> +static inline struct pid *task_pid(struct task_struct *task)
> +{
> + return task->pids[PIDTYPE_PID].pid;
> +}
Does this mean we can start to deprecate the use of tsk->pid?
-- Dave
Alan Cox <[email protected]> writes:
> Ar Maw, 2006-08-15 am 12:23 -0600, ysgrifennodd Eric W. Biederman:
>> This keeps the wrong process from being notified if the
>> daemon to spawn a new console dies.
>
> Not sure why we count pids not task structs but within the proposed
> implementation this appears correct so
Basically struct pid is relatively cheap, 64bytes or so.
struct task is expensive 10K or so, when all of the stacks
and everything are included.
Counting pids allows the task to exit in user space and free up
all of it's memory.
When /proc used to count the task struct it was fairly easy to
deliberately oom a 32bit machine just by open up directories in
/proc and then having the process exit. rlimits didn't help because
we don't count processes that have exited.
>
> Acked-by: Alan Cox <[email protected]>
Thanks. When I get to the tty portion of this I think I am going
to have to synchronize with you as last I looked you were working in
this area as well.
Eric
Dave Hansen <[email protected]> writes:
> On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
>> +static inline pid_t pid_nr(struct pid *pid)
>> +{
>> + pid_t nr = 0;
>> + if (pid)
>> + nr = pid->nr;
>> + return nr;
>> +}
>
> When is it valid to be passing around a NULL 'struct pid *'?
When you don't have one at all. Look at the fcntl case a few
patches later, or even the spawnpid case. It simplifies things to
just cope with the fact that sometimes the users just have a NULL
pointers.
Then of course there is the later chaos when we get to pid spaces
where depending on the pid namespace you are in when you call this
on a given struct pid sometimes you will get a pid value and sometimes
you won't.
Eric
Dave Hansen <[email protected]> writes:
> On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
>> +static inline struct pid *task_pid(struct task_struct *task)
>> +{
>> + return task->pids[PIDTYPE_PID].pid;
>> +}
>
> Does this mean we can start to deprecate the use of tsk->pid?
Good question. I think there are enough users in the same process
case that it might not make sense to get rid of tsk->pid. Some
of tsk->pids cousins though like tgid are definitely up for grabs.
I haven't gotten far enough to be able to say for certain.
Eric
On Tue, 2006-08-15 at 13:00 -0600, Eric W. Biederman wrote:
> Dave Hansen <[email protected]> writes:
>
> > On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
> >> +static inline pid_t pid_nr(struct pid *pid)
> >> +{
> >> + pid_t nr = 0;
> >> + if (pid)
> >> + nr = pid->nr;
> >> + return nr;
> >> +}
> >
> > When is it valid to be passing around a NULL 'struct pid *'?
>
> When you don't have one at all. Look at the fcntl case a few
> patches later, or even the spawnpid case.
Does the fcntl() one originate from anywhere other than find_pid() in
f_setown()? It seems like, perhaps, the error checking is being done at
the wrong level.
> Then of course there is the later chaos when we get to pid spaces
> where depending on the pid namespace you are in when you call this
> on a given struct pid sometimes you will get a pid value and sometimes
> you won't.
OK, I think it is makes sense to me to say 'get_pid(tsk, pidspace)' and
get back a NULL 'struct pid' if that task isn't visible in that
namespace. However, I don't get how it is handy to be able to defer the
fact that the pid wasn't found until you go do a pid_nr() on that NULL.
-- Dave
On Tuesday 15 August 2006 12:23, Eric W. Biederman wrote:
> This keeps the wrong process from being notified if the
> daemon to spawn a new console dies.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> drivers/char/keyboard.c | 9 ++++++---
> drivers/char/vt_ioctl.c | 5 +++--
> 2 files changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
> index 3e90aac..9acd142 100644
> --- a/drivers/char/keyboard.c
> +++ b/drivers/char/keyboard.c
> @@ -108,7 +108,8 @@ const int NR_TYPES = ARRAY_SIZE(max_vals
> struct kbd_struct kbd_table[MAX_NR_CONSOLES];
> static struct kbd_struct *kbd = kbd_table;
>
> -int spawnpid, spawnsig;
> +struct pid *spawnpid;
> +int spawnsig;
>
> /*
> * Variables exported for vt.c
> @@ -579,8 +580,10 @@ static void fn_compose(struct vc_data *v
> static void fn_spawn_con(struct vc_data *vc, struct pt_regs *regs)
> {
> if (spawnpid)
> - if (kill_proc(spawnpid, spawnsig, 1))
> - spawnpid = 0;
> + if (kill_pid(spawnpid, spawnsig, 1)) {
> + put_pid(spawnpid);
> + spawnpid = NULL;
> + }
> }
>
> static void fn_SAK(struct vc_data *vc, struct pt_regs *regs)
> diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
> index 28eff1a..d7e0187 100644
> --- a/drivers/char/vt_ioctl.c
> +++ b/drivers/char/vt_ioctl.c
> @@ -645,12 +645,13 @@ #endif
> */
> case KDSIGACCEPT:
> {
> - extern int spawnpid, spawnsig;
> + struct pid *spawnpid;
> + extern int spawnsig;
shouldn't spawnpid also be declared 'extern' here?
> if (!perm || !capable(CAP_KILL))
> return -EPERM;
> if (!valid_signal(arg) || arg < 1 || arg == SIGKILL)
> return -EINVAL;
> - spawnpid = current->pid;
> + spawnpid = get_pid(task_pid(current));
> spawnsig = arg;
> return 0;
> }
Quoting Eric W. Biederman ([email protected]):
> To avoid pid rollover confusion the kernel needs to work with
> struct pid * instead of pid_t. Currently there is not an iterator
> that walks through all of the tasks of a given pid type starting
> with a struct pid. This prevents us replacing some pid_t instances
> with struct pid. So this patch adds do_each_pid_task which walks
> through the set of task for a given pid type starting with a struct pid.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> include/linux/pid.h | 13 +++++++++++++
> 1 files changed, 13 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 93da7e2..4007114 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
> 1; }) ); \
> }
>
> +#define do_each_pid_task(pid, type, task) \
Hmm, defining do_each_pid_task right after do_each_task_pid could result
in some frustration :)
Though not sure of a better name - do_each_task_structpid?
-serge
On Tue, 15 Aug 2006 22:10:43 -0500
"Serge E. Hallyn" <[email protected]> wrote:
> Quoting Eric W. Biederman ([email protected]):
> > To avoid pid rollover confusion the kernel needs to work with
> > struct pid * instead of pid_t. Currently there is not an iterator
> > that walks through all of the tasks of a given pid type starting
> > with a struct pid. This prevents us replacing some pid_t instances
> > with struct pid. So this patch adds do_each_pid_task which walks
> > through the set of task for a given pid type starting with a struct pid.
> >
> > Signed-off-by: Eric W. Biederman <[email protected]>
> > ---
> > include/linux/pid.h | 13 +++++++++++++
> > 1 files changed, 13 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/pid.h b/include/linux/pid.h
> > index 93da7e2..4007114 100644
> > --- a/include/linux/pid.h
> > +++ b/include/linux/pid.h
> > @@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
> > 1; }) ); \
> > }
> >
> > +#define do_each_pid_task(pid, type, task) \
>
> Hmm, defining do_each_pid_task right after do_each_task_pid could result
> in some frustration :)
That's all right - developers can read the comments to work out what each
one does.
<seems I'm having a sarcastic day>
Andrew Morton <[email protected]> writes:
> On Tue, 15 Aug 2006 22:10:43 -0500
> "Serge E. Hallyn" <[email protected]> wrote:
>
>> Quoting Eric W. Biederman ([email protected]):
>> > To avoid pid rollover confusion the kernel needs to work with
>> > struct pid * instead of pid_t. Currently there is not an iterator
>> > that walks through all of the tasks of a given pid type starting
>> > with a struct pid. This prevents us replacing some pid_t instances
>> > with struct pid. So this patch adds do_each_pid_task which walks
>> > through the set of task for a given pid type starting with a struct pid.
>> >
>> > Signed-off-by: Eric W. Biederman <[email protected]>
>> > ---
>> > include/linux/pid.h | 13 +++++++++++++
>> > 1 files changed, 13 insertions(+), 0 deletions(-)
>> >
>> > diff --git a/include/linux/pid.h b/include/linux/pid.h
>> > index 93da7e2..4007114 100644
>> > --- a/include/linux/pid.h
>> > +++ b/include/linux/pid.h
>> > @@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
>> > 1; }) ); \
>> > }
>> >
>> > +#define do_each_pid_task(pid, type, task) \
>>
>> Hmm, defining do_each_pid_task right after do_each_task_pid could result
>> in some frustration :)
>
> That's all right - developers can read the comments to work out what each
> one does.
>
> <seems I'm having a sarcastic day>
The plan is to that having both of them in the same kernel should
be a short lived thing. There are few enough of these perhaps I should
just replace all of them at once.
Doing this gradually and reviewably seems to require that I have
both versions of the API simultaneously. The core problem here
is that there aren't many good names.
Eric
Dave Hansen <[email protected]> writes:
> On Tue, 2006-08-15 at 13:00 -0600, Eric W. Biederman wrote:
>> Dave Hansen <[email protected]> writes:
>>
>> > On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
>> >> +static inline pid_t pid_nr(struct pid *pid)
>> >> +{
>> >> + pid_t nr = 0;
>> >> + if (pid)
>> >> + nr = pid->nr;
>> >> + return nr;
>> >> +}
>> >
>> > When is it valid to be passing around a NULL 'struct pid *'?
>>
>> When you don't have one at all. Look at the fcntl case a few
>> patches later, or even the spawnpid case.
>
> Does the fcntl() one originate from anywhere other than find_pid() in
> f_setown()? It seems like, perhaps, the error checking is being done at
> the wrong level.
Yes. It is normally NULL as file handles don't usually have a process
to send a signal to.
>> Then of course there is the later chaos when we get to pid spaces
>> where depending on the pid namespace you are in when you call this
>> on a given struct pid sometimes you will get a pid value and sometimes
>> you won't.
>
> OK, I think it is makes sense to me to say 'get_pid(tsk, pidspace)' and
> get back a NULL 'struct pid' if that task isn't visible in that
> namespace. However, I don't get how it is handy to be able to defer the
> fact that the pid wasn't found until you go do a pid_nr() on that NULL.
If having a pid to signal is optional, which it almost always is,
being able to handle that case easily and return a 0 to user space is helpful.
So far it is my assumption that everything that looks up a pid and converts
a pid to a number will do it with respect to the current processes pidspace.
A very reasonable assumption and only with siginfo have I found a case where
it looks like I might not be able to implement it that way.
Eric
"Serge E. Hallyn" <[email protected]> writes:
> Quoting Eric W. Biederman ([email protected]):
>> To avoid pid rollover confusion the kernel needs to work with
>> struct pid * instead of pid_t. Currently there is not an iterator
>> that walks through all of the tasks of a given pid type starting
>> with a struct pid. This prevents us replacing some pid_t instances
>> with struct pid. So this patch adds do_each_pid_task which walks
>> through the set of task for a given pid type starting with a struct pid.
>>
>> Signed-off-by: Eric W. Biederman <[email protected]>
>> ---
>> include/linux/pid.h | 13 +++++++++++++
>> 1 files changed, 13 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/pid.h b/include/linux/pid.h
>> index 93da7e2..4007114 100644
>> --- a/include/linux/pid.h
>> +++ b/include/linux/pid.h
>> @@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
>> 1; }) ); \
>> }
>>
>> +#define do_each_pid_task(pid, type, task) \
>
> Hmm, defining do_each_pid_task right after do_each_task_pid could result
> in some frustration :)
>
> Though not sure of a better name - do_each_task_structpid?
A couple of things.
- I think do_each_pid_task is probably the most descriptive name
I can come up with. As these are tasks of a pid. I don't quite understand
where the old name came from.
- Whatever we pick for the new function is the name we are going to use
for a long time so I don't want it to be clumsy. The existing function
will go away after everything is updated.
- If you pick the wrong one you will get a compile error, as integers and
pointers are not very interchangeable.
I hate the temporary confusion but I don't see a better alternative.
Eric
> Alan Cox <[email protected]> writes:
>
>
>>Ar Maw, 2006-08-15 am 12:23 -0600, ysgrifennodd Eric W. Biederman:
>>
>>>This keeps the wrong process from being notified if the
>>>daemon to spawn a new console dies.
>>
>>Not sure why we count pids not task structs but within the proposed
>>implementation this appears correct so
>
>
> Basically struct pid is relatively cheap, 64bytes or so.
> struct task is expensive 10K or so, when all of the stacks
> and everything are included.
>
> Counting pids allows the task to exit in user space and free up
> all of it's memory.
>
> When /proc used to count the task struct it was fairly easy to
> deliberately oom a 32bit machine just by open up directories in
> /proc and then having the process exit. rlimits didn't help because
> we don't count processes that have exited.
hey, hey. your patch doesn't help in this situation (much)!
inodes and dentries can still consume much memory.
it just makes the situation a bit better.
I wonder whether it is easy to have the following done with your implementation:
container tasks visible from host. theoretically having 2 pids (vpid, pid)
it should be implementable. Do you see any obstacles?
in other regards patches look pretty good. Good job!
Kirill
> On Tue, 2006-08-15 at 12:23 -0600, Eric W. Biederman wrote:
>
>>+static inline struct pid *task_pid(struct task_struct *task)
>>+{
>>+ return task->pids[PIDTYPE_PID].pid;
>>+}
>
>
> Does this mean we can start to deprecate the use of tsk->pid?
Thats the final goal imho :)))
Kirill
Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > Quoting Eric W. Biederman ([email protected]):
> >> To avoid pid rollover confusion the kernel needs to work with
> >> struct pid * instead of pid_t. Currently there is not an iterator
> >> that walks through all of the tasks of a given pid type starting
> >> with a struct pid. This prevents us replacing some pid_t instances
> >> with struct pid. So this patch adds do_each_pid_task which walks
> >> through the set of task for a given pid type starting with a struct pid.
> >>
> >> Signed-off-by: Eric W. Biederman <[email protected]>
> >> ---
> >> include/linux/pid.h | 13 +++++++++++++
> >> 1 files changed, 13 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/include/linux/pid.h b/include/linux/pid.h
> >> index 93da7e2..4007114 100644
> >> --- a/include/linux/pid.h
> >> +++ b/include/linux/pid.h
> >> @@ -118,4 +118,17 @@ #define while_each_task_pid(who, type, t
> >> 1; }) ); \
> >> }
> >>
> >> +#define do_each_pid_task(pid, type, task) \
> >
> > Hmm, defining do_each_pid_task right after do_each_task_pid could result
> > in some frustration :)
> >
> > Though not sure of a better name - do_each_task_structpid?
>
> A couple of things.
> - I think do_each_pid_task is probably the most descriptive name
> I can come up with. As these are tasks of a pid. I don't quite understand
> where the old name came from.
>
> - Whatever we pick for the new function is the name we are going to use
> for a long time so I don't want it to be clumsy. The existing function
> will go away after everything is updated.
Ok. If the old iterator is going away that's fine.
I was under the impression we'd still need to be able to do the loop
based on pid_t as well.
thanks,
-serge
On 08/15, Eric W. Biederman wrote:
>
> +static inline pid_t pid_nr(struct pid *pid)
> +{
> + pid_t nr = 0;
> + if (pid)
> + nr = pid->nr;
> + return nr;
> +}
I think this is not safe, you need rcu locks here or the caller should
do some locking.
Let's look at f_getown() (PATCH 7/7). What if original task which was
pointed by ->f_owner.pid has gone, another thread does fcntl(F_SETOWN),
and pid_nr() takes a preemtion after 'if (pid)'? In this case 'pid->nr'
may follow a freed memory.
Oleg.
On 08/15, Eric W. Biederman wrote:
>
> File handles can be requested to send sigio and sigurg
> to processes. By tracking the destination processes
> using struct pid instead of pid_t we make the interface
> safe from all potential pid wrap around problems.
As I can see, this patch adds 2 user visible changes. I am
not arguing, but probably it makes sense to document this.
Before this patch, fcntl(F_GETOWN) returns the same pid that
was given to fcntl(F_SETOWN). Now it returns 0 if there were
no process/group with such a pid when F_SETOWN was called.
The second change is good (I'd say this is bugfix). It is
not possible anymore to send the signal to not yet created
processes via fcntl(F_SETOWN, last_pid + a_little), or hit
a problem with pid re-use.
Oleg.
Kirill Korotaev <[email protected]> writes:
>> Alan Cox <[email protected]> writes:
>>
>>>Ar Maw, 2006-08-15 am 12:23 -0600, ysgrifennodd Eric W. Biederman:
>>>
>>>>This keeps the wrong process from being notified if the
>>>>daemon to spawn a new console dies.
>>>
>>>Not sure why we count pids not task structs but within the proposed
>>>implementation this appears correct so
>> Basically struct pid is relatively cheap, 64bytes or so.
>> struct task is expensive 10K or so, when all of the stacks
>> and everything are included.
>> Counting pids allows the task to exit in user space and free up
>> all of it's memory.
>> When /proc used to count the task struct it was fairly easy to
>> deliberately oom a 32bit machine just by open up directories in
>> /proc and then having the process exit. rlimits didn't help because
>> we don't count processes that have exited.
> hey, hey. your patch doesn't help in this situation (much)!
> inodes and dentries can still consume much memory.
> it just makes the situation a bit better.
It does help. The size of the locked memory is better and we
account for the used inodes and dentries. As I recall the inode+dentry
is less than 1k. The number of open file handle rlimit at least
lets the existing limits work. The core improvement is not allowing
an escape from our accounting mechanism for one of our biggest data
structures.
> I wonder whether it is easy to have the following done with your implementation:
> container tasks visible from host. theoretically having 2 pids (vpid, pid)
> it should be implementable. Do you see any obstacles?
I seen no problem for having tasks have N pid numbers. The trick
is to create a hash table entry (struct pid) that when has a pointer
to the real struct pid. It won't be the classic pid,vpid. All pids
will be equally real.
My intention is to give a process a pid in each namespace that it has
an ancestor in.
> in other regards patches look pretty good. Good job!
Thanks.
Eric
On 08/16, Oleg Nesterov wrote:
>
> On 08/15, Eric W. Biederman wrote:
> >
> > File handles can be requested to send sigio and sigurg
> > to processes. By tracking the destination processes
> > using struct pid instead of pid_t we make the interface
> > safe from all potential pid wrap around problems.
>
> ....
>
> The second change is good (I'd say this is bugfix). It is
> not possible anymore to send the signal to not yet created
> processes via fcntl(F_SETOWN, last_pid + a_little), or hit
> a problem with pid re-use.
Damn, I read the patch but forgot to read the changelog :)
It clearly says "safe from all potential pid wrap around problems",
sorry.
Oleg.
On 08/15, Eric W. Biederman wrote:
>
> +#define do_each_pid_task(pid, type, task) \
> + if ((task = pid_task(pid, type))) { \
> + prefetch(pid_next(task, type)); \
> + do {
> +
> +#define while_each_pid_task(pid, type, task) \
> + } while (pid_next(task, type) && ({ \
> + task = pid_next_task(task, type); \
> + rcu_dereference(task); \
> + prefetch(pid_next(task, type)); \
> + 1; }) ); \
> + }
A small nit. Suppose a non-leader thread blocks a signal, and does
exec. There is a window when we have 2 tasks with the same pid in
PIDTYPE_PID namespace. If send_sigio() send the signal in that
window, it will be delivered twice to process.
Oleg.
On 08/15, Eric W. Biederman wrote:
>
> diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
> index 28eff1a..d7e0187 100644
> --- a/drivers/char/vt_ioctl.c
> +++ b/drivers/char/vt_ioctl.c
> @@ -645,12 +645,13 @@ #endif
> */
> case KDSIGACCEPT:
> {
> - extern int spawnpid, spawnsig;
> + struct pid *spawnpid;
^^^^^^^^^^^^^^^^^^^^
Should be "extern struct pid *spawnpid" ?
Oleg.
On Wed, 16 Aug 2006 23:35:57 +0400
Oleg Nesterov <[email protected]> wrote:
> On 08/15, Eric W. Biederman wrote:
> >
> > diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
> > index 28eff1a..d7e0187 100644
> > --- a/drivers/char/vt_ioctl.c
> > +++ b/drivers/char/vt_ioctl.c
> > @@ -645,12 +645,13 @@ #endif
> > */
> > case KDSIGACCEPT:
> > {
> > - extern int spawnpid, spawnsig;
> > + struct pid *spawnpid;
> ^^^^^^^^^^^^^^^^^^^^
> Should be "extern struct pid *spawnpid" ?
>
It was updated thusly: (the identifiers are a bit generic-sounding though)
From: Eric Biederman <[email protected]>
This moves the declaration of spawnpid and spawnsig into, a header
file, and makes spawnpid extern. Without being extern the declaration
of swanpid was a nice little struct pid leak in the kernel and totally
broke the spawnpid functionality :(
Signed-off-by: Eric Biederman <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
drivers/char/vt_ioctl.c | 2 --
include/linux/vt_kern.h | 3 +++
2 files changed, 3 insertions(+), 2 deletions(-)
diff -puN drivers/char/vt_ioctl.c~vt-update-spawnpid-to-be-a-struct-pid_t-tidy drivers/char/vt_ioctl.c
--- a/drivers/char/vt_ioctl.c~vt-update-spawnpid-to-be-a-struct-pid_t-tidy
+++ a/drivers/char/vt_ioctl.c
@@ -645,8 +645,6 @@ int vt_ioctl(struct tty_struct *tty, str
*/
case KDSIGACCEPT:
{
- struct pid *spawnpid;
- extern int spawnsig;
if (!perm || !capable(CAP_KILL))
return -EPERM;
if (!valid_signal(arg) || arg < 1 || arg == SIGKILL)
diff -puN include/linux/vt_kern.h~vt-update-spawnpid-to-be-a-struct-pid_t-tidy include/linux/vt_kern.h
--- a/include/linux/vt_kern.h~vt-update-spawnpid-to-be-a-struct-pid_t-tidy
+++ a/include/linux/vt_kern.h
@@ -84,4 +84,7 @@ void reset_vc(struct vc_data *vc);
extern char con_buf[CON_BUF_SIZE];
extern struct semaphore con_buf_sem;
+extern struct pid *spawnpid;
+extern int spawnsig;
+
#endif /* _VT_KERN_H */
_
Oleg Nesterov <[email protected]> writes:
> On 08/15, Eric W. Biederman wrote:
>>
>> +static inline pid_t pid_nr(struct pid *pid)
>> +{
>> + pid_t nr = 0;
>> + if (pid)
>> + nr = pid->nr;
>> + return nr;
>> +}
>
> I think this is not safe, you need rcu locks here or the caller should
> do some locking.
>
> Let's look at f_getown() (PATCH 7/7). What if original task which was
> pointed by ->f_owner.pid has gone, another thread does fcntl(F_SETOWN),
> and pid_nr() takes a preemtion after 'if (pid)'? In this case 'pid->nr'
> may follow a freed memory.
This isn't an rcu reference. I hold a hard reference count on
the pid entry. So this should be safe.
What is an rcu reference is going from struct pid to the task
it points to.
Eric
>> +static inline pid_t pid_nr(struct pid *pid)
>> +{
>> + pid_t nr = 0;
>> + if (pid)
>> + nr = pid->nr;
>> + return nr;
>> +}
>
>When is it valid to be passing around a NULL 'struct pid *'?
Is 0 even the right thing to return in the rare case that pid == NULL?
-1 maybe?
Jan Engelhardt
--
On 08/16, Eric W. Biederman wrote:
> Oleg Nesterov <[email protected]> writes:
>
> > On 08/15, Eric W. Biederman wrote:
> >>
> >> +static inline pid_t pid_nr(struct pid *pid)
> >> +{
> >> + pid_t nr = 0;
> >> + if (pid)
> >> + nr = pid->nr;
> >> + return nr;
> >> +}
> >
> > I think this is not safe, you need rcu locks here or the caller should
> > do some locking.
> >
> > Let's look at f_getown() (PATCH 7/7). What if original task which was
> > pointed by ->f_owner.pid has gone, another thread does fcntl(F_SETOWN),
> > and pid_nr() takes a preemtion after 'if (pid)'? In this case 'pid->nr'
> > may follow a freed memory.
>
> This isn't an rcu reference. I hold a hard reference count on
> the pid entry. So this should be safe.
-static void f_modown(struct file *filp, unsigned long pid,
+static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
uid_t uid, uid_t euid, int force)
{
write_lock_irq(&filp->f_owner.lock);
if (force || !filp->f_owner.pid) {
- filp->f_owner.pid = pid;
+ put_pid(filp->f_owner.pid);
This 'put_pid()' can actually free 'struct pid' if the task/group
has already gone away. Another thread doing f_getown() can access
a freed memory, no?
> What is an rcu reference is going from struct pid to the task
> it points to.
Yes, you are right... But I'd say it is going form task to pid :)
Oleg.
Oleg Nesterov <[email protected]> writes:
> On 08/16, Eric W. Biederman wrote:
>> Oleg Nesterov <[email protected]> writes:
>>
>> > On 08/15, Eric W. Biederman wrote:
>> >>
>> >> +static inline pid_t pid_nr(struct pid *pid)
>> >> +{
>> >> + pid_t nr = 0;
>> >> + if (pid)
>> >> + nr = pid->nr;
>> >> + return nr;
>> >> +}
>> >
>> > I think this is not safe, you need rcu locks here or the caller should
>> > do some locking.
>> >
>> > Let's look at f_getown() (PATCH 7/7). What if original task which was
>> > pointed by ->f_owner.pid has gone, another thread does fcntl(F_SETOWN),
>> > and pid_nr() takes a preemtion after 'if (pid)'? In this case 'pid->nr'
>> > may follow a freed memory.
>>
>> This isn't an rcu reference. I hold a hard reference count on
>> the pid entry. So this should be safe.
>
> -static void f_modown(struct file *filp, unsigned long pid,
> +static void f_modown(struct file *filp, struct pid *pid, enum pid_type
> type,
> uid_t uid, uid_t euid, int force)
> {
> write_lock_irq(&filp->f_owner.lock);
> if (force || !filp->f_owner.pid) {
> - filp->f_owner.pid = pid;
> + put_pid(filp->f_owner.pid);
>
> This 'put_pid()' can actually free 'struct pid' if the task/group
> has already gone away. Another thread doing f_getown() can access
> a freed memory, no?
Good point. In that case it looks like I need to hold the f_owner.lock.
Something needs to serialize that.
Fun. I touch the code and find a place where we didn't take a lock
and accidentally relied on integer operations being atomic.
I will see about working up a fix for that.
Eric
Jan Engelhardt <[email protected]> writes:
>>> +static inline pid_t pid_nr(struct pid *pid)
>>> +{
>>> + pid_t nr = 0;
>>> + if (pid)
>>> + nr = pid->nr;
>>> + return nr;
>>> +}
>>
>>When is it valid to be passing around a NULL 'struct pid *'?
>
> Is 0 even the right thing to return in the rare case that pid == NULL?
> -1 maybe?
I believe 0 is what we have used elsewhere.
A negative value for a pid is likely to be taken as the group of all
processes, or -EPERM, and it does really confusing things when fed through
the current f_getown implementation, with PIDTYPE_PGRP.
The only other possible interpretations of 0 are the idle process and
yourself. There is still some potential there, but largely 0 is much
less likely to be confused than -1. Especially as it doesn't change
when you negate it.
Eric
Andrew Morton <[email protected]> writes:
> On Wed, 16 Aug 2006 23:35:57 +0400
> Oleg Nesterov <[email protected]> wrote:
>
>> On 08/15, Eric W. Biederman wrote:
>> >
>> > diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
>> > index 28eff1a..d7e0187 100644
>> > --- a/drivers/char/vt_ioctl.c
>> > +++ b/drivers/char/vt_ioctl.c
>> > @@ -645,12 +645,13 @@ #endif
>> > */
>> > case KDSIGACCEPT:
>> > {
>> > - extern int spawnpid, spawnsig;
>> > + struct pid *spawnpid;
>> ^^^^^^^^^^^^^^^^^^^^
>> Should be "extern struct pid *spawnpid" ?
>>
>
> It was updated thusly: (the identifiers are a bit generic-sounding though)
I need to relook at this. But I believe Oleg found an idiom bug in
f_getown that will also apply here. Basically if the update is not
an atomic transaction then we need to hold a lock when updating the
struct pid pointer so updates and uses of the pointer don't race.
Assuming I need to address that. I will place spawnpid, spawnsig,
and their lock into a structure and see if I can give it a little
bit better name.
My only defense. I didn't change the name in the first place. :)
Eric
On top of Eric's recent include/linux/pid.h changes.
I think it is hardly possible to read the current do_each_task_pid().
The new version is much simpler and makes the code smaller.
Only the do_each_task_pid change is tested, the do_each_pid_task isn't.
Eric, could you take a hard look at this patch?
Signed-off-by: Oleg Nesterov <[email protected]>
--- 2.6.18-rc3/include/linux/pid.h~ 2006-08-16 20:40:05.000000000 +0400
+++ 2.6.18-rc3/include/linux/pid.h 2006-08-18 00:45:06.000000000 +0400
@@ -99,42 +99,29 @@ static inline pid_t pid_nr(struct pid *p
return nr;
}
-#define pid_next(task, type) \
- ((task)->pids[(type)].node.next)
-#define pid_next_task(task, type) \
- hlist_entry(pid_next(task, type), struct task_struct, \
- pids[(type)].node)
-
-
-/* We could use hlist_for_each_entry_rcu here but it takes more arguments
- * than the do_each_task_pid/while_each_task_pid. So we roll our own
- * to preserve the existing interface.
- */
-#define do_each_task_pid(who, type, task) \
- if ((task = find_task_by_pid_type(type, who))) { \
- prefetch(pid_next(task, type)); \
- do {
-
-#define while_each_task_pid(who, type, task) \
- } while (pid_next(task, type) && ({ \
- task = pid_next_task(task, type); \
- rcu_dereference(task); \
- prefetch(pid_next(task, type)); \
- 1; }) ); \
- }
-
-#define do_each_pid_task(pid, type, task) \
- if ((task = pid_task(pid, type))) { \
- prefetch(pid_next(task, type)); \
- do {
-
-#define while_each_pid_task(pid, type, task) \
- } while (pid_next(task, type) && ({ \
- task = pid_next_task(task, type); \
- rcu_dereference(task); \
- prefetch(pid_next(task, type)); \
- 1; }) ); \
- }
+#define do_each_task_pid(who, type, task) \
+ do { \
+ struct hlist_node *pos___; \
+ struct pid *pid___ = find_pid(who); \
+ if (pid___ != NULL) \
+ hlist_for_each_entry_rcu((task), pos___, \
+ &pid___->tasks[type], pids[type].node) {
+
+#define while_each_task_pid(who, type, task) \
+ } \
+ } while (0)
+
+
+#define do_each_pid_task(pid, type, task) \
+ do { \
+ struct hlist_node *pos___; \
+ if (pid != NULL) \
+ hlist_for_each_entry_rcu((task), pos___, \
+ &pid->tasks[type], pids[type].node) {
+
+#define while_each_pid_task(pid, type, task) \
+ } \
+ } while (0)
#endif /* _LINUX_PID_H */