2007-05-30 00:53:13

by Matt Helsley

[permalink] [raw]
Subject: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

This patch avoids holding the mmap semaphore while walking VMAs in response to
programs which read or follow the /proc/<pid|self>/exe symlink. This also allows us
to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate
reference to the executable file stored in the task struct and increased code in
fork, exec, and exit paths.

Signed-off-by: Matt Helsley <[email protected]>
---

Compiled and passed simple tests for regressions when patched against a 2.6.20
and 2.6.22-rc2-mm1 kernel.

fs/exec.c | 5 +++--
fs/proc/base.c | 20 ++++++++++++++++++++
fs/proc/internal.h | 1 -
fs/proc/task_mmu.c | 34 ----------------------------------
fs/proc/task_nommu.c | 34 ----------------------------------
include/linux/sched.h | 1 +
kernel/exit.c | 2 ++
kernel/fork.c | 10 +++++++++-
8 files changed, 35 insertions(+), 72 deletions(-)

Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
+++ linux-2.6.22-rc2-mm1/include/linux/sched.h
@@ -988,10 +988,11 @@ struct task_struct {
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by flush_old_exec */
+ struct file *exe_file;
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
Index: linux-2.6.22-rc2-mm1/fs/exec.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/fs/exec.c
+++ linux-2.6.22-rc2-mm1/fs/exec.c
@@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
read_unlock(&binfmt_lock);
retval = fn(bprm, regs);
if (retval >= 0) {
put_binfmt(fmt);
allow_write_access(bprm->file);
- if (bprm->file)
- fput(bprm->file);
+ if (current->exe_file)
+ fput(current->exe_file);
+ current->exe_file = bprm->file;
bprm->file = NULL;
current->did_exec = 1;
proc_exec_connector(current);
return retval;
}
Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ linux-2.6.22-rc2-mm1/fs/proc/base.c
@@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
.write = sched_write,
.llseek = seq_lseek,
.release = seq_release,
};

+static int proc_exe_link(struct inode *inode, struct dentry **dentry,
+ struct vfsmount **mnt)
+{
+ int error;
+ struct task_struct *task;
+
+ task = get_proc_task(inode);
+ if (!task)
+ return -ENOENT;
+ error = -ENOSYS;
+ if (!task->exe_file)
+ goto out;
+ *mnt = mntget(task->exe_file->f_path.mnt);
+ *dentry = dget(task->exe_file->f_path.dentry);
+ error = 0;
+out:
+ put_task_struct(task);
+ return error;
+}
+
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
int error = -EACCES;

Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,12 @@ fastcall void do_exit(long code)
if (unlikely(tsk->audit_context))
audit_free(tsk);

taskstats_exit(tsk, group_dead);

+ if (tsk->exe_file)
+ fput(tsk->exe_file);
exit_mm(tsk);

if (group_dead)
acct_process();
exit_sem(tsk);
Index: linux-2.6.22-rc2-mm1/kernel/fork.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
+++ linux-2.6.22-rc2-mm1/kernel/fork.c
@@ -1163,10 +1163,13 @@ static struct task_struct *copy_process(

/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;
p->exit_state = 0;
+ p->exe_file = current->exe_file;
+ if (p->exe_file)
+ get_file(p->exe_file);

/*
* Ok, make it visible to the rest of the system.
* We dont wake it up yet.
*/
@@ -1218,11 +1221,11 @@ static struct task_struct *copy_process(
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_cleanup_namespaces;
+ goto bad_fork_cleanup_exe_file;
}

if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
@@ -1274,10 +1277,15 @@ static struct task_struct *copy_process(
put_user(p->pid, parent_tidptr);

proc_fork_connector(p);
return p;

+bad_fork_cleanup_exe_file:
+ if (p->exe_file) {
+ fput(p->exe_file);
+ p->exe_file = NULL;
+ }
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_keys:
exit_keys(p);
bad_fork_cleanup_mm:
Index: linux-2.6.22-rc2-mm1/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/fs/proc/task_mmu.c
+++ linux-2.6.22-rc2-mm1/fs/proc/task_mmu.c
@@ -71,44 +71,10 @@ int task_statm(struct mm_struct *mm, int
*data = mm->total_vm - mm->shared_vm;
*resident = *shared + get_mm_counter(mm, anon_rss);
return mm->total_vm;
}

-int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
-{
- struct vm_area_struct * vma;
- int result = -ENOENT;
- struct task_struct *task = get_proc_task(inode);
- struct mm_struct * mm = NULL;
-
- if (task) {
- mm = get_task_mm(task);
- put_task_struct(task);
- }
- if (!mm)
- goto out;
- down_read(&mm->mmap_sem);
-
- vma = mm->mmap;
- while (vma) {
- if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
- break;
- vma = vma->vm_next;
- }
-
- if (vma) {
- *mnt = mntget(vma->vm_file->f_path.mnt);
- *dentry = dget(vma->vm_file->f_path.dentry);
- result = 0;
- }
-
- up_read(&mm->mmap_sem);
- mmput(mm);
-out:
- return result;
-}
-
static void pad_len_spaces(struct seq_file *m, int len)
{
len = 25 + sizeof(void*) * 6 - len;
if (len < 1)
len = 1;
Index: linux-2.6.22-rc2-mm1/fs/proc/task_nommu.c
===================================================================
--- linux-2.6.22-rc2-mm1.orig/fs/proc/task_nommu.c
+++ linux-2.6.22-rc2-mm1/fs/proc/task_nommu.c
@@ -102,44 +102,10 @@ int task_statm(struct mm_struct *mm, int
up_read(&mm->mmap_sem);
*resident = size;
return size;
}

-int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
-{
- struct vm_list_struct *vml;
- struct vm_area_struct *vma;
- struct task_struct *task = get_proc_task(inode);
- struct mm_struct *mm = get_task_mm(task);
- int result = -ENOENT;
-
- if (!mm)
- goto out;
- down_read(&mm->mmap_sem);
-
- vml = mm->context.vmlist;
- vma = NULL;
- while (vml) {
- if ((vml->vma->vm_flags & VM_EXECUTABLE) && vml->vma->vm_file) {
- vma = vml->vma;
- break;
- }
- vml = vml->next;
- }
-
- if (vma) {
- *mnt = mntget(vma->vm_file->f_path.mnt);
- *dentry = dget(vma->vm_file->f_path.dentry);
- result = 0;
- }
-
- up_read(&mm->mmap_sem);
- mmput(mm);
-out:
- return result;
-}
-
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
static int show_map(struct seq_file *m, void *_vml)
{
Index: linux-2.6.22-rc2-mm1/fs/proc/internal.h
===================================================================
--- linux-2.6.22-rc2-mm1.orig/fs/proc/internal.h
+++ linux-2.6.22-rc2-mm1/fs/proc/internal.h
@@ -38,11 +38,10 @@ extern int nommu_vma_show(struct seq_fil
#endif

extern int maps_protect;

extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
-extern int proc_exe_link(struct inode *, struct dentry **, struct vfsmount **);
extern int proc_tid_stat(struct task_struct *, char *);
extern int proc_tgid_stat(struct task_struct *, char *);
extern int proc_pid_status(struct task_struct *, char *);
extern int proc_pid_statm(struct task_struct *, char *);
extern loff_t mem_lseek(struct file * file, loff_t offset, int orig);



2007-05-30 18:09:35

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

Quoting Matt Helsley ([email protected]):
> This patch avoids holding the mmap semaphore while walking VMAs in response to
> programs which read or follow the /proc/<pid|self>/exe symlink. This also allows us
> to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate
> reference to the executable file stored in the task struct and increased code in
> fork, exec, and exit paths.
>
> Signed-off-by: Matt Helsley <[email protected]>
> ---
>
> Compiled and passed simple tests for regressions when patched against a 2.6.20
> and 2.6.22-rc2-mm1 kernel.
>
> fs/exec.c | 5 +++--
> fs/proc/base.c | 20 ++++++++++++++++++++
> fs/proc/internal.h | 1 -
> fs/proc/task_mmu.c | 34 ----------------------------------
> fs/proc/task_nommu.c | 34 ----------------------------------
> include/linux/sched.h | 1 +
> kernel/exit.c | 2 ++
> kernel/fork.c | 10 +++++++++-
> 8 files changed, 35 insertions(+), 72 deletions(-)
>
> Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
> +++ linux-2.6.22-rc2-mm1/include/linux/sched.h
> @@ -988,10 +988,11 @@ struct task_struct {
> int oomkilladj; /* OOM kill score adjustment (bit shift). */
> char comm[TASK_COMM_LEN]; /* executable name excluding path
> - access with [gs]et_task_comm (which lock
> it with task_lock())
> - initialized normally by flush_old_exec */
> + struct file *exe_file;
> /* file system info */
> int link_count, total_link_count;
> #ifdef CONFIG_SYSVIPC
> /* ipc stuff */
> struct sysv_sem sysvsem;
> Index: linux-2.6.22-rc2-mm1/fs/exec.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/fs/exec.c
> +++ linux-2.6.22-rc2-mm1/fs/exec.c
> @@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
> read_unlock(&binfmt_lock);
> retval = fn(bprm, regs);
> if (retval >= 0) {
> put_binfmt(fmt);
> allow_write_access(bprm->file);
> - if (bprm->file)
> - fput(bprm->file);
> + if (current->exe_file)
> + fput(current->exe_file);
> + current->exe_file = bprm->file;
> bprm->file = NULL;
> current->did_exec = 1;
> proc_exec_connector(current);
> return retval;
> }
> Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
> +++ linux-2.6.22-rc2-mm1/fs/proc/base.c
> @@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
> .write = sched_write,
> .llseek = seq_lseek,
> .release = seq_release,
> };
>
> +static int proc_exe_link(struct inode *inode, struct dentry **dentry,
> + struct vfsmount **mnt)
> +{
> + int error;
> + struct task_struct *task;
> +
> + task = get_proc_task(inode);
> + if (!task)
> + return -ENOENT;
> + error = -ENOSYS;
> + if (!task->exe_file)
> + goto out;
> + *mnt = mntget(task->exe_file->f_path.mnt);
> + *dentry = dget(task->exe_file->f_path.dentry);
> + error = 0;
> +out:
> + put_task_struct(task);
> + return error;
> +}
> +
> static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
> {
> struct inode *inode = dentry->d_inode;
> int error = -EACCES;
>
> Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> if (unlikely(tsk->audit_context))
> audit_free(tsk);
>
> taskstats_exit(tsk, group_dead);
>
> + if (tsk->exe_file)
> + fput(tsk->exe_file);

Hi,

just taking a cursory look so I may be missing something, but doesn't
this leave the possibility that right here, with tsk->exe_file being
put, another task would try to look at tsk's /proc/tsk->pid/exe?

thanks,
-serge


> exit_mm(tsk);
>
> if (group_dead)
> acct_process();
> exit_sem(tsk);
> Index: linux-2.6.22-rc2-mm1/kernel/fork.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
> +++ linux-2.6.22-rc2-mm1/kernel/fork.c
> @@ -1163,10 +1163,13 @@ static struct task_struct *copy_process(
>
> /* ok, now we should be set up.. */
> p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
> p->pdeath_signal = 0;
> p->exit_state = 0;
> + p->exe_file = current->exe_file;
> + if (p->exe_file)
> + get_file(p->exe_file);
>
> /*
> * Ok, make it visible to the rest of the system.
> * We dont wake it up yet.
> */
> @@ -1218,11 +1221,11 @@ static struct task_struct *copy_process(
> recalc_sigpending();
> if (signal_pending(current)) {
> spin_unlock(&current->sighand->siglock);
> write_unlock_irq(&tasklist_lock);
> retval = -ERESTARTNOINTR;
> - goto bad_fork_cleanup_namespaces;
> + goto bad_fork_cleanup_exe_file;
> }
>
> if (clone_flags & CLONE_THREAD) {
> p->group_leader = current->group_leader;
> list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
> @@ -1274,10 +1277,15 @@ static struct task_struct *copy_process(
> put_user(p->pid, parent_tidptr);
>
> proc_fork_connector(p);
> return p;
>
> +bad_fork_cleanup_exe_file:
> + if (p->exe_file) {
> + fput(p->exe_file);
> + p->exe_file = NULL;
> + }
> bad_fork_cleanup_namespaces:
> exit_task_namespaces(p);
> bad_fork_cleanup_keys:
> exit_keys(p);
> bad_fork_cleanup_mm:
> Index: linux-2.6.22-rc2-mm1/fs/proc/task_mmu.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/task_mmu.c
> +++ linux-2.6.22-rc2-mm1/fs/proc/task_mmu.c
> @@ -71,44 +71,10 @@ int task_statm(struct mm_struct *mm, int
> *data = mm->total_vm - mm->shared_vm;
> *resident = *shared + get_mm_counter(mm, anon_rss);
> return mm->total_vm;
> }
>
> -int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
> -{
> - struct vm_area_struct * vma;
> - int result = -ENOENT;
> - struct task_struct *task = get_proc_task(inode);
> - struct mm_struct * mm = NULL;
> -
> - if (task) {
> - mm = get_task_mm(task);
> - put_task_struct(task);
> - }
> - if (!mm)
> - goto out;
> - down_read(&mm->mmap_sem);
> -
> - vma = mm->mmap;
> - while (vma) {
> - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
> - break;
> - vma = vma->vm_next;
> - }
> -
> - if (vma) {
> - *mnt = mntget(vma->vm_file->f_path.mnt);
> - *dentry = dget(vma->vm_file->f_path.dentry);
> - result = 0;
> - }
> -
> - up_read(&mm->mmap_sem);
> - mmput(mm);
> -out:
> - return result;
> -}
> -
> static void pad_len_spaces(struct seq_file *m, int len)
> {
> len = 25 + sizeof(void*) * 6 - len;
> if (len < 1)
> len = 1;
> Index: linux-2.6.22-rc2-mm1/fs/proc/task_nommu.c
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/task_nommu.c
> +++ linux-2.6.22-rc2-mm1/fs/proc/task_nommu.c
> @@ -102,44 +102,10 @@ int task_statm(struct mm_struct *mm, int
> up_read(&mm->mmap_sem);
> *resident = size;
> return size;
> }
>
> -int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
> -{
> - struct vm_list_struct *vml;
> - struct vm_area_struct *vma;
> - struct task_struct *task = get_proc_task(inode);
> - struct mm_struct *mm = get_task_mm(task);
> - int result = -ENOENT;
> -
> - if (!mm)
> - goto out;
> - down_read(&mm->mmap_sem);
> -
> - vml = mm->context.vmlist;
> - vma = NULL;
> - while (vml) {
> - if ((vml->vma->vm_flags & VM_EXECUTABLE) && vml->vma->vm_file) {
> - vma = vml->vma;
> - break;
> - }
> - vml = vml->next;
> - }
> -
> - if (vma) {
> - *mnt = mntget(vma->vm_file->f_path.mnt);
> - *dentry = dget(vma->vm_file->f_path.dentry);
> - result = 0;
> - }
> -
> - up_read(&mm->mmap_sem);
> - mmput(mm);
> -out:
> - return result;
> -}
> -
> /*
> * display mapping lines for a particular process's /proc/pid/maps
> */
> static int show_map(struct seq_file *m, void *_vml)
> {
> Index: linux-2.6.22-rc2-mm1/fs/proc/internal.h
> ===================================================================
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/internal.h
> +++ linux-2.6.22-rc2-mm1/fs/proc/internal.h
> @@ -38,11 +38,10 @@ extern int nommu_vma_show(struct seq_fil
> #endif
>
> extern int maps_protect;
>
> extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
> -extern int proc_exe_link(struct inode *, struct dentry **, struct vfsmount **);
> extern int proc_tid_stat(struct task_struct *, char *);
> extern int proc_tgid_stat(struct task_struct *, char *);
> extern int proc_pid_status(struct task_struct *, char *);
> extern int proc_pid_statm(struct task_struct *, char *);
> extern loff_t mem_lseek(struct file * file, loff_t offset, int orig);
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2007-05-31 06:02:20

by Chris Wright

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

* Serge E. Hallyn ([email protected]) wrote:
> > ===================================================================
> > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > if (unlikely(tsk->audit_context))
> > audit_free(tsk);
> >
> > taskstats_exit(tsk, group_dead);
> >
> > + if (tsk->exe_file)
> > + fput(tsk->exe_file);
>
> just taking a cursory look so I may be missing something, but doesn't
> this leave the possibility that right here, with tsk->exe_file being
> put, another task would try to look at tsk's /proc/tsk->pid/exe?

And I hit this one, so there's at least one issue.

[ 110.296952] Unable to handle kernel NULL pointer dereference at 0000000000000088 RIP:
[ 110.299053] [<ffffffff80293fca>] d_path+0x1a/0x117
[ 110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0
[ 110.303509] Oops: 0000 [1] SMP
[ 110.304719] CPU 1
[ 110.305493] Modules linked in: oprofile
[ 110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty #183
[ 110.309733] RIP: 0010:[<ffffffff80293fca>] [<ffffffff80293fca>] d_path+0x1a/0x117
[ 110.312635] RSP: 0018:ffff810142335e38 EFLAGS: 00010292
[ 110.314667] RAX: ffff81006d58a000 RBX: 0000000000000000 RCX: 0000000000001000
[ 110.317397] RDX: ffff81006d58a000 RSI: 0000000000000000 RDI: 0000000000000000
[ 110.320127] RBP: ffff81006d58a000 R08: 00000000fffffff3 R09: 000000000006be8b
[ 110.322857] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[ 110.325588] R13: 0000000000001000 R14: ffff81006d58a000 R15: 00000000000000000
[ 110.328319] FS: 00002b033d578260(0000) GS:ffff81000106e480(0000) knlGS:0000000000000000
[ 110.331415] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 110.333613] CR2: 0000000000000088 CR3: 000000006cf54000 CR4: 00000000000006e0
[ 110.336344] Process pidof (pid: 3983, threadinfo ffff810142334000, task ffff8101421186c0)
[ 110.339472] Stack: ffff8101422a7268 0000000000000000 ffff81006d58a000 00000000fffffff4
[ 110.342556] 0000000000000000 0000000000001000 0000000000678820 ffffffff802b7a54
[ 110.345404] 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 110.348180] Call Trace:
[ 110.349188] [<ffffffff802b7a54>] proc_pid_readlink+0x89/0xff
[ 110.351387] [<ffffffff80285e55>] sys_readlinkat+0x87/0xa9
[ 110.353487] [<ffffffff8026d4dc>] remove_vma+0x5d/0x64
[ 110.355455] [<ffffffff80596acd>] error_exit+0x0/0x84
[ 110.357389] [<ffffffff8020935e>] system_call+0x7e/0x83
[ 110.359388]
[ 110.359958]
[ 110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 74
[ 110.363381] RIP [<ffffffff80293fca>] d_path+0x1a/0x117
[ 110.365386] RSP <ffff810142335e38>
[ 110.366720] CR2: 0000000000000088

2007-05-31 17:56:45

by Matt Helsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> Quoting Matt Helsley ([email protected]):
> > This patch avoids holding the mmap semaphore while walking VMAs in response to
> > programs which read or follow the /proc/<pid|self>/exe symlink. This also allows us
> > to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate
> > reference to the executable file stored in the task struct and increased code in
> > fork, exec, and exit paths.
> >
> > Signed-off-by: Matt Helsley <[email protected]>
> > ---
> >
> > Compiled and passed simple tests for regressions when patched against a 2.6.20
> > and 2.6.22-rc2-mm1 kernel.
> >
> > fs/exec.c | 5 +++--
> > fs/proc/base.c | 20 ++++++++++++++++++++
> > fs/proc/internal.h | 1 -
> > fs/proc/task_mmu.c | 34 ----------------------------------
> > fs/proc/task_nommu.c | 34 ----------------------------------
> > include/linux/sched.h | 1 +
> > kernel/exit.c | 2 ++
> > kernel/fork.c | 10 +++++++++-
> > 8 files changed, 35 insertions(+), 72 deletions(-)

<snip>

> > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > ===================================================================
> > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > if (unlikely(tsk->audit_context))
> > audit_free(tsk);
> >
> > taskstats_exit(tsk, group_dead);
> >
> > + if (tsk->exe_file)
> > + fput(tsk->exe_file);
>
> Hi,
>
> just taking a cursory look so I may be missing something, but doesn't
> this leave the possibility that right here, with tsk->exe_file being
> put, another task would try to look at tsk's /proc/tsk->pid/exe?
>
> thanks,
> -serge
>
> exit_mm(tsk);
>

<snip>

Good question. To be precise, I think the problem doesn't exist here but
after the exit_mm() because there's a VMA that holds a reference to the
same file.

The existing code appears to solve the race between
reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
returning -ENOENT for the case where there is no executable VMA with a
reference to the file backing it.

So I need to put NULL in the exe_file field and adjust the return value
to be -ENOENT instead of -ENOSYS.

Thanks for the review!

Cheers,
-Matt Helsley

2007-06-01 01:14:19

by Matt Helsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

On Wed, 2007-05-30 at 23:01 -0700, Chris Wright wrote:
> * Serge E. Hallyn ([email protected]) wrote:
> > > ===================================================================
> > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > > if (unlikely(tsk->audit_context))
> > > audit_free(tsk);
> > >
> > > taskstats_exit(tsk, group_dead);
> > >
> > > + if (tsk->exe_file)
> > > + fput(tsk->exe_file);
> >
> > just taking a cursory look so I may be missing something, but doesn't
> > this leave the possibility that right here, with tsk->exe_file being
> > put, another task would try to look at tsk's /proc/tsk->pid/exe?
>
> And I hit this one, so there's at least one issue.
>
> [ 110.296952] Unable to handle kernel NULL pointer dereference at 0000000000000088 RIP:
> [ 110.299053] [<ffffffff80293fca>] d_path+0x1a/0x117
> [ 110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0
> [ 110.303509] Oops: 0000 [1] SMP
> [ 110.304719] CPU 1
> [ 110.305493] Modules linked in: oprofile
> [ 110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty #183
> [ 110.309733] RIP: 0010:[<ffffffff80293fca>] [<ffffffff80293fca>] d_path+0x1a/0x117
> [ 110.312635] RSP: 0018:ffff810142335e38 EFLAGS: 00010292
> [ 110.314667] RAX: ffff81006d58a000 RBX: 0000000000000000 RCX: 0000000000001000
> [ 110.317397] RDX: ffff81006d58a000 RSI: 0000000000000000 RDI: 0000000000000000
> [ 110.320127] RBP: ffff81006d58a000 R08: 00000000fffffff3 R09: 000000000006be8b
> [ 110.322857] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> [ 110.325588] R13: 0000000000001000 R14: ffff81006d58a000 R15: 00000000000000000
> [ 110.328319] FS: 00002b033d578260(0000) GS:ffff81000106e480(0000) knlGS:0000000000000000
> [ 110.331415] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 110.333613] CR2: 0000000000000088 CR3: 000000006cf54000 CR4: 00000000000006e0
> [ 110.336344] Process pidof (pid: 3983, threadinfo ffff810142334000, task ffff8101421186c0)
> [ 110.339472] Stack: ffff8101422a7268 0000000000000000 ffff81006d58a000 00000000fffffff4
> [ 110.342556] 0000000000000000 0000000000001000 0000000000678820 ffffffff802b7a54
> [ 110.345404] 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [ 110.348180] Call Trace:
> [ 110.349188] [<ffffffff802b7a54>] proc_pid_readlink+0x89/0xff
> [ 110.351387] [<ffffffff80285e55>] sys_readlinkat+0x87/0xa9
> [ 110.353487] [<ffffffff8026d4dc>] remove_vma+0x5d/0x64
> [ 110.355455] [<ffffffff80596acd>] error_exit+0x0/0x84
> [ 110.357389] [<ffffffff8020935e>] system_call+0x7e/0x83
> [ 110.359388]
> [ 110.359958]
> [ 110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 74
> [ 110.363381] RIP [<ffffffff80293fca>] d_path+0x1a/0x117
> [ 110.365386] RSP <ffff810142335e38>
> [ 110.366720] CR2: 0000000000000088

Hmm, at first blush this does appear to be related to the bug Serge
pointed out. I'm not certain though, so I'll see if I can spot/produce
any other problems that could explain this.

Thanks for testing my patch!

Cheers,
-Matt Helsley

2007-06-01 22:32:18

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

Quoting Matt Helsley ([email protected]):
> On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> > Quoting Matt Helsley ([email protected]):
> > > This patch avoids holding the mmap semaphore while walking VMAs in response to
> > > programs which read or follow the /proc/<pid|self>/exe symlink. This also allows us
> > > to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate
> > > reference to the executable file stored in the task struct and increased code in
> > > fork, exec, and exit paths.
> > >
> > > Signed-off-by: Matt Helsley <[email protected]>
> > > ---
> > >
> > > Compiled and passed simple tests for regressions when patched against a 2.6.20
> > > and 2.6.22-rc2-mm1 kernel.
> > >
> > > fs/exec.c | 5 +++--
> > > fs/proc/base.c | 20 ++++++++++++++++++++
> > > fs/proc/internal.h | 1 -
> > > fs/proc/task_mmu.c | 34 ----------------------------------
> > > fs/proc/task_nommu.c | 34 ----------------------------------
> > > include/linux/sched.h | 1 +
> > > kernel/exit.c | 2 ++
> > > kernel/fork.c | 10 +++++++++-
> > > 8 files changed, 35 insertions(+), 72 deletions(-)
>
> <snip>
>
> > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > > ===================================================================
> > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > > if (unlikely(tsk->audit_context))
> > > audit_free(tsk);
> > >
> > > taskstats_exit(tsk, group_dead);
> > >
> > > + if (tsk->exe_file)
> > > + fput(tsk->exe_file);
> >
> > Hi,
> >
> > just taking a cursory look so I may be missing something, but doesn't
> > this leave the possibility that right here, with tsk->exe_file being
> > put, another task would try to look at tsk's /proc/tsk->pid/exe?
> >
> > thanks,
> > -serge
> >
> > exit_mm(tsk);
> >
>
> <snip>
>
> Good question. To be precise, I think the problem doesn't exist here but
> after the exit_mm() because there's a VMA that holds a reference to the
> same file.
>
> The existing code appears to solve the race between
> reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
> returning -ENOENT for the case where there is no executable VMA with a
> reference to the file backing it.
>
> So I need to put NULL in the exe_file field and adjust the return value
> to be -ENOENT instead of -ENOSYS.
>
> Thanks for the review!

Ok, I had to think about this a bit, but so you're saying you set it to
NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file
before the fput in do_exit() should be ok because the vma hasn't yet
been put?

Should the
if (!task->exe_file)
goto out;
*mnt = mntget(task->exe_file->f_path.mnt);
*dentry = dget(task->exe_file->f_path.dentry);

also go inside an preempt_disable to prevent sleeping and maybe become

exef = task->exe_file; /* to prevent task->exe_file being set
to NULL before we've grabbed the path */
if (!exef)
goto out;
get_file(exef); /* to prevent the mm somehow being put before
we've grabbed the path? */
*mnt = mntget(task->exe_file->f_path.mnt);
*dentry = dget(task->exe_file->f_path.dentry);
put_file(exef); /* ? */

?

Or am I being overly paranoid?

thanks,
-serge

2007-06-02 00:43:39

by Matt Helsley

[permalink] [raw]
Subject: Re: [RFC][PATCH] Replacing the /proc/<pid|self>/exe symlink code

On Fri, 2007-06-01 at 17:31 -0500, Serge E. Hallyn wrote:
> Quoting Matt Helsley ([email protected]):
> > On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> > > Quoting Matt Helsley ([email protected]):
> > > > This patch avoids holding the mmap semaphore while walking VMAs in response to
> > > > programs which read or follow the /proc/<pid|self>/exe symlink. This also allows us
> > > > to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate
> > > > reference to the executable file stored in the task struct and increased code in
> > > > fork, exec, and exit paths.
> > > >
> > > > Signed-off-by: Matt Helsley <[email protected]>
> > > > ---
> > > >
> > > > Compiled and passed simple tests for regressions when patched against a 2.6.20
> > > > and 2.6.22-rc2-mm1 kernel.
> > > >
> > > > fs/exec.c | 5 +++--
> > > > fs/proc/base.c | 20 ++++++++++++++++++++
> > > > fs/proc/internal.h | 1 -
> > > > fs/proc/task_mmu.c | 34 ----------------------------------
> > > > fs/proc/task_nommu.c | 34 ----------------------------------
> > > > include/linux/sched.h | 1 +
> > > > kernel/exit.c | 2 ++
> > > > kernel/fork.c | 10 +++++++++-
> > > > 8 files changed, 35 insertions(+), 72 deletions(-)
> >
> > <snip>
> >
> > > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > > > ===================================================================
> > > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > > > if (unlikely(tsk->audit_context))
> > > > audit_free(tsk);
> > > >
> > > > taskstats_exit(tsk, group_dead);
> > > >
> > > > + if (tsk->exe_file)
> > > > + fput(tsk->exe_file);
> > >
> > > Hi,
> > >
> > > just taking a cursory look so I may be missing something, but doesn't
> > > this leave the possibility that right here, with tsk->exe_file being
> > > put, another task would try to look at tsk's /proc/tsk->pid/exe?
> > >
> > > thanks,
> > > -serge
> > >
> > > exit_mm(tsk);
> > >
> >
> > <snip>
> >
> > Good question. To be precise, I think the problem doesn't exist here but
> > after the exit_mm() because there's a VMA that holds a reference to the
> > same file.
> >
> > The existing code appears to solve the race between
> > reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
> > returning -ENOENT for the case where there is no executable VMA with a
> > reference to the file backing it.
> >
> > So I need to put NULL in the exe_file field and adjust the return value
> > to be -ENOENT instead of -ENOSYS.
> >
> > Thanks for the review!
>
> Ok, I had to think about this a bit, but so you're saying you set it to
> NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file
> before the fput in do_exit() should be ok because the vma hasn't yet
> been put?

Yes

> Should the
> if (!task->exe_file)
> goto out;
> *mnt = mntget(task->exe_file->f_path.mnt);
> *dentry = dget(task->exe_file->f_path.dentry);
>
> also go inside an preempt_disable to prevent sleeping and maybe become

It needs some form of protection from concurrent access between write
and read. write happens during exec, fork, and exit. In the fork case
however it's not necessary because the new task isn't visible in /proc
yet and the value of current doesn't change anyway.

> exef = task->exe_file; /* to prevent task->exe_file being set
> to NULL before we've grabbed the path */
> if (!exef)
> goto out;
> get_file(exef); /* to prevent the mm somehow being put before
> we've grabbed the path? */
> *mnt = mntget(task->exe_file->f_path.mnt);
> *dentry = dget(task->exe_file->f_path.dentry);
> put_file(exef); /* ? */
>
> ?
>
> Or am I being overly paranoid?

No, you're right. In fact, because readlink() can be initiated by
another task I think disabling preemption really only fixes the problem
on uniprocessor systems. So I was thinking of using task_lock() to
protect exe_file.

Cheers,
-Matt Helsley