Commit 925d1c401fa6cfd0df5d2e37da8981494ccdec07 aka "procfs task exe symlink"
introduced struct mm_struct::exe_file and struct
mm_struct::num_exe_file_vmas.
The rationale is weak: unifying MMU and no-MMU version of /proc/*/exe code.
For this a) struct mm_struct becomes bigger, b) mmap/munmap/exit become slower,
c) patch adds more code than removes in fact.
->exe_file may be well defined, but doesn't make sense always. After the original
executable is unmapped, /proc/*/exe will still report it and, more importantly,
pin the corresponding struct file.
->num_exe_file_vmas is even worse -- it just counts number of executable
file-backed VMAs even if those VMAs aren't related to ->exe_file.
After commit 8feae13110d60cc6287afabc2887366b0eb226c2 aka
"NOMMU: Make VMAs per MM as for MMU-mode linux" no-MMU kernels also
maintain list of VMAs in ->mmap, so we can switch back for MMU version
of /proc/*/exe.
Signed-off-by: Alexey Dobriyan <[email protected]>
---
fs/binfmt_flat.c | 3 -
fs/exec.c | 2
fs/proc/base.c | 105 +++++++++++++----------------------------------
include/linux/mm.h | 12 -----
include/linux/mm_types.h | 6 --
include/linux/proc_fs.h | 20 --------
kernel/fork.c | 3 -
mm/mmap.c | 22 +--------
mm/nommu.c | 16 -------
9 files changed, 37 insertions(+), 152 deletions(-)
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -531,8 +531,7 @@ static int load_flat_file(struct linux_binprm * bprm,
DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
down_write(&current->mm->mmap_sem);
- textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
- MAP_PRIVATE|MAP_EXECUTABLE, 0);
+ textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE, 0);
up_write(&current->mm->mmap_sem);
if (!textpos || textpos >= (unsigned long) -4096) {
if (!textpos)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -958,8 +958,6 @@ int flush_old_exec(struct linux_binprm * bprm)
if (retval)
goto out;
- set_mm_exe_file(bprm->mm, bprm->file);
-
/*
* Release all of the old mmap stuff
*/
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -211,6 +211,36 @@ static int proc_root_link(struct inode *inode, struct path *path)
return result;
}
+static int proc_exe_link(struct inode *inode, struct path *path)
+{
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+
+ tsk = get_proc_task(inode);
+ if (!tsk)
+ return -ENOENT;
+ mm = get_task_mm(tsk);
+ put_task_struct(tsk);
+ if (!mm)
+ return -ENOENT;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(&vma->vm_file->f_path);
+
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return 0;
+ }
+ }
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return -ENOENT;
+}
+
/*
* Return zero if current may access user memory in @task, -error if not.
*/
@@ -1265,81 +1295,6 @@ static const struct file_operations proc_pid_sched_operations = {
#endif
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
- mm->num_exe_file_vmas--;
- if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
- fput(mm->exe_file);
- mm->exe_file = NULL;
- }
-
-}
-
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
-{
- if (new_exe_file)
- get_file(new_exe_file);
- if (mm->exe_file)
- fput(mm->exe_file);
- mm->exe_file = new_exe_file;
- mm->num_exe_file_vmas = 0;
-}
-
-struct file *get_mm_exe_file(struct mm_struct *mm)
-{
- struct file *exe_file;
-
- /* We need mmap_sem to protect against races with removal of
- * VM_EXECUTABLE vmas */
- down_read(&mm->mmap_sem);
- exe_file = mm->exe_file;
- if (exe_file)
- get_file(exe_file);
- up_read(&mm->mmap_sem);
- return exe_file;
-}
-
-void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
- /* It's safe to write the exe_file pointer without exe_file_lock because
- * this is called during fork when the task is not yet in /proc */
- newmm->exe_file = get_mm_exe_file(oldmm);
-}
-
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
-{
- struct task_struct *task;
- struct mm_struct *mm;
- struct file *exe_file;
-
- task = get_proc_task(inode);
- if (!task)
- return -ENOENT;
- mm = get_task_mm(task);
- put_task_struct(task);
- if (!mm)
- return -ENOENT;
- exe_file = get_mm_exe_file(mm);
- mmput(mm);
- if (exe_file) {
- *exe_path = exe_file->f_path;
- path_get(&exe_file->f_path);
- fput(exe_file);
- return 0;
- } else
- return -ENOENT;
-}
-
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1118,18 +1118,6 @@ extern void exit_mmap(struct mm_struct *);
extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);
-#ifdef CONFIG_PROC_FS
-/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
-extern void added_exe_file_vma(struct mm_struct *mm);
-extern void removed_exe_file_vma(struct mm_struct *mm);
-#else
-static inline void added_exe_file_vma(struct mm_struct *mm)
-{}
-
-static inline void removed_exe_file_vma(struct mm_struct *mm)
-{}
-#endif /* CONFIG_PROC_FS */
-
extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
extern int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -265,12 +265,6 @@ struct mm_struct {
*/
struct task_struct *owner;
#endif
-
-#ifdef CONFIG_PROC_FS
- /* store ref to file /proc/<pid>/exe symlink points to */
- struct file *exe_file;
- unsigned long num_exe_file_vmas;
-#endif
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -192,12 +192,6 @@ extern void proc_net_remove(struct net *net, const char *name);
extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
struct proc_dir_entry *parent);
-/* While the {get|set|dup}_mm_exe_file functions are for mm_structs, they are
- * only needed to implement /proc/<pid>|self/exe so we define them here. */
-extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
-extern struct file *get_mm_exe_file(struct mm_struct *mm);
-extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
-
#else
#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
@@ -244,20 +238,6 @@ static inline int pid_ns_prepare_proc(struct pid_namespace *ns)
static inline void pid_ns_release_proc(struct pid_namespace *ns)
{
}
-
-static inline void set_mm_exe_file(struct mm_struct *mm,
- struct file *new_exe_file)
-{}
-
-static inline struct file *get_mm_exe_file(struct mm_struct *mm)
-{
- return NULL;
-}
-
-static inline void dup_mm_exe_file(struct mm_struct *oldmm,
- struct mm_struct *newmm)
-{}
-
#endif /* CONFIG_PROC_FS */
#if !defined(CONFIG_PROC_KCORE)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -482,7 +482,6 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
list_del(&mm->mmlist);
@@ -605,8 +604,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
if (init_new_context(tsk, mm))
goto fail_nocontext;
- dup_mm_exe_file(oldmm, mm);
-
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -235,11 +235,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
- }
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
@@ -636,11 +633,8 @@ again: remove_next = 1 + (end > next->vm_end);
spin_unlock(&mapping->i_mmap_lock);
if (remove_next) {
- if (file) {
+ if (file)
fput(file);
- if (next->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
- }
mm->map_count--;
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
@@ -1192,8 +1186,6 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if (vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
@@ -1849,11 +1841,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
}
vma_set_policy(new, pol);
- if (new->vm_file) {
+ if (new->vm_file)
get_file(new->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -2200,11 +2189,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file) {
+ if (new_vma->vm_file)
get_file(new_vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -726,11 +726,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
kenter("%p", vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
- }
put_nommu_region(vma->vm_region);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -1223,10 +1220,6 @@ unsigned long do_mmap_pgoff(struct file *file,
get_file(file);
vma->vm_file = file;
get_file(file);
- if (vm_flags & VM_EXECUTABLE) {
- added_exe_file_vma(current->mm);
- vma->vm_mm = current->mm;
- }
}
down_write(&nommu_region_sem);
@@ -1364,11 +1357,8 @@ share:
error_put_region:
__put_nommu_region(region);
if (vma) {
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
- }
kmem_cache_free(vm_area_cachep, vma);
}
kleave(" = %d [pr]", ret);
@@ -1380,8 +1370,6 @@ error:
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
kmem_cache_free(vm_area_cachep, vma);
kleave(" = %d", ret);
return ret;
On March 23rd Alexey Dobriyan <[email protected]> wrote:
> Commit 925d1c401fa6cfd0df5d2e37da8981494ccdec07 aka "procfs task exe symlink".
> introduced struct mm_struct::exe_file and struct mm_struct::num_exe_file_vmas.
>
> The rationale is weak: unifying MMU and no-MMU version of /proc/*/exe
> code. For this a) struct mm_struct becomes bigger, b) mmap/munmap/exit become
> slower, c) patch adds more code than removes in fact. ->exe_file maybe well
Your patch adds a VMA walk with the mmap semaphore held when opening
/proc/*/exe. That gives unrelated tasks the opportunity to contend on each
other's mmap semaphores just by doing a readlink on /proc/*/exe.
Did you have any measurements to report showing an improvement in performance
with your patch or are you expecting it to be "slower" based purely on code
inspection?
(Which reminds me of this thread: http://lkml.org/lkml/2009/3/11/114 )
> defined, but doesn't make sense always. After original executable is unmapped,
> /proc/*/exe will still report it and, more importantly, pin corresponding
> struct file. ->num_exe_file_vmas is even worse -- it just counts number of
->exe_file does not pin the file. It uses the num_exe_file_vmas count to drop
the mm->exe_file reference when the last executable VMA is gone. Since
MAP_EXECUTABLE (and hence VM_EXECUTABLE) is only applied to the first executable
file opened during exec (stored in ->exe_file) and since userspace can't use it
(the mmap syscall code masks it out), num_exe_file_vmas counts VMAs related to
exe_file.
So ->exe_file does not pin the file any more or less than the old VMA does...
> executable file-backed VMAs even if those VMAs aren't related to ->exe_file.
and this part of your argument is incorrect as well.
> After commit 8feae13110d60cc6287afabc2887366b0eb226c2 aka
> "NOMMU: Make VMAs per MM as for MMU-mode linux" no-MMU kernels also
> maintain list of VMAs in ->mmap, so we can switch back for MMU version
> of /proc/*/exe.
Unless the executable is fully unmapped ->exe_file won't change even if
something else manages to get mapped at a lower address. In contrast, the MMU
version is sensitive to changes in the order of the VMAs over the lifetime of
the task. It has worked in the past because MAP_EXECUTABLE only gets applied
during exec, MAP_EXECUTABLE can only be applied by the kernel, all binary
format handlers place the executable low in the mmap, and nothing else
maps MAP_EXECUTABLE regions. While ->exe_file still relies on
MAP_EXECUTABLE, it does not care about the mapped address because it
does not walk the VMAs.
Cheers,
-Matt Helsley
On Tue, Mar 31, 2009 at 05:14:27PM -0700, Matt Helsley wrote:
> On March 23rd Alexey Dobriyan <[email protected]> wrote:
> > Commit 925d1c401fa6cfd0df5d2e37da8981494ccdec07 aka "procfs task exe symlink".
> > introduced struct mm_struct::exe_file and struct mm_struct::num_exe_file_vmas.
> >
> > The rationale is weak: unifying MMU and no-MMU version of /proc/*/exe
> > code. For this a) struct mm_struct becomes bigger, b) mmap/munmap/exit become
> > slower, c) patch adds more code than removes in fact. ->exe_file maybe well
>
> Your patch adds a VMA walk with the mmap semaphore held when openning
> /proc/*/exe. That gives unrelated tasks the opportunity to contend on each
> others mmap semaphores just by doing a readlink on /proc/*/exe.
This is dealt with in proc_fd_access_allowed() with a ptrace check.
> Did you have any measurements to report showing an improvement in performance
> with your patch or are you expecting it to be "slower" based purely on code
> inspection?
The performance is restored to previous levels, because code became the same
as before ->exe_file.
> > defined, but doesn't make sense always. After original executable is unmapped,
> > /proc/*/exe will still report it and, more importantly, pin corresponding
> > struct file. ->num_exe_file_vmas is even worse -- it just counts number of
>
> ->exe_file does not pin the file. It uses the num_exe_file_vmas count to drop
> the mm->exe_file reference when the last executable VMA is gone. Since
> MAP_EXECUTABLE (and hence VM_EXECUTABLE) is only applied to the first executable
> file opened during exec (stored in ->exe_file) and since userspace can't use it
> (the mmap syscall code masks it out), num_exe_file_vmas counts VMAs related to
> exe_file.
OK.
> So ->exe_file does not pin the file any more or less than the old VMA does...
>
> > executable file-backed VMAs even if those VMAs aren't related to ->exe_file.
>
> and this part of your argument is incorrect as well.
>
> > After commit 8feae13110d60cc6287afabc2887366b0eb226c2 aka
> > "NOMMU: Make VMAs per MM as for MMU-mode linux" no-MMU kernels also
> > maintain list of VMAs in ->mmap, so we can switch back for MMU version
> > of /proc/*/exe.
>
> Unless the executable is fully unmapped ->exe_file won't change even if
> something else manages to get mapped at a lower address. In contrast, the MMU
> version is sensitive to changes in the order of the VMAs over the lifetime of
> the task. It has worked in the past because MAP_EXECUTABLE only gets applied
> during exec, MAP_EXECUTABLE can only be applied by the kernel, all binary
> format handlers place the executable low in the mmap, and nothing else
> maps MAP_EXECUTABLE regions. While ->exe_file still relies on
> MAP_EXECUTABLE, it does not care about the mapped address because it
> does not walk the VMAs.
Both the VMA walk for /proc/*/exe and ->exe_file are heuristics and rely on
the fact that userspace can't create explicit VM_EXECUTABLE mappings.
Take that out and ->exe_file becomes incorrect by design -- and removing
that incorrectness is the point of the patch.
Matt Helsley <[email protected]> wrote:
> all binary format handlers place the executable low in the mmap
Not so, at least not on NOMMU, and possibly not so with randomised executable
mappings either.
David