Contents:
1) takes unhash_process() out of sched.h and into kernel/exit.c, moves zeroing
->pid into it (and under tasklist_lock).
2) new helper in fs/proc/base.c - name_to_int(dentry) returns ~0U if name
doesn't match 0|[1-9][0-9]* or is too large. Otherwise it returns
numeric value of name. proc_pid_lookup() and proc_lookupfd() converted.
3) sane dentry retention. Namely, we don't kill /proc/<pid> dentries at the
first opportunity (as the current tree does). Instead we do the following:
* ->d_delete() kills it only if process is already dead.
* all ->lookup() in proc/base.c end by checking whether the process is still
alive and unhashing the dentry if it isn't.
* proc_pid_lookup() (lookup for /proc/<pid>) caches reference to dentry
in task_struct. It's _not_ counted in ->d_count.
* ->d_iput() resets said reference to NULL.
* release_task() (burying a zombie) checks if there is a cached
reference and if there is - shrinks the subtree.
* tasklist_lock is used for exclusion.
That way we are guaranteed that after release_task() all dentries in
/proc/<pid> will go away as soon as possible; OTOH, before release_task()
we have normal retention policy - they go away under memory pressure with
the same rules as for dentries on any other fs.
4) preparation for a sane policy for /proc/<pid>/fd/* - we don't store
struct file * in these inodes anymore.
5) sane retention policy for /proc/<pid>/fd/* - ->d_revalidate() says
"kill it" if the descriptor is no longer open (in addition to the checks
for the task being dead) and we allow dentries of /proc/<pid>/fd/<n> to
stay around.
Patchset eliminates a _lot_ of allocation/freeing/guaranteed negative dcache
lookups for procfs. It seems to be working here, but I would really appreciate
help with testing/review.
First chunk follows, the rest will go in separate mails.
diff -urN C8-0/include/linux/sched.h C8-unhash_process/include/linux/sched.h
--- C8-0/include/linux/sched.h Sun Apr 14 17:53:12 2002
+++ C8-unhash_process/include/linux/sched.h Fri Apr 19 01:16:35 2002
@@ -769,15 +769,7 @@
#define thread_group_leader(p) (p->pid == p->tgid)
-static inline void unhash_process(struct task_struct *p)
-{
- write_lock_irq(&tasklist_lock);
- nr_threads--;
- unhash_pid(p);
- REMOVE_LINKS(p);
- list_del(&p->thread_group);
- write_unlock_irq(&tasklist_lock);
-}
+extern void unhash_process(struct task_struct *p);
/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */
static inline void task_lock(struct task_struct *p)
diff -urN C8-0/kernel/exit.c C8-unhash_process/kernel/exit.c
--- C8-0/kernel/exit.c Sun Apr 14 17:53:13 2002
+++ C8-unhash_process/kernel/exit.c Fri Apr 19 01:16:35 2002
@@ -27,6 +27,17 @@
int getrusage(struct task_struct *, int, struct rusage *);
+static inline void __unhash_process(struct task_struct *p)
+{
+ write_lock_irq(&tasklist_lock);
+ nr_threads--;
+ unhash_pid(p);
+ REMOVE_LINKS(p);
+ list_del(&p->thread_group);
+ p->pid = 0;
+ write_unlock_irq(&tasklist_lock);
+}
+
static void release_task(struct task_struct * p)
{
if (p == current)
@@ -43,8 +54,14 @@
current->cmaj_flt += p->maj_flt + p->cmaj_flt;
current->cnswap += p->nswap + p->cnswap;
sched_exit(p);
- p->pid = 0;
put_task_struct(p);
+}
+
+/* we are using it only for SMP init */
+
+void unhash_process(struct task_struct *p)
+{
+ return __unhash_process(p);
}
/*
diff -urN C8-name_to_int/fs/proc/base.c C8-retain_dentry/fs/proc/base.c
--- C8-name_to_int/fs/proc/base.c Fri Apr 19 01:17:11 2002
+++ C8-retain_dentry/fs/proc/base.c Fri Apr 19 01:17:36 2002
@@ -747,7 +747,7 @@
* directory. In this case, however, we can do it - no aliasing problems
* due to the way we treat inodes.
*/
-static int pid_base_revalidate(struct dentry * dentry, int flags)
+static int pid_revalidate(struct dentry * dentry, int flags)
{
if (proc_task(dentry->d_inode)->pid)
return 1;
@@ -755,25 +755,42 @@
return 0;
}
-static int pid_delete_dentry(struct dentry * dentry)
+static void pid_base_iput(struct dentry *dentry, struct inode *inode)
+{
+ struct task_struct *task = proc_task(inode);
+ write_lock_irq(&tasklist_lock);
+ if (task->proc_dentry == dentry)
+ task->proc_dentry = NULL;
+ write_unlock_irq(&tasklist_lock);
+ iput(inode);
+}
+
+static int pid_fd_delete_dentry(struct dentry * dentry)
{
return 1;
}
+static int pid_delete_dentry(struct dentry * dentry)
+{
+ return proc_task(dentry->d_inode)->pid == 0;
+}
+
static struct dentry_operations pid_fd_dentry_operations =
{
d_revalidate: pid_fd_revalidate,
- d_delete: pid_delete_dentry,
+ d_delete: pid_fd_delete_dentry,
};
static struct dentry_operations pid_dentry_operations =
{
+ d_revalidate: pid_revalidate,
d_delete: pid_delete_dentry,
};
static struct dentry_operations pid_base_dentry_operations =
{
- d_revalidate: pid_base_revalidate,
+ d_revalidate: pid_revalidate,
+ d_iput: pid_base_iput,
d_delete: pid_delete_dentry,
};
@@ -842,6 +859,8 @@
inode->i_mode |= S_IWUSR | S_IXUSR;
dentry->d_op = &pid_fd_dentry_operations;
d_add(dentry, inode);
+ if (!proc_task(dentry->d_inode)->pid)
+ d_drop(dentry);
return NULL;
out_unlock2:
@@ -959,6 +978,8 @@
}
dentry->d_op = &pid_dentry_operations;
d_add(dentry, inode);
+ if (!proc_task(dentry->d_inode)->pid)
+ d_drop(dentry);
return NULL;
out:
@@ -1045,6 +1066,11 @@
dentry->d_op = &pid_base_dentry_operations;
d_add(dentry, inode);
+ read_lock(&tasklist_lock);
+ proc_task(dentry->d_inode)->proc_dentry = dentry;
+ read_unlock(&tasklist_lock);
+ if (!proc_task(dentry->d_inode)->pid)
+ d_drop(dentry);
return NULL;
out:
return ERR_PTR(-ENOENT);
diff -urN C8-name_to_int/include/linux/sched.h C8-retain_dentry/include/linux/sched.h
--- C8-name_to_int/include/linux/sched.h Fri Apr 19 01:16:35 2002
+++ C8-retain_dentry/include/linux/sched.h Fri Apr 19 01:17:36 2002
@@ -346,6 +346,7 @@
/* journalling filesystem info */
void *journal_info;
+ struct dentry *proc_dentry;
};
extern void __put_task_struct(struct task_struct *tsk);
diff -urN C8-name_to_int/kernel/exit.c C8-retain_dentry/kernel/exit.c
--- C8-name_to_int/kernel/exit.c Fri Apr 19 01:16:35 2002
+++ C8-retain_dentry/kernel/exit.c Fri Apr 19 01:17:36 2002
@@ -29,13 +29,28 @@
static inline void __unhash_process(struct task_struct *p)
{
+ struct dentry *proc_dentry;
write_lock_irq(&tasklist_lock);
nr_threads--;
unhash_pid(p);
REMOVE_LINKS(p);
list_del(&p->thread_group);
p->pid = 0;
+ proc_dentry = p->proc_dentry;
+ if (unlikely(proc_dentry)) {
+ spin_lock(&dcache_lock);
+ if (!list_empty(&proc_dentry->d_hash)) {
+ dget_locked(proc_dentry);
+ list_del_init(&proc_dentry->d_hash);
+ } else
+ proc_dentry = NULL;
+ spin_unlock(&dcache_lock);
+ }
write_unlock_irq(&tasklist_lock);
+ if (unlikely(proc_dentry)) {
+ shrink_dcache_parent(proc_dentry);
+ dput(proc_dentry);
+ }
}
static void release_task(struct task_struct * p)
diff -urN C8-name_to_int/kernel/fork.c C8-retain_dentry/kernel/fork.c
--- C8-name_to_int/kernel/fork.c Sun Apr 14 17:53:13 2002
+++ C8-retain_dentry/kernel/fork.c Fri Apr 19 01:17:36 2002
@@ -665,6 +665,7 @@
copy_flags(clone_flags, p);
p->pid = get_pid(clone_flags);
+ p->proc_dentry = NULL;
INIT_LIST_HEAD(&p->run_list);
diff -urN C8-unhash_process/fs/proc/base.c C8-name_to_int/fs/proc/base.c
--- C8-unhash_process/fs/proc/base.c Tue Mar 19 16:05:58 2002
+++ C8-name_to_int/fs/proc/base.c Fri Apr 19 01:17:11 2002
@@ -778,34 +778,41 @@
};
/* Lookups */
-#define MAX_MULBY10 ((~0U-9)/10)
+
+static unsigned name_to_int(struct dentry *dentry)
+{
+ const char *name = dentry->d_name.name;
+ int len = dentry->d_name.len;
+ unsigned n = 0;
+
+ if (len > 1 && *name == '0')
+ goto out;
+ while (len-- > 0) {
+ unsigned c = *name++ - '0';
+ if (c > 9)
+ goto out;
+ if (n >= (~0U-9)/10)
+ goto out;
+ n *= 10;
+ n += c;
+ }
+ return n;
+out:
+ return ~0U;
+}
/* SMP-safe */
static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry)
{
- unsigned int fd, c;
struct task_struct *task = proc_task(dir);
+ unsigned fd = name_to_int(dentry);
struct file * file;
struct files_struct * files;
struct inode *inode;
struct proc_inode *ei;
- const char *name;
- int len;
- fd = 0;
- len = dentry->d_name.len;
- name = dentry->d_name.name;
- if (len > 1 && *name == '0') goto out;
- while (len-- > 0) {
- c = *name - '0';
- name++;
- if (c > 9)
- goto out;
- if (fd >= MAX_MULBY10)
- goto out;
- fd *= 10;
- fd += c;
- }
+ if (fd == ~0U)
+ goto out;
inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_FD_DIR+fd);
if (!inode)
@@ -992,17 +999,12 @@
/* SMP-safe */
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry)
{
- unsigned int pid, c;
struct task_struct *task;
- const char *name;
struct inode *inode;
struct proc_inode *ei;
- int len;
+ unsigned pid;
- pid = 0;
- name = dentry->d_name.name;
- len = dentry->d_name.len;
- if (len == 4 && !memcmp(name, "self", 4)) {
+ if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
inode = new_inode(dir->i_sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -1017,18 +1019,9 @@
d_add(dentry, inode);
return NULL;
}
- while (len-- > 0) {
- c = *name - '0';
- name++;
- if (c > 9)
- goto out;
- if (pid >= MAX_MULBY10)
- goto out;
- pid *= 10;
- pid += c;
- if (!pid)
- goto out;
- }
+ pid = name_to_int(dentry);
+ if (pid == ~0U)
+ goto out;
read_lock(&tasklist_lock);
task = find_task_by_pid(pid);
diff -urN C8-file/fs/proc/base.c C8-procfs/fs/proc/base.c
--- C8-file/fs/proc/base.c Fri Apr 19 02:11:56 2002
+++ C8-procfs/fs/proc/base.c Fri Apr 19 01:20:20 2002
@@ -88,6 +88,11 @@
return PROC_I(inode)->task;
}
+static inline int proc_type(struct inode *inode)
+{
+ return PROC_I(inode)->type;
+}
+
ssize_t proc_pid_read_maps(struct task_struct*,struct file*,char*,size_t,loff_t*);
int proc_pid_stat(struct task_struct*,char*);
int proc_pid_status(struct task_struct*,char*);
@@ -99,7 +104,7 @@
struct task_struct *task = proc_task(inode);
struct files_struct *files;
struct file *file;
- int fd = (inode->i_ino & 0xffff) - PROC_PID_FD_DIR;
+ int fd = proc_type(inode) - PROC_PID_FD_DIR;
task_lock(task);
files = task->files;
@@ -735,6 +740,7 @@
*/
get_task_struct(task);
ei->task = task;
+ ei->type = ino;
inode->i_uid = 0;
inode->i_gid = 0;
if (ino == PROC_PID_INO || task_dumpable(task)) {
@@ -753,11 +759,6 @@
/* dentry stuff */
-static int pid_fd_revalidate(struct dentry * dentry, int flags)
-{
- return 0;
-}
-
/*
* Exceptional case: normally we are not allowed to unhash a busy
* directory. In this case, however, we can do it - no aliasing problems
@@ -771,6 +772,31 @@
return 0;
}
+static int pid_fd_revalidate(struct dentry * dentry, int flags)
+{
+ struct task_struct *task = proc_task(dentry->d_inode);
+ int fd = proc_type(dentry->d_inode) - PROC_PID_FD_DIR;
+ struct files_struct *files;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ atomic_inc(&files->count);
+ task_unlock(task);
+ if (files) {
+ read_lock(&files->file_lock);
+ if (fcheck_files(files, fd)) {
+ read_unlock(&files->file_lock);
+ put_files_struct(files);
+ return 1;
+ }
+ read_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+ d_drop(dentry);
+ return 0;
+}
+
static void pid_base_iput(struct dentry *dentry, struct inode *inode)
{
struct task_struct *task = proc_task(inode);
@@ -781,11 +807,6 @@
iput(inode);
}
-static int pid_fd_delete_dentry(struct dentry * dentry)
-{
- return 1;
-}
-
static int pid_delete_dentry(struct dentry * dentry)
{
return proc_task(dentry->d_inode)->pid == 0;
@@ -794,7 +815,7 @@
static struct dentry_operations pid_fd_dentry_operations =
{
d_revalidate: pid_fd_revalidate,
- d_delete: pid_fd_delete_dentry,
+ d_delete: pid_delete_dentry,
};
static struct dentry_operations pid_dentry_operations =
@@ -879,8 +900,8 @@
return NULL;
out_unlock2:
- put_files_struct(files);
read_unlock(&files->file_lock);
+ put_files_struct(files);
out_unlock:
iput(inode);
out:
diff -urN C8-retain_dentry/fs/proc/base.c C8-current/fs/proc/base.c
--- C8-retain_dentry/fs/proc/base.c Fri Apr 19 01:17:36 2002
+++ C8-current/fs/proc/base.c Fri Apr 19 01:39:23 2002
@@ -37,6 +37,52 @@
#define fake_ino(pid,ino) (((pid)<<16)|(ino))
+enum pid_directory_inos {
+ PROC_PID_INO = 2,
+ PROC_PID_STATUS,
+ PROC_PID_MEM,
+ PROC_PID_CWD,
+ PROC_PID_ROOT,
+ PROC_PID_EXE,
+ PROC_PID_FD,
+ PROC_PID_ENVIRON,
+ PROC_PID_CMDLINE,
+ PROC_PID_STAT,
+ PROC_PID_STATM,
+ PROC_PID_MAPS,
+ PROC_PID_CPU,
+ PROC_PID_MOUNTS,
+ PROC_PID_FD_DIR = 0x8000, /* 0x8000-0xffff */
+};
+
+struct pid_entry {
+ int type;
+ int len;
+ char *name;
+ mode_t mode;
+};
+
+#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
+static struct pid_entry base_stuff[] = {
+ E(PROC_PID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR),
+ E(PROC_PID_ENVIRON, "environ", S_IFREG|S_IRUSR),
+ E(PROC_PID_STATUS, "status", S_IFREG|S_IRUGO),
+ E(PROC_PID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
+ E(PROC_PID_STAT, "stat", S_IFREG|S_IRUGO),
+ E(PROC_PID_STATM, "statm", S_IFREG|S_IRUGO),
+#ifdef CONFIG_SMP
+ E(PROC_PID_CPU, "cpu", S_IFREG|S_IRUGO),
+#endif
+ E(PROC_PID_MAPS, "maps", S_IFREG|S_IRUGO),
+ E(PROC_PID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+ E(PROC_PID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_ROOT, "root", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_EXE, "exe", S_IFLNK|S_IRWXUGO),
+ E(PROC_PID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
+ {0,0,NULL,0}
+};
+#undef E
+
static inline struct task_struct *proc_task(struct inode *inode)
{
return PROC_I(inode)->task;
@@ -50,11 +96,28 @@
static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
- struct file *file = PROC_I(inode)->file;
- if (file) {
- *mnt = mntget(file->f_vfsmnt);
- *dentry = dget(file->f_dentry);
- return 0;
+ struct task_struct *task = proc_task(inode);
+ struct files_struct *files;
+ struct file *file;
+ int fd = (inode->i_ino & 0xffff) - PROC_PID_FD_DIR;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ atomic_inc(&files->count);
+ task_unlock(task);
+ if (files) {
+ read_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+ *mnt = mntget(file->f_vfsmnt);
+ *dentry = dget(file->f_dentry);
+ read_unlock(&files->file_lock);
+ put_files_struct(files);
+ return 0;
+ }
+ read_unlock(&files->file_lock);
+ put_files_struct(files);
}
return -ENOENT;
}
@@ -525,52 +588,6 @@
follow_link: proc_pid_follow_link
};
-struct pid_entry {
- int type;
- int len;
- char *name;
- mode_t mode;
-};
-
-enum pid_directory_inos {
- PROC_PID_INO = 2,
- PROC_PID_STATUS,
- PROC_PID_MEM,
- PROC_PID_CWD,
- PROC_PID_ROOT,
- PROC_PID_EXE,
- PROC_PID_FD,
- PROC_PID_ENVIRON,
- PROC_PID_CMDLINE,
- PROC_PID_STAT,
- PROC_PID_STATM,
- PROC_PID_MAPS,
- PROC_PID_CPU,
- PROC_PID_MOUNTS,
- PROC_PID_FD_DIR = 0x8000, /* 0x8000-0xffff */
-};
-
-#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
-static struct pid_entry base_stuff[] = {
- E(PROC_PID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR),
- E(PROC_PID_ENVIRON, "environ", S_IFREG|S_IRUSR),
- E(PROC_PID_STATUS, "status", S_IFREG|S_IRUGO),
- E(PROC_PID_CMDLINE, "cmdline", S_IFREG|S_IRUGO),
- E(PROC_PID_STAT, "stat", S_IFREG|S_IRUGO),
- E(PROC_PID_STATM, "statm", S_IFREG|S_IRUGO),
-#ifdef CONFIG_SMP
- E(PROC_PID_CPU, "cpu", S_IFREG|S_IRUGO),
-#endif
- E(PROC_PID_MAPS, "maps", S_IFREG|S_IRUGO),
- E(PROC_PID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
- E(PROC_PID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
- E(PROC_PID_ROOT, "root", S_IFLNK|S_IRWXUGO),
- E(PROC_PID_EXE, "exe", S_IFLNK|S_IRWXUGO),
- E(PROC_PID_MOUNTS, "mounts", S_IFREG|S_IRUGO),
- {0,0,NULL,0}
-};
-#undef E
-
#define NUMBUF 10
static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
@@ -707,7 +724,6 @@
/* Common stuff */
ei = PROC_I(inode);
ei->task = NULL;
- ei->file = NULL;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_ino = fake_ino(task->pid, ino);
@@ -842,21 +858,20 @@
task_unlock(task);
if (!files)
goto out_unlock;
+ inode->i_mode = S_IFLNK;
read_lock(&files->file_lock);
- file = ei->file = fcheck_files(files, fd);
+ file = fcheck_files(files, fd);
if (!file)
goto out_unlock2;
- get_file(file);
+ if (file->f_mode & 1)
+ inode->i_mode |= S_IRUSR | S_IXUSR;
+ if (file->f_mode & 2)
+ inode->i_mode |= S_IWUSR | S_IXUSR;
read_unlock(&files->file_lock);
put_files_struct(files);
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
- inode->i_mode = S_IFLNK;
ei->op.proc_get_link = proc_fd_link;
- if (file->f_mode & 1)
- inode->i_mode |= S_IRUSR | S_IXUSR;
- if (file->f_mode & 2)
- inode->i_mode |= S_IWUSR | S_IXUSR;
dentry->d_op = &pid_fd_dentry_operations;
d_add(dentry, inode);
if (!proc_task(dentry->d_inode)->pid)
@@ -1078,8 +1093,6 @@
void proc_pid_delete_inode(struct inode *inode)
{
- if (PROC_I(inode)->file)
- fput(PROC_I(inode)->file);
if (proc_task(inode))
put_task_struct(proc_task(inode));
}
diff -urN C8-retain_dentry/include/linux/proc_fs.h C8-current/include/linux/proc_fs.h
--- C8-retain_dentry/include/linux/proc_fs.h Sun Apr 14 17:53:12 2002
+++ C8-current/include/linux/proc_fs.h Fri Apr 19 01:33:23 2002
@@ -212,7 +212,6 @@
int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
int (*proc_read)(struct task_struct *task, char *page);
} op;
- struct file *file;
struct proc_dir_entry *pde;
struct inode vfs_inode;
};
--On Saturday, April 20, 2002 13:56:35 -0400 Alexander Viro <[email protected]> wrote:
> Patchset eliminates a _lot_ of allocation/freeing/guaranteed negative dcache
> lookups for procfs. It seems to be working here, but I would really appreciate
> help with testing/review.
>
> First chunk follows, the rest will go in separate mails.
>
> diff -urN C8-0/include/linux/sched.h C8-unhash_process/include/linux/sched.h
> --- C8-0/include/linux/sched.h Sun Apr 14 17:53:12 2002
> +++ C8-unhash_process/include/linux/sched.h Fri Apr 19 01:16:35 2002
Patch 1 of 5 failed to apply to 2.5.8.
Which version are these built against?
Hanna
[email protected]
On Mon, 22 Apr 2002, Hanna Linder wrote:
> Patch 1 of 5 failed to apply to 2.5.8.
> Which version are these built against?
???
Applies clean to 2.5.8 here. md5 of files in question:
ed29a4584c15c347ac21362e9359d204 C8/kernel/exit.c
266abe14ebf56a6c54b750f1250d98dd C8/include/linux/sched.h
(15 minutes later) md5 of these files in tree from ftp.kernel.org:
ed29a4584c15c347ac21362e9359d204 linux-2.5.8/kernel/exit.c
266abe14ebf56a6c54b750f1250d98dd linux-2.5.8/include/linux/sched.h
Check your tree...