LinuxLists.cc - [RFC][PATCH 1/8] get mount write in __dentry

2007-09-28 18:14:45

Subject: [RFC][PATCH 1/8] get mount write in __dentry_open()

The first three patches here fix actual bugs. I think
the last two will reduce the chance for any future bugs
to creep in. RFC for now.

--

This is a bug fix for the r/o bind mount patch set.

We need to ensure taking a mnt write on the mnt
referenced by any new struct file.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/open.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff -puN fs/open.c~get-write-in-__dentry_open fs/open.c
--- lxc/fs/open.c~get-write-in-__dentry_open 2007-09-28 11:03:18.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:18.000000000 -0700
@@ -778,9 +778,15 @@ static struct file *__dentry_open(struct
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
- error = get_write_access(inode);
+ error = mnt_want_write(mnt);
if (error)
goto cleanup_file;
+
+ error = get_write_access(inode);
+ if (error) {
+ mnt_drop_write(mnt);
+ goto cleanup_file;
+ }
}

f->f_mapping = inode->i_mapping;
@@ -820,8 +826,10 @@ static struct file *__dentry_open(struct

cleanup_all:
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITE)
+ if (f->f_mode & FMODE_WRITE) {
put_write_access(inode);
+ mnt_drop_write(mnt);
+ }
file_kill(f);
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
_

2007-09-28 18:13:45

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 2/8] move mnt_want_write() into open_namei_create()

In a moment, we're going to make may_open() stop doing
the mnt_want/drop_write() pair. Doing this first makes
the next patch simpler.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/namei.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)

diff -puN fs/namei.c~move-mnt_want_write-into-open_namei_create fs/namei.c
--- lxc/fs/namei.c~move-mnt_want_write-into-open_namei_create 2007-09-28 11:03:19.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:19.000000000 -0700
@@ -1686,6 +1686,19 @@ static int open_namei_create(struct name
int error;
struct dentry *dir = nd->dentry;

+ /*
+ * This ensures that the mnt stays writable
+ * over the vfs_create() call to may_open(),
+ * which takes a more persistent
+ * mnt_want_write().
+ */
+ error = mnt_want_write(nd->mnt);
+ if (error) {
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(nd->dentry);
+ return error;
+ }
+
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
error = vfs_create(dir->d_inode, path->dentry, mode, nd);
@@ -1693,9 +1706,16 @@ static int open_namei_create(struct name
dput(nd->dentry);
nd->dentry = path->dentry;
if (error)
- return error;
+ goto out;
/* Don't check for write permission, don't truncate */
- return may_open(nd, 0, flag & ~O_TRUNC);
+ error = may_open(nd, 0, flag & ~O_TRUNC);
+out:
+ /*
+ * if needed, may_open() has taken a write
+ * on the mnt for us, so we can release ours.
+ */
+ mnt_drop_write(nd->mnt);
+ return error;
}

/*
@@ -1778,11 +1798,7 @@ do_last:

/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = mnt_want_write(nd->mnt);
- if (error)
- goto exit_mutex_unlock;
error = open_namei_create(nd, &path, flag, mode);
- mnt_drop_write(nd->mnt);
if (error)
goto exit;
return 0;
_

2007-09-28 18:13:58

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 5/8] make open_namei() return a filp

If open_namei() succeeds, there is potentially a mnt_want_write()
that needs to get balanced. If the caller doesn't create a
'struct file' and eventually __fput() it, or manually drop the
write count on an error, we have a bug.

Forcing open_namei() to return a filp fixes this. Any caller
getting a 'struct file' back must consider that filp instantiated
and fput() it normally. The callers no longer have to worry about
ever manually releasing a mnt write count.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/namei.c | 16 ++++++++--------
lxc-dave/fs/open.c | 7 +------
lxc-dave/include/linux/fs.h | 2 +-
3 files changed, 10 insertions(+), 15 deletions(-)

diff -puN fs/namei.c~make-open_namei-return-a-filp fs/namei.c
--- lxc/fs/namei.c~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:22.000000000 -0700
@@ -1750,8 +1750,8 @@ static inline int sys_open_flags_to_name
* system call. See sys_open_flags_to_namei_flags().
* SMP-safe
*/
-int open_namei(int dfd, const char *pathname, int sys_open_flag,
- int mode, struct nameidata *nd)
+struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
+ int mode, struct nameidata *nd)
{
int acc_mode, error;
struct path path;
@@ -1777,7 +1777,7 @@ int open_namei(int dfd, const char *path
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
nd, flag);
if (error)
- return error;
+ return ERR_PTR(error);
goto ok;
}

@@ -1786,7 +1786,7 @@ int open_namei(int dfd, const char *path
*/
error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
if (error)
- return error;
+ return ERR_PTR(error);

/*
* We have the parent and last component. First of all, check
@@ -1820,7 +1820,7 @@ do_last:
error = __open_namei_create(nd, &path, flag, mode);
if (error)
goto exit;
- return 0;
+ return nameidata_to_filp(nd, sys_open_flag);
}

/*
@@ -1864,7 +1864,7 @@ ok:
mnt_drop_write(nd->mnt);
goto exit;
}
- return 0;
+ return nameidata_to_filp(nd, sys_open_flag);

exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
@@ -1874,7 +1874,7 @@ exit:
if (!IS_ERR(nd->intent.open.file))
release_open_intent(nd);
path_release(nd);
- return error;
+ return ERR_PTR(error);

do_link:
error = -ELOOP;
@@ -1901,7 +1901,7 @@ do_link:
* with "intent.open".
*/
release_open_intent(nd);
- return error;
+ return ERR_PTR(error);
}
nd->flags &= ~LOOKUP_PARENT;
if (nd->last_type == LAST_BIND)
diff -puN fs/open.c~make-open_namei-return-a-filp fs/open.c
--- lxc/fs/open.c~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:22.000000000 -0700
@@ -843,14 +843,9 @@ cleanup_file:
static struct file *do_filp_open(int dfd, const char *filename, int flags,
int mode)
{
- int error;
struct nameidata nd;

- error = open_namei(dfd, filename, flags, mode, &nd);
- if (!error)
- return nameidata_to_filp(&nd, flags);
-
- return ERR_PTR(error);
+ return open_namei(dfd, filename, flags, mode, &nd);
}

struct file *filp_open(const char *filename, int flags, int mode)
diff -puN include/linux/fs.h~make-open_namei-return-a-filp include/linux/fs.h
--- lxc/include/linux/fs.h~make-open_namei-return-a-filp 2007-09-28 11:03:22.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:22.000000000 -0700
@@ -1721,7 +1721,7 @@ extern struct file *create_read_pipe(str
extern struct file *create_write_pipe(void);
extern void free_write_pipe(struct file *);

-extern int open_namei(int dfd, const char *, int, int, struct nameidata *);
+extern struct file *open_namei(int dfd, const char *, int, int, struct nameidata *);
extern int may_open(struct nameidata *, int, int);

extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
_

2007-09-28 18:14:20

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 3/8] move mnt_want_write() out of may_open()

may_open() can fail in a lot of ways. It is also named
such that it doesn't appear to be _taking_ action, just
checking a bunch of conditions.

So, it makes a poor place to take and release the mnt
writer count. This moves the burder of taking the
mnt writer counts into the callers. The callers have
the advantage of much more visibility of the filp.
Since we rely on the __fput(filp) to release the mnt
writer count, this is a good thing.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/namei.c | 41 +++++++++++++++++++++++------------------
lxc-dave/fs/nfsctl.c | 20 ++++++++++++++++++--
2 files changed, 41 insertions(+), 20 deletions(-)

diff -puN fs/namei.c~move-mnt_want_write-out-of-may_open fs/namei.c
--- lxc/fs/namei.c~move-mnt_want_write-out-of-may_open 2007-09-28 11:03:20.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:20.000000000 -0700
@@ -1621,14 +1621,6 @@ int may_open(struct nameidata *nd, int a
return -EACCES;

flag &= ~O_TRUNC;
- } else if (flag & FMODE_WRITE) {
- /*
- * effectively: !special_file()
- * balanced by __fput()
- */
- error = mnt_want_write(nd->mnt);
- if (error)
- return error;
}

error = vfs_permission(nd, acc_mode);
@@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
struct dentry *dir = nd->dentry;

/*
- * This ensures that the mnt stays writable
- * over the vfs_create() call to may_open(),
- * which takes a more persistent
- * mnt_want_write().
+ * This mnt_want_write() is potentially persistent,
+ * and balanced in __fput()
*/
error = mnt_want_write(nd->mnt);
if (error) {
@@ -1710,14 +1700,18 @@ static int open_namei_create(struct name
/* Don't check for write permission, don't truncate */
error = may_open(nd, 0, flag & ~O_TRUNC);
out:
- /*
- * if needed, may_open() has taken a write
- * on the mnt for us, so we can release ours.
- */
- mnt_drop_write(nd->mnt);
+ if (error)
+ mnt_drop_write(nd->mnt);
return error;
}

+static inline int write_may_occur_to_file(struct inode *inode, int acc_mode)
+{
+ if (special_file(inode->i_mode))
+ return 0;
+ return (acc_mode & FMODE_WRITE);
+}
+
/*
* open_namei()
*
@@ -1831,9 +1825,20 @@ do_last:
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
ok:
- error = may_open(nd, acc_mode, flag);
+ /*
+ * This mnt_want_write() is potentially persistent,
+ * and balanced in __fput()
+ */
+ if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
+ error = mnt_want_write(nd->mnt);
if (error)
goto exit;
+ error = may_open(nd, acc_mode, flag);
+ if (error) {
+ if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
+ mnt_drop_write(nd->mnt);
+ goto exit;
+ }
return 0;

exit_mutex_unlock:
diff -puN fs/nfsctl.c~move-mnt_want_write-out-of-may_open fs/nfsctl.c
--- lxc/fs/nfsctl.c~move-mnt_want_write-out-of-may_open 2007-09-28 11:03:20.000000000 -0700
+++ lxc-dave/fs/nfsctl.c 2007-09-28 11:03:20.000000000 -0700
@@ -22,6 +22,7 @@

static struct file *do_open(char *name, int flags)
{
+ struct file *filp;
struct nameidata nd;
struct vfsmount *mnt;
int error;
@@ -35,14 +36,29 @@ static struct file *do_open(char *name,
if (error)
return ERR_PTR(error);

+ /*
+ * This is balanced when you __fput() the 'struct file'
+ * returned by dentry_open() below. If the may_open()
+ * ever stops containing FMODE_WRITE, this needs to
+ * be changed.
+ */
+ error = mnt_want_write(mnt);
+ if (error)
+ return ERR_PTR(error);
if (flags == O_RDWR)
error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
else
error = may_open(&nd, MAY_WRITE, FMODE_WRITE);

- if (!error)
- return dentry_open(nd.dentry, nd.mnt, flags);
+ if (error)
+ goto out;

+ filp = dentry_open(nd.dentry, nd.mnt, flags);
+ if (!IS_ERR(filp))
+ return filp;
+ error = PTR_ERR(error);
+out:
+ mnt_drop_write(mnt);
path_release(&nd);
return ERR_PTR(error);
}
_

2007-09-28 18:14:33

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 4/8] do namei_flags calculation inside open_namei()

My end goal here is to make sure all users of may_open()
return filps. This will ensure that we properly release
mount write counts which were taken for the filp in
may_open().

This patch moves the sys_open flags to namei flags
calculation into fs/namei.c. We'll shortly be moving
the nameidata_to_filp() calls into namei.c, and this
gets the sys_open flags to a place where we can get
at them when we need them.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/namei.c | 43 ++++++++++++++++++++++++++++++++++---------
lxc-dave/fs/open.c | 22 ++--------------------
2 files changed, 36 insertions(+), 29 deletions(-)

diff -puN fs/namei.c~do-namei_flags-calculation-inside-open_namei fs/namei.c
--- lxc/fs/namei.c~do-namei_flags-calculation-inside-open_namei 2007-09-28 11:03:21.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:21.000000000 -0700
@@ -1672,7 +1672,12 @@ int may_open(struct nameidata *nd, int a
return 0;
}

-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function. Its flags must be in the namei format, not
+ * what get passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
int flag, int mode)
{
int error;
@@ -1713,26 +1718,46 @@ static inline int write_may_occur_to_fil
}

/*
+ * Note that while the flag value (low two bits) for sys_open means:
+ * 00 - read-only
+ * 01 - write-only
+ * 10 - read-write
+ * 11 - special
+ * it is changed into
+ * 00 - no permissions needed
+ * 01 - read-permission
+ * 10 - write-permission
+ * 11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
+ *
+*/
+static inline int sys_open_flags_to_namei_flags(int flag)
+{
+ if ((flag+1) & O_ACCMODE)
+ flag++;
+ return flag;
+}
+
+/*
* open_namei()
*
* namei for open - this is in fact almost the whole open-routine.
*
* Note that the low bits of "flag" aren't the same as in the open
- * system call - they are 00 - no permissions needed
- * 01 - read permission needed
- * 10 - write permission needed
- * 11 - read/write permissions needed
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
+ * system call. See sys_open_flags_to_namei_flags().
* SMP-safe
*/
-int open_namei(int dfd, const char *pathname, int flag,
+int open_namei(int dfd, const char *pathname, int sys_open_flag,
int mode, struct nameidata *nd)
{
int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
+ int flag = sys_open_flags_to_namei_flags(sys_open_flag);

acc_mode = ACC_MODE(flag);

@@ -1792,7 +1817,7 @@ do_last:

/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = open_namei_create(nd, &path, flag, mode);
+ error = __open_namei_create(nd, &path, flag, mode);
if (error)
goto exit;
return 0;
diff -puN fs/open.c~do-namei_flags-calculation-inside-open_namei fs/open.c
--- lxc/fs/open.c~do-namei_flags-calculation-inside-open_namei 2007-09-28 11:03:21.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:21.000000000 -0700
@@ -840,31 +840,13 @@ cleanup_file:
return ERR_PTR(error);
}

-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
static struct file *do_filp_open(int dfd, const char *filename, int flags,
int mode)
{
- int namei_flags, error;
+ int error;
struct nameidata nd;

- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
- namei_flags++;
-
- error = open_namei(dfd, filename, namei_flags, mode, &nd);
+ error = open_namei(dfd, filename, flags, mode, &nd);
if (!error)
return nameidata_to_filp(&nd, flags);

_

2007-09-28 18:14:59

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 7/8] kill filp_open()

Replace all callers with open_namei() directly, and move the
nameidata stack allocation into open_namei().

Does it make sense to still call it open_namei(), even though it
doesn't actually take a nameidata any more?

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/drivers/usb/gadget/file_storage.c | 5 +
lxc-dave/fs/exec.c | 2
lxc-dave/fs/namei.c | 77 ++++++++++++++---------------
lxc-dave/fs/open.c | 6 --
lxc-dave/fs/reiserfs/journal.c | 2
lxc-dave/include/linux/fs.h | 2
lxc-dave/kernel/acct.c | 2
lxc-dave/mm/swapfile.c | 4 -
lxc-dave/sound/sound_firmware.c | 2
9 files changed, 51 insertions(+), 51 deletions(-)

diff -puN drivers/usb/gadget/file_storage.c~kill-filp_open drivers/usb/gadget/file_storage.c
--- lxc/drivers/usb/gadget/file_storage.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/drivers/usb/gadget/file_storage.c 2007-09-28 11:03:27.000000000 -0700
@@ -3468,16 +3468,17 @@ static int open_backing_file(struct lun
struct inode *inode = NULL;
loff_t size;
loff_t num_sectors;
+ int mode = O_LARGEFILE;

/* R/W if we can, R/O if we must */
ro = curlun->ro;
if (!ro) {
- filp = filp_open(filename, O_RDWR | O_LARGEFILE, 0);
+ filp = open_namei(AT_FDCWD, filename, O_RDWR | mode, 0);
if (-EROFS == PTR_ERR(filp))
ro = 1;
}
if (ro)
- filp = filp_open(filename, O_RDONLY | O_LARGEFILE, 0);
+ filp = open_namei(AT_FDCWD, filename, O_RDONLY | mode, 0);
if (IS_ERR(filp)) {
LINFO(curlun, "unable to open backing file: %s\n", filename);
return PTR_ERR(filp);
diff -puN fs/exec.c~kill-filp_open fs/exec.c
--- lxc/fs/exec.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/exec.c 2007-09-28 11:03:27.000000000 -0700
@@ -1764,7 +1764,7 @@ int do_coredump(long signr, int exit_cod
goto fail_unlock;
}
} else
- file = filp_open(corename,
+ file = open_namei(AT_FDCWD, corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
if (IS_ERR(file))
diff -puN fs/namei.c~kill-filp_open fs/namei.c
--- lxc/fs/namei.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:27.000000000 -0700
@@ -1751,8 +1751,9 @@ static inline int sys_open_flags_to_name
* SMP-safe
*/
struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
- int mode, struct nameidata *nd)
+ int mode)
{
+ struct nameidata nd;
int acc_mode, error;
struct path path;
struct dentry *dir;
@@ -1775,7 +1776,7 @@ struct file *open_namei(int dfd, const c
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
- nd, flag);
+ &nd, flag);
if (error)
return ERR_PTR(error);
goto ok;
@@ -1784,7 +1785,7 @@ struct file *open_namei(int dfd, const c
/*
* Create - we need to know the parent.
*/
- error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+ error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,&nd,flag,mode);
if (error)
return ERR_PTR(error);

@@ -1794,14 +1795,14 @@ struct file *open_namei(int dfd, const c
* will not do.
*/
error = -EISDIR;
- if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+ if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
goto exit;

- dir = nd->dentry;
- nd->flags &= ~LOOKUP_PARENT;
+ dir = nd.dentry;
+ nd.flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->mnt;
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.mnt;

do_last:
error = PTR_ERR(path.dentry);
@@ -1810,17 +1811,17 @@ do_last:
goto exit;
}

- if (IS_ERR(nd->intent.open.file)) {
- error = PTR_ERR(nd->intent.open.file);
+ if (IS_ERR(nd.intent.open.file)) {
+ error = PTR_ERR(nd.intent.open.file);
goto exit_mutex_unlock;
}

/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = __open_namei_create(nd, &path, flag, mode);
+ error = __open_namei_create(&nd, &path, flag, mode);
if (error)
goto exit;
- return nameidata_to_filp(nd, sys_open_flag);
+ return nameidata_to_filp(&nd, sys_open_flag);
}

/*
@@ -1845,7 +1846,7 @@ do_last:
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
goto do_link;

- path_to_nameidata(&path, nd);
+ path_to_nameidata(&path, &nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
@@ -1854,26 +1855,26 @@ ok:
* This mnt_want_write() is potentially persistent,
* and balanced in __fput()
*/
- if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
- error = mnt_want_write(nd->mnt);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ error = mnt_want_write(nd.mnt);
if (error)
goto exit;
- error = may_open(nd, acc_mode, flag);
+ error = may_open(&nd, acc_mode, flag);
if (error) {
- if (write_may_occur_to_file(nd->dentry->d_inode, acc_mode))
- mnt_drop_write(nd->mnt);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ mnt_drop_write(nd.mnt);
goto exit;
}
- return nameidata_to_filp(nd, sys_open_flag);
+ return nameidata_to_filp(&nd, sys_open_flag);

exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
- dput_path(&path, nd);
+ dput_path(&path, &nd);
exit:
- if (!IS_ERR(nd->intent.open.file))
- release_open_intent(nd);
- path_release(nd);
+ if (!IS_ERR(&nd.intent.open.file))
+ release_open_intent(&nd);
+ path_release(&nd);
return ERR_PTR(error);

do_link:
@@ -1887,42 +1888,42 @@ do_link:
* After that we have the parent and last component, i.e.
* we are in the same situation as after the first path_walk().
* Well, almost - if the last component is normal we get its copy
- * stored in nd->last.name and we will have to putname() it when we
+ * stored in nd.last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
- nd->flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, nd);
+ nd.flags |= LOOKUP_PARENT;
+ error = security_inode_follow_link(path.dentry, &nd);
if (error)
goto exit_dput;
- error = __do_follow_link(&path, nd);
+ error = __do_follow_link(&path, &nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
- release_open_intent(nd);
+ release_open_intent(&nd);
return ERR_PTR(error);
}
- nd->flags &= ~LOOKUP_PARENT;
- if (nd->last_type == LAST_BIND)
+ nd.flags &= ~LOOKUP_PARENT;
+ if (nd.last_type == LAST_BIND)
goto ok;
error = -EISDIR;
- if (nd->last_type != LAST_NORM)
+ if (nd.last_type != LAST_NORM)
goto exit;
- if (nd->last.name[nd->last.len]) {
- __putname(nd->last.name);
+ if (nd.last.name[nd.last.len]) {
+ __putname(nd.last.name);
goto exit;
}
error = -ELOOP;
if (count++==32) {
- __putname(nd->last.name);
+ __putname(nd.last.name);
goto exit;
}
- dir = nd->dentry;
+ dir = nd.dentry;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->mnt;
- __putname(nd->last.name);
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.mnt;
+ __putname(nd.last.name);
goto do_last;
}

diff -puN fs/open.c~kill-filp_open fs/open.c
--- lxc/fs/open.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:27.000000000 -0700
@@ -842,8 +842,7 @@ cleanup_file:

struct file *filp_open(const char *filename, int flags, int mode)
{
- struct nameidata nd;
- return open_namei(AT_FDCWD, filename, flags, mode, &nd);
+ return open_namei(AT_FDCWD, filename, flags, mode);
}
EXPORT_SYMBOL(filp_open);

@@ -1047,8 +1046,7 @@ long do_sys_open(int dfd, const char __u

fd = get_unused_fd_flags(flags);
if (fd >= 0) {
- struct nameidata nd;
- struct file *f = open_namei(dfd, tmp, flags, mode, &nd);
+ struct file *f = open_namei(dfd, tmp, flags, mode);

if (IS_ERR(f)) {
put_unused_fd(fd);
diff -puN fs/reiserfs/journal.c~kill-filp_open fs/reiserfs/journal.c
--- lxc/fs/reiserfs/journal.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/fs/reiserfs/journal.c 2007-09-28 11:03:27.000000000 -0700
@@ -2623,7 +2623,7 @@ static int journal_init_dev(struct super
return 0;
}

- journal->j_dev_file = filp_open(jdev_name, 0, 0);
+ journal->j_dev_file = open_namei(AT_FDCWD, jdev_name, 0, 0);
if (!IS_ERR(journal->j_dev_file)) {
struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
if (!S_ISBLK(jdev_inode->i_mode)) {
diff -puN include/linux/fs.h~kill-filp_open include/linux/fs.h
--- lxc/include/linux/fs.h~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:27.000000000 -0700
@@ -1721,7 +1721,7 @@ extern struct file *create_read_pipe(str
extern struct file *create_write_pipe(void);
extern void free_write_pipe(struct file *);

-extern struct file *open_namei(int dfd, const char *, int, int, struct nameidata *);
+extern struct file *open_namei(int dfd, const char *, int, int);
extern int may_open(struct nameidata *, int, int);

extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
diff -puN kernel/acct.c~kill-filp_open kernel/acct.c
--- lxc/kernel/acct.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/kernel/acct.c 2007-09-28 11:03:27.000000000 -0700
@@ -208,7 +208,7 @@ static int acct_on(char *name)
int error;

/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ file = open_namei(AT_FDCWD, name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file))
return PTR_ERR(file);

diff -puN mm/swapfile.c~kill-filp_open mm/swapfile.c
--- lxc/mm/swapfile.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/mm/swapfile.c 2007-09-28 11:03:27.000000000 -0700
@@ -1198,7 +1198,7 @@ asmlinkage long sys_swapoff(const char _
if (IS_ERR(pathname))
goto out;

- victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
+ victim = open_namei(AT_FDCWD, pathname, O_RDWR|O_LARGEFILE, 0);
putname(pathname);
err = PTR_ERR(victim);
if (IS_ERR(victim))
@@ -1477,7 +1477,7 @@ asmlinkage long sys_swapon(const char __
name = NULL;
goto bad_swap_2;
}
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ swap_file = open_namei(AT_FDCWD, name, O_RDWR|O_LARGEFILE, 0);
error = PTR_ERR(swap_file);
if (IS_ERR(swap_file)) {
swap_file = NULL;
diff -puN sound/sound_firmware.c~kill-filp_open sound/sound_firmware.c
--- lxc/sound/sound_firmware.c~kill-filp_open 2007-09-28 11:03:27.000000000 -0700
+++ lxc-dave/sound/sound_firmware.c 2007-09-28 11:03:27.000000000 -0700
@@ -14,7 +14,7 @@ static int do_mod_firmware_load(const ch
char *dp;
loff_t pos;

- filp = filp_open(fn, 0, 0);
+ filp = open_namei(AT_FDCWD, fn, 0, 0);
if (IS_ERR(filp))
{
printk(KERN_INFO "Unable to load '%s'.\n", fn);
_

2007-09-28 18:15:28

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 6/8] kill do_filp_open()

This kills off the almost empty do_filp_open(). The indenting
change in do_sys_open() is because we would have gone over our
80 characters otherwise.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/open.c | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)

diff -puN fs/open.c~kill-do_filp_open fs/open.c
--- lxc/fs/open.c~kill-do_filp_open 2007-09-28 11:03:24.000000000 -0700
+++ lxc-dave/fs/open.c 2007-09-28 11:03:24.000000000 -0700
@@ -840,17 +840,10 @@ cleanup_file:
return ERR_PTR(error);
}

-static struct file *do_filp_open(int dfd, const char *filename, int flags,
- int mode)
-{
- struct nameidata nd;
-
- return open_namei(dfd, filename, flags, mode, &nd);
-}
-
struct file *filp_open(const char *filename, int flags, int mode)
{
- return do_filp_open(AT_FDCWD, filename, flags, mode);
+ struct nameidata nd;
+ return open_namei(AT_FDCWD, filename, flags, mode, &nd);
}
EXPORT_SYMBOL(filp_open);

@@ -1049,20 +1042,24 @@ long do_sys_open(int dfd, const char __u
char *tmp = getname(filename);
int fd = PTR_ERR(tmp);

- if (!IS_ERR(tmp)) {
- fd = get_unused_fd_flags(flags);
- if (fd >= 0) {
- struct file *f = do_filp_open(dfd, tmp, flags, mode);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else {
- fsnotify_open(f->f_path.dentry);
- fd_install(fd, f);
- }
+ if (IS_ERR(tmp))
+ goto out;
+
+ fd = get_unused_fd_flags(flags);
+ if (fd >= 0) {
+ struct nameidata nd;
+ struct file *f = open_namei(dfd, tmp, flags, mode, &nd);
+
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ } else {
+ fsnotify_open(f->f_path.dentry);
+ fd_install(fd, f);
}
- putname(tmp);
}
+ putname(tmp);
+out:
return fd;
}

_

2007-09-28 18:15:43

by Dave Hansen

[permalink] [raw]

Subject: [RFC][PATCH 8/8] keep track of mnt_writer state of struct file

There have been a few oopses caused by 'struct file's with
NULL f_vfsmnts. There was also a set of potentially missed
mnt_want_write()s from dentry_open() calls.

This patch provides a very simple debugging framework to
catch these kinds of bugs. It will WARN_ON() them, but
should stop us from having any oopses or mnt_writer
count imbalances.

Signed-off-by: Dave Hansen <[email protected]>
---

lxc-dave/fs/file_table.c | 21 +++++++++++++++++++--
lxc-dave/fs/namei.c | 15 +++++++++++++--
lxc-dave/include/linux/fs.h | 4 ++++
3 files changed, 36 insertions(+), 4 deletions(-)

diff -puN fs/file_table.c~keep-track-of-mnt_writer-state-of-struct-file fs/file_table.c
--- lxc/fs/file_table.c~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/fs/file_table.c 2007-09-28 11:03:50.000000000 -0700
@@ -42,6 +42,12 @@ static inline void file_free_rcu(struct
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
+ /*
+ * At this point, either both or neither of these bits
+ * should be set.
+ */
+ WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN);
+ WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}

@@ -194,6 +200,7 @@ int init_file(struct file *file, struct
file->f_mode = mode;
file->f_op = fop;
if (mode & FMODE_WRITE) {
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
error = mnt_want_write(mnt);
WARN_ON(error);
}
@@ -236,8 +243,18 @@ void fastcall __fput(struct file *file)
fops_put(file->f_op);
if (file->f_mode & FMODE_WRITE) {
put_write_access(inode);
- if (!special_file(inode->i_mode))
- mnt_drop_write(mnt);
+ if (!special_file(inode->i_mode)) {
+ if (file->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) {
+ mnt_drop_write(mnt);
+ file->f_mnt_write_state |=
+ FILE_MNT_WRITE_RELEASED;
+ } else {
+ printk(KERN_WARNING "__fput() of writeable "
+ "file with no "
+ "mnt_want_write()\n");
+ WARN_ON(1);
+ }
+ }
}
put_pid(file->f_owner.pid);
file_kill(file);
diff -puN fs/namei.c~keep-track-of-mnt_writer-state-of-struct-file fs/namei.c
--- lxc/fs/namei.c~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/fs/namei.c 2007-09-28 11:03:50.000000000 -0700
@@ -1753,6 +1753,7 @@ static inline int sys_open_flags_to_name
struct file *open_namei(int dfd, const char *pathname, int sys_open_flag,
int mode)
{
+ struct file *file;
struct nameidata nd;
int acc_mode, error;
struct path path;
@@ -1821,7 +1822,14 @@ do_last:
error = __open_namei_create(&nd, &path, flag, mode);
if (error)
goto exit;
- return nameidata_to_filp(&nd, sys_open_flag);
+ file = nameidata_to_filp(&nd, sys_open_flag);
+ /*
+ * The mnt_want_write happened in
+ * __open_namei_create(), but it
+ * happened unconditionally, so
+ * this is safe to assume.
+ */
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
}

/*
@@ -1865,7 +1873,10 @@ ok:
mnt_drop_write(nd.mnt);
goto exit;
}
- return nameidata_to_filp(&nd, sys_open_flag);
+ file = nameidata_to_filp(&nd, sys_open_flag);
+ if (write_may_occur_to_file(nd.dentry->d_inode, acc_mode))
+ file->f_mnt_write_state = FILE_MNT_WRITE_TAKEN;
+ return file;

exit_mutex_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
diff -puN include/linux/fs.h~keep-track-of-mnt_writer-state-of-struct-file include/linux/fs.h
--- lxc/include/linux/fs.h~keep-track-of-mnt_writer-state-of-struct-file 2007-09-28 11:03:50.000000000 -0700
+++ lxc-dave/include/linux/fs.h 2007-09-28 11:03:50.000000000 -0700
@@ -779,6 +779,9 @@ static inline int ra_has_index(struct fi
index < ra->start + ra->size);
}

+#define FILE_MNT_WRITE_TAKEN 1
+#define FILE_MNT_WRITE_RELEASED 2
+
struct file {
/*
* fu_list becomes invalid after file_free is called and queued via
@@ -813,6 +816,7 @@ struct file {
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
+ unsigned long f_mnt_write_state;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
_

2007-10-01 19:56:19

by Miklos Szeredi

[permalink] [raw]

Subject: Re: [RFC][PATCH 3/8] move mnt_want_write() out of may_open()

> @@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
> struct dentry *dir = nd->dentry;
>
> /*
> - * This ensures that the mnt stays writable
> - * over the vfs_create() call to may_open(),
> - * which takes a more persistent
> - * mnt_want_write().
> + * This mnt_want_write() is potentially persistent,
> + * and balanced in __fput()
> */
> error = mnt_want_write(nd->mnt);
> if (error) {

I'm confused: isn't it the mnt_want_write() in __dentry_open(), that
is balanced in __fput()?

Miklos

2007-10-01 20:10:43

by Dave Hansen

[permalink] [raw]

Subject: Re: [RFC][PATCH 3/8] move mnt_want_write() out of may_open()

On Mon, 2007-10-01 at 21:55 +0200, Miklos Szeredi wrote:
> > @@ -1687,10 +1679,8 @@ static int open_namei_create(struct name
> > struct dentry *dir = nd->dentry;
> >
> > /*
> > - * This ensures that the mnt stays writable
> > - * over the vfs_create() call to may_open(),
> > - * which takes a more persistent
> > - * mnt_want_write().
> > + * This mnt_want_write() is potentially persistent,
> > + * and balanced in __fput()
> > */
> > error = mnt_want_write(nd->mnt);
> > if (error) {
>
> I'm confused: isn't it the mnt_want_write() in __dentry_open(), that
> is balanced in __fput()?

This is broken. I didn't realize that nameidata_to_filp() called
dentry_open. I'll rework these.

-- Dave