From: Pekka Enberg <[email protected]>
The revoke operation cannibalizes the revoked struct inode and removes it from
the inode cache thus forcing subsequent callers to look up the real inode.
Therefore we must make sure that while the revoke operation is in progress
(e.g. flushing dirty pages to disk) no one takes a new reference to the inode
and starts I/O on it.
Cc: Alan Cox <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
---
fs/inode.c | 1 +
fs/namei.c | 15 ++++++++++++++-
include/linux/fs.h | 3 +++
3 files changed, 18 insertions(+), 1 deletion(-)
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h 2007-11-23 09:58:11.000000000 +0200
+++ 2.6/include/linux/fs.h 2007-12-14 16:40:50.000000000 +0200
@@ -150,6 +150,7 @@ #define MS_MGC_MSK 0xffff0000
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
+#define S_REVOKE_LOCK 1024 /* Inode is being revoked */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -183,6 +184,7 @@ #define MS_MGC_MSK 0xffff0000
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
+#define IS_REVOKE_LOCKED(inode) ((inode)->i_flags & S_REVOKE_LOCK)
/* the read-only stuff doesn't really belong here, but any other place is
probably as bad and I don't want to create yet another include file. */
@@ -642,6 +644,7 @@ struct inode {
struct list_head inotify_watches; /* watches on this inode */
struct mutex inotify_mutex; /* protects the watches list */
#endif
+ wait_queue_head_t i_revoke_wait;
unsigned long i_state;
unsigned long dirtied_when; /* jiffies of first dirtying */
Index: 2.6/fs/inode.c
===================================================================
--- 2.6.orig/fs/inode.c 2007-10-26 09:36:45.000000000 +0300
+++ 2.6/fs/inode.c 2007-12-14 16:40:50.000000000 +0200
@@ -216,6 +216,7 @@ memset(inode, 0, sizeof(*inode));
INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
i_size_ordered_init(inode);
+ init_waitqueue_head(&inode->i_revoke_wait);
#ifdef CONFIG_INOTIFY
INIT_LIST_HEAD(&inode->inotify_watches);
mutex_init(&inode->inotify_mutex);
Index: 2.6/fs/namei.c
===================================================================
--- 2.6.orig/fs/namei.c 2007-10-26 09:36:48.000000000 +0300
+++ 2.6/fs/namei.c 2007-12-14 16:40:50.000000000 +0200
@@ -785,10 +785,23 @@ static int do_lookup(struct nameidata *n
struct path *path)
{
struct vfsmount *mnt = nd->mnt;
- struct dentry *dentry = __d_lookup(nd->dentry, name);
+ struct dentry *dentry;
+again:
+ dentry = __d_lookup(nd->dentry, name);
if (!dentry)
goto need_lookup;
+
+ if (dentry->d_inode && IS_REVOKE_LOCKED(dentry->d_inode)) {
+ int err;
+
+ err = wait_event_interruptible(dentry->d_inode->i_revoke_wait,
+ !IS_REVOKE_LOCKED(dentry->d_inode));
+ if (err)
+ return err;
+ goto again;
+ }
+
if (dentry->d_op && dentry->d_op->d_revalidate)
goto need_revalidate;
done:
Quoting Pekka J Enberg ([email protected]):
> From: Pekka Enberg <[email protected]>
>
> The revoke operation cannibalizes the revoked struct inode and removes it from
> the inode cache thus forcing subsequent callers to look up the real inode.
> Therefore we must make sure that while the revoke operation is in progress
> (e.g. flushing dirty pages to disk) no one takes a new reference to the inode
> and starts I/O on it.
>
> Cc: Alan Cox <[email protected]>
> Cc: Al Viro <[email protected]>
> Cc: Christoph Hellwig <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Signed-off-by: Pekka Enberg <[email protected]>
> ---
> fs/inode.c | 1 +
> fs/namei.c | 15 ++++++++++++++-
> include/linux/fs.h | 3 +++
> 3 files changed, 18 insertions(+), 1 deletion(-)
>
> Index: 2.6/include/linux/fs.h
> ===================================================================
> --- 2.6.orig/include/linux/fs.h 2007-11-23 09:58:11.000000000 +0200
> +++ 2.6/include/linux/fs.h 2007-12-14 16:40:50.000000000 +0200
> @@ -150,6 +150,7 @@ #define MS_MGC_MSK 0xffff0000
> #define S_NOCMTIME 128 /* Do not update file c/mtime */
> #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
> #define S_PRIVATE 512 /* Inode is fs-internal */
> +#define S_REVOKE_LOCK 1024 /* Inode is being revoked */
>
> /*
> * Note that nosuid etc flags are inode-specific: setting some file-system
> @@ -183,6 +184,7 @@ #define MS_MGC_MSK 0xffff0000
> #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
> #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
> #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
> +#define IS_REVOKE_LOCKED(inode) ((inode)->i_flags & S_REVOKE_LOCK)
>
> /* the read-only stuff doesn't really belong here, but any other place is
> probably as bad and I don't want to create yet another include file. */
> @@ -642,6 +644,7 @@ struct inode {
> struct list_head inotify_watches; /* watches on this inode */
> struct mutex inotify_mutex; /* protects the watches list */
> #endif
> + wait_queue_head_t i_revoke_wait;
>
> unsigned long i_state;
> unsigned long dirtied_when; /* jiffies of first dirtying */
> Index: 2.6/fs/inode.c
> ===================================================================
> --- 2.6.orig/fs/inode.c 2007-10-26 09:36:45.000000000 +0300
> +++ 2.6/fs/inode.c 2007-12-14 16:40:50.000000000 +0200
> @@ -216,6 +216,7 @@ memset(inode, 0, sizeof(*inode));
> INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
> INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
> i_size_ordered_init(inode);
> + init_waitqueue_head(&inode->i_revoke_wait);
> #ifdef CONFIG_INOTIFY
> INIT_LIST_HEAD(&inode->inotify_watches);
> mutex_init(&inode->inotify_mutex);
> Index: 2.6/fs/namei.c
> ===================================================================
> --- 2.6.orig/fs/namei.c 2007-10-26 09:36:48.000000000 +0300
> +++ 2.6/fs/namei.c 2007-12-14 16:40:50.000000000 +0200
> @@ -785,10 +785,23 @@ static int do_lookup(struct nameidata *n
> struct path *path)
> {
> struct vfsmount *mnt = nd->mnt;
> - struct dentry *dentry = __d_lookup(nd->dentry, name);
> + struct dentry *dentry;
>
> +again:
> + dentry = __d_lookup(nd->dentry, name);
> if (!dentry)
> goto need_lookup;
> +
> + if (dentry->d_inode && IS_REVOKE_LOCKED(dentry->d_inode)) {
Hi,
not sure whether this is a problem or not, but dentry->d_inode isn't
locked here, right? So nothing is keeping do_lookup() returning
with an inode which gets revoked between here and the return 0
a few lines down?
> + int err;
> +
> + err = wait_event_interruptible(dentry->d_inode->i_revoke_wait,
> + !IS_REVOKE_LOCKED(dentry->d_inode));
> + if (err)
> + return err;
> + goto again;
> + }
> +
> if (dentry->d_op && dentry->d_op->d_revalidate)
> goto need_revalidate;
> done:
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
Hi Serge,
(Thanks for looking at this. I appreciate the review!)
On Mon, 17 Dec 2007, [email protected] wrote:
> > struct vfsmount *mnt = nd->mnt;
> > - struct dentry *dentry = __d_lookup(nd->dentry, name);
> > + struct dentry *dentry;
> >
> > +again:
> > + dentry = __d_lookup(nd->dentry, name);
> > if (!dentry)
> > goto need_lookup;
> > +
> > + if (dentry->d_inode && IS_REVOKE_LOCKED(dentry->d_inode)) {
>
> not sure whether this is a problem or not, but dentry->d_inode isn't
> locked here, right? So nothing is keeping do_lookup() returning
> with an inode which gets revoked between here and the return 0
> a few lines down?
I assume you mean S_REVOKE_LOCK and not ->i_mutex, right?
The caller is supposed to block open(2) with chmod(2)/chattr(2) so while
revoke is in progress, you can get references to the _revoked inode_,
which is fine (operations on it will fail with EBADFS). The
->i_revoke_wait bits are there to make sure that while we revoke, you
can't get a _new reference_ to the inode until we're done.
Pekka
This is a relatively minor detail in the rather bigger context of this
patch, but...
> @@ -642,6 +644,7 @@ struct inode {
> struct list_head inotify_watches; /* watches on this inode */
> struct mutex inotify_mutex; /* protects the watches list */
> #endif
> + wait_queue_head_t i_revoke_wait;
That seems like a relatively hefty addition to every inode in the system
when revoke - I think - will be a fairly rare operation. Would there be
any significant cost to using a single, global revoke-wait queue instead
of growing the inode structure?
jon
Hi Jonathan,
(Thanks for the review!)
On Tue, 18 Dec 2007, Jonathan Corbet wrote:
> This is a relatively minor detail in the rather bigger context of this
> patch, but...
>
> > @@ -642,6 +644,7 @@ struct inode {
> > struct list_head inotify_watches; /* watches on this inode */
> > struct mutex inotify_mutex; /* protects the watches list */
> > #endif
> > + wait_queue_head_t i_revoke_wait;
>
> That seems like a relatively hefty addition to every inode in the system
> when revoke - I think - will be a fairly rare operation. Would there be
> any significant cost to using a single, global revoke-wait queue instead
> of growing the inode structure?
No, that's a good idea. I'll change it for the next patchset. Thanks!
Pekka
Quoting Pekka J Enberg ([email protected]):
> Hi Serge,
>
> (Thanks for looking at this. I appreciate the review!)
>
> On Mon, 17 Dec 2007, [email protected] wrote:
> > > struct vfsmount *mnt = nd->mnt;
> > > - struct dentry *dentry = __d_lookup(nd->dentry, name);
> > > + struct dentry *dentry;
> > >
> > > +again:
> > > + dentry = __d_lookup(nd->dentry, name);
> > > if (!dentry)
> > > goto need_lookup;
> > > +
> > > + if (dentry->d_inode && IS_REVOKE_LOCKED(dentry->d_inode)) {
> >
> > not sure whether this is a problem or not, but dentry->d_inode isn't
> > locked here, right? So nothing is keeping do_lookup() returning
> > with an inode which gets revoked between here and the return 0
> > a few lines down?
>
> I assume you mean S_REVOKE_LOCK and not ->i_mutex, right?
No I did mean the i_mutex since you take the i_mutex when you set
S_REVOKE_LOCK. So between that and the comment above do_lookup(),
I assumed you were trying to lock out concurrent do_lookups() returning
an inode whose revoke is starting at the same time.
But based on your next paragraph it sounds like I misunderstand your
locking.
> The caller is supposed to block open(2) with chmod(2)/chattr(2) so while
> revoke is in progress, you can get references to the _revoked inode_,
> which is fine (operations on it will fail with EBADFS). The
> ->i_revoke_wait bits are there to make sure that while we revoke, you
> can't get a _new reference_ to the inode until we're done.
And a new reference means through iget(), so if revoke starts
between the IS_REVOKE_LOCKED() check in do_lookup and its return,
it's ok bc we'll get a reference later on?
I'm a little confused but i'll keep looking.
thanks,
-serge
Hi,
On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> > I assume you mean S_REVOKE_LOCK and not ->i_mutex, right?
>
> No I did mean the i_mutex since you take the i_mutex when you set
> S_REVOKE_LOCK. So between that and the comment above do_lookup(),
> I assumed you were trying to lock out concurrent do_lookups() returning
> an inode whose revoke is starting at the same time.
No, I only use ->i_mutex for synchronizing the write to ->i_flags.
On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> > The caller is supposed to block open(2) with chmod(2)/chattr(2) so while
> > revoke is in progress, you can get references to the _revoked inode_,
> > which is fine (operations on it will fail with EBADFS). The
> > ->i_revoke_wait bits are there to make sure that while we revoke, you
> > can't get a _new reference_ to the inode until we're done.
>
> And a new reference means through iget(), so if revoke starts
> between the IS_REVOKE_LOCKED() check in do_lookup and its return,
> it's ok bc we'll get a reference later on?
Yes, as soon as we unhash the dentries and the inode, do_lookup() will try
to find a new inode with iget() but we need to wait before writeback on
the revoked inode is finished.
On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> I'm a little confused but i'll keep looking.
I don't blame you. The patch is missing the following "minor detail" which
is needed to avoid fs corruption...
Pekka
Index: 2.6/fs/revoke.c
===================================================================
--- 2.6.orig/fs/revoke.c 2007-12-16 19:57:40.000000000 +0200
+++ 2.6/fs/revoke.c 2007-12-19 18:03:13.000000000 +0200
@@ -426,6 +426,8 @@ int err = 0;
make_revoked_inode(inode);
remove_inode_hash(inode);
revoke_aliases(inode);
+
+ err = write_inode_now(inode, 1);
failed:
revoke_unlock(inode);
wake_up(&inode->i_revoke_wait);
Quoting Pekka J Enberg ([email protected]):
> Hi,
>
> On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> > > I assume you mean S_REVOKE_LOCK and not ->i_mutex, right?
> >
> > No I did mean the i_mutex since you take the i_mutex when you set
> > S_REVOKE_LOCK. So between that and the comment above do_lookup(),
> > I assumed you were trying to lock out concurrent do_lookups() returning
> > an inode whose revoke is starting at the same time.
>
> No, I only use ->i_mutex for synchronizing the write to ->i_flags.
duh.
> On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> > > The caller is supposed to block open(2) with chmod(2)/chattr(2) so while
> > > revoke is in progress, you can get references to the _revoked inode_,
> > > which is fine (operations on it will fail with EBADFS). The
> > > ->i_revoke_wait bits are there to make sure that while we revoke, you
> > > can't get a _new reference_ to the inode until we're done.
> >
> > And a new reference means through iget(), so if revoke starts
> > between the IS_REVOKE_LOCKED() check in do_lookup and its return,
> > it's ok bc we'll get a reference later on?
>
> Yes, as soon as we unhash the dentries and the inode, do_lookup() will try
> to find a new inode with iget() but we need to wait before writeback on
> the revoked inode is finished.
Ok, that makes sense. I'll let that sit for a short while and look
again :)
thanks,
-serge
> On Wed, 19 Dec 2007, Serge E. Hallyn wrote:
> > I'm a little confused but i'll keep looking.
>
> I don't blame you. The patch is missing the following "minor detail" which
> is needed to avoid fs corruption...
>
> Pekka
>
> Index: 2.6/fs/revoke.c
> ===================================================================
> --- 2.6.orig/fs/revoke.c 2007-12-16 19:57:40.000000000 +0200
> +++ 2.6/fs/revoke.c 2007-12-19 18:03:13.000000000 +0200
> @@ -426,6 +426,8 @@ int err = 0;
> make_revoked_inode(inode);
> remove_inode_hash(inode);
> revoke_aliases(inode);
> +
> + err = write_inode_now(inode, 1);
> failed:
> revoke_unlock(inode);
> wake_up(&inode->i_revoke_wait);