2010-11-23 14:11:57

by Nick Piggin

[permalink] [raw]
Subject: [patch 7/7] fs: fix or note I_DIRTY handling bugs in filesystems

Comments?

Index: linux-2.6/drivers/staging/pohmelfs/inode.c
===================================================================
--- linux-2.6.orig/drivers/staging/pohmelfs/inode.c 2010-11-23 22:57:45.000000000 +1100
+++ linux-2.6/drivers/staging/pohmelfs/inode.c 2010-11-23 22:59:47.000000000 +1100
@@ -373,6 +373,7 @@ static int pohmelfs_write_inode_create_c
dprintk("%s: parent: %llu, ino: %llu, inode: %p.\n",
__func__, parent->ino, n->ino, inode);

+ /* XXX: is this race WRT writeback? */
if (inode && (inode->i_state & I_DIRTY)) {
struct pohmelfs_inode *pi = POHMELFS_I(inode);
pohmelfs_write_create_inode(pi);
Index: linux-2.6/fs/gfs2/file.c
===================================================================
--- linux-2.6.orig/fs/gfs2/file.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/gfs2/file.c 2010-11-24 00:58:42.000000000 +1100
@@ -557,23 +557,43 @@ static int gfs2_close(struct inode *inod
static int gfs2_fsync(struct file *file, int datasync)
{
struct inode *inode = file->f_mapping->host;
- int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
+ unsigned dirty, mask;
int ret = 0;

- /* XXX: testing i_state is broken without proper synchronization */
-
if (gfs2_is_jdata(GFS2_I(inode))) {
gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
return 0;
}

- if (sync_state != 0) {
- if (!datasync)
- ret = write_inode_now(inode, 0);
+ spin_lock(&inode_lock);
+ inode_writeback_begin(inode, 1);
+
+ if (datasync)
+ mask = I_DIRTY_DATASYNC;
+ else
+ mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+ dirty = inode->i_state & mask;
+ inode->i_state &= ~mask;
+ if (dirty) {
+ spin_unlock(&inode_lock);
+
+ if (!datasync) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ };
+ ret = inode->i_sb->s_op->write_inode(inode, &wbc);
+ } else {
+ if (gfs2_is_stuffed(GFS2_I(inode)))
+ gfs2_log_flush(GFS2_SB(inode),
+ GFS2_I(inode)->i_gl);
+ }

- if (gfs2_is_stuffed(GFS2_I(inode)))
- gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+ spin_lock(&inode_lock);
}
+ if (ret)
+ inode->i_state |= dirty;
+ inode_writeback_end(inode);
+ spin_unlock(&inode_lock);

return ret;
}
Index: linux-2.6/fs/jffs2/fs.c
===================================================================
--- linux-2.6.orig/fs/jffs2/fs.c 2010-11-23 22:54:46.000000000 +1100
+++ linux-2.6/fs/jffs2/fs.c 2010-11-23 22:59:47.000000000 +1100
@@ -361,6 +361,7 @@ void jffs2_dirty_inode(struct inode *ino
{
struct iattr iattr;

+ /* XXX: huh? How does this make sense? */
if (!(inode->i_state & I_DIRTY_DATASYNC)) {
D2(printk(KERN_DEBUG "jffs2_dirty_inode() not calling setattr() for ino #%lu\n", inode->i_ino));
return;
Index: linux-2.6/fs/jfs/file.c
===================================================================
--- linux-2.6.orig/fs/jfs/file.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/jfs/file.c 2010-11-24 00:38:08.000000000 +1100
@@ -19,6 +19,7 @@

#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/writeback.h>
#include <linux/quotaops.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
@@ -31,18 +32,34 @@
int jfs_fsync(struct file *file, int datasync)
{
struct inode *inode = file->f_mapping->host;
- int rc = 0;
+ unsigned dirty, mask;
+ int err = 0;

- if (!(inode->i_state & I_DIRTY) ||
- (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+ spin_lock(&inode_lock);
+ inode_writeback_begin(inode, 1);
+
+ if (datasync)
+ mask = I_DIRTY_DATASYNC;
+ else
+ mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+ dirty = inode->i_state & mask;
+ inode->i_state &= ~mask;
+ spin_unlock(&inode_lock);
+
+ if (!dirty) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
- return rc;
+ } else {
+ err = jfs_commit_inode(inode, 1);
}

- rc |= jfs_commit_inode(inode, 1);
+ spin_lock(&inode_lock);
+ if (err)
+ inode->i_state |= dirty;
+ inode_writeback_end(inode);
+ spin_unlock(&inode_lock);

- return rc ? -EIO : 0;
+ return err ? -EIO : 0;
}

static int jfs_open(struct inode *inode, struct file *file)
Index: linux-2.6/fs/nfsd/vfs.c
===================================================================
--- linux-2.6.orig/fs/nfsd/vfs.c 2010-11-23 22:57:45.000000000 +1100
+++ linux-2.6/fs/nfsd/vfs.c 2010-11-23 22:59:47.000000000 +1100
@@ -969,10 +969,9 @@ static int wait_for_concurrent_writes(st
dprintk("nfsd: write resume %d\n", task_pid_nr(current));
}

- if (inode->i_state & I_DIRTY) {
- dprintk("nfsd: write sync %d\n", task_pid_nr(current));
- err = vfs_fsync(file, 0);
- }
+ dprintk("nfsd: write sync %d\n", task_pid_nr(current));
+ err = vfs_fsync(file, 0);
+
last_ino = inode->i_ino;
last_dev = inode->i_sb->s_dev;
return err;
Index: linux-2.6/fs/ocfs2/file.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/file.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/ocfs2/file.c 2010-11-24 00:33:24.000000000 +1100
@@ -176,12 +176,24 @@ static int ocfs2_sync_file(struct file *
journal_t *journal;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned dirty, mask;

mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
file->f_path.dentry, file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name);

- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ spin_lock(&inode_lock);
+ inode_writeback_begin(inode, 1);
+ if (datasync)
+ mask = I_DIRTY_DATASYNC;
+ else
+ mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+ dirty = inode->i_state & mask;
+ inode->i_state &= ~mask;
+ spin_unlock(&inode_lock);
+
+ if (datasync && dirty) {
+
/*
* We still have to flush drive's caches to get data to the
* platter
@@ -195,6 +207,12 @@ static int ocfs2_sync_file(struct file *
err = jbd2_journal_force_commit(journal);

bail:
+ spin_lock(&inode_lock);
+ if (err)
+ inode->i_state |= dirty;
+ inode_writeback_end(inode);
+ spin_unlock(&inode_lock);
+
mlog_exit(err);

return (err < 0) ? -EIO : 0;
Index: linux-2.6/fs/ubifs/file.c
===================================================================
--- linux-2.6.orig/fs/ubifs/file.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/ubifs/file.c 2010-11-23 22:59:47.000000000 +1100
@@ -1313,11 +1313,9 @@ int ubifs_fsync(struct file *file, int d
* VFS has already synchronized dirty pages for this inode. Synchronize
* the inode unless this is a 'datasync()' call.
*/
- if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
- err = inode->i_sb->s_op->write_inode(inode, NULL);
- if (err)
- return err;
- }
+ err = sync_inode_metadata(inode, datasync, 1);
+ if (err)
+ return err;

/*
* Nodes related to this inode may still sit in a write-buffer. Flush
Index: linux-2.6/fs/ufs/truncate.c
===================================================================
--- linux-2.6.orig/fs/ufs/truncate.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/ufs/truncate.c 2010-11-23 22:59:47.000000000 +1100
@@ -479,7 +479,7 @@ int ufs_truncate(struct inode *inode, lo
retry |= ufs_trunc_tindirect (inode);
if (!retry)
break;
- if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
+ if (IS_SYNC(inode))
ufs_sync_inode (inode);
blk_run_address_space(inode->i_mapping);
yield();
Index: linux-2.6/fs/xfs/linux-2.6/xfs_file.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_file.c 2010-11-23 22:54:47.000000000 +1100
+++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c 2010-11-24 00:08:03.000000000 +1100
@@ -99,6 +99,7 @@ xfs_file_fsync(
struct xfs_trans *tp;
int error = 0;
int log_flushed = 0;
+ unsigned dirty, mask;

trace_xfs_file_fsync(ip);

@@ -132,9 +133,16 @@ xfs_file_fsync(
* might gets cleared when the inode gets written out via the AIL
* or xfs_iflush_cluster.
*/
- if (((inode->i_state & I_DIRTY_DATASYNC) ||
- ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
- ip->i_update_core) {
+ spin_lock(&inode_lock);
+ inode_writeback_begin(inode, 1);
+ if (datasync)
+ mask = I_DIRTY_DATASYNC;
+ else
+ mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
+ dirty = inode->i_state & mask;
+ inode->i_state &= ~mask;
+ spin_unlock(&inode_lock);
+ if (dirty && ip->i_update_core) {
/*
* Kick off a transaction to log the inode core to get the
* updates. The sync transaction will also force the log.
@@ -145,7 +153,7 @@ xfs_file_fsync(
XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
- return -error;
+ goto out;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);

@@ -197,6 +205,13 @@ xfs_file_fsync(
xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
}

+out:
+ spin_lock(&inode_lock);
+ if (error)
+ inode->i_state |= dirty;
+ inode_writeback_end(inode);
+ spin_unlock(&inode_lock);
+
return -error;
}

Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c 2010-11-23 23:53:57.000000000 +1100
+++ linux-2.6/fs/inode.c 2010-11-23 23:54:06.000000000 +1100
@@ -82,6 +82,7 @@ static struct hlist_head *inode_hashtabl
* the i_state of an inode while it is in use..
*/
DEFINE_SPINLOCK(inode_lock);
+EXPORT_SYMBOL(inode_lock);

/*
* iprune_sem provides exclusion between the kswapd or try_to_free_pages


2010-11-23 15:03:56

by Steven Whitehouse

[permalink] [raw]
Subject: Re: [patch 7/7] fs: fix or note I_DIRTY handling bugs in filesystems

Hi,

On Wed, 2010-11-24 at 01:06 +1100, [email protected] wrote:
> plain text document attachment (fs-fix-dirty-flags.patch)
> Comments?
>
> Index: linux-2.6/drivers/staging/pohmelfs/inode.c
> ===================================================================
> --- linux-2.6.orig/drivers/staging/pohmelfs/inode.c 2010-11-23 22:57:45.000000000 +1100
> +++ linux-2.6/drivers/staging/pohmelfs/inode.c 2010-11-23 22:59:47.000000000 +1100
> @@ -373,6 +373,7 @@ static int pohmelfs_write_inode_create_c
> dprintk("%s: parent: %llu, ino: %llu, inode: %p.\n",
> __func__, parent->ino, n->ino, inode);
>
> + /* XXX: is this race WRT writeback? */
> if (inode && (inode->i_state & I_DIRTY)) {
> struct pohmelfs_inode *pi = POHMELFS_I(inode);
> pohmelfs_write_create_inode(pi);
> Index: linux-2.6/fs/gfs2/file.c
> ===================================================================
> --- linux-2.6.orig/fs/gfs2/file.c 2010-11-23 22:54:47.000000000 +1100
> +++ linux-2.6/fs/gfs2/file.c 2010-11-24 00:58:42.000000000 +1100
> @@ -557,23 +557,43 @@ static int gfs2_close(struct inode *inod
> static int gfs2_fsync(struct file *file, int datasync)
> {
> struct inode *inode = file->f_mapping->host;
> - int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
> + unsigned dirty, mask;
> int ret = 0;
>
> - /* XXX: testing i_state is broken without proper synchronization */
> -
> if (gfs2_is_jdata(GFS2_I(inode))) {
> gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
> return 0;
> }
>
> - if (sync_state != 0) {
> - if (!datasync)
> - ret = write_inode_now(inode, 0);
> + spin_lock(&inode_lock);
> + inode_writeback_begin(inode, 1);
> +
> + if (datasync)
> + mask = I_DIRTY_DATASYNC;
> + else
> + mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
> + dirty = inode->i_state & mask;
> + inode->i_state &= ~mask;
> + if (dirty) {
> + spin_unlock(&inode_lock);
> +
> + if (!datasync) {
> + struct writeback_control wbc = {
> + .sync_mode = WB_SYNC_ALL,
> + };
> + ret = inode->i_sb->s_op->write_inode(inode, &wbc);
> + } else {
> + if (gfs2_is_stuffed(GFS2_I(inode)))
> + gfs2_log_flush(GFS2_SB(inode),
> + GFS2_I(inode)->i_gl);
> + }
>
> - if (gfs2_is_stuffed(GFS2_I(inode)))
> - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
> + spin_lock(&inode_lock);
> }
> + if (ret)
> + inode->i_state |= dirty;
> + inode_writeback_end(inode);
> + spin_unlock(&inode_lock);
>
> return ret;
> }

The GFS2 changes seem to make sense to me, so:
Acked-by: Steven Whitehouse <[email protected]>

Steve.

2010-11-23 22:52:07

by Dave Chinner

[permalink] [raw]
Subject: Re: [patch 7/7] fs: fix or note I_DIRTY handling bugs in filesystems

On Wed, Nov 24, 2010 at 01:06:17AM +1100, [email protected] wrote:
> Comments?

How did you test the changes?

> +++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c 2010-11-24 00:08:03.000000000 +1100
> @@ -99,6 +99,7 @@ xfs_file_fsync(
> struct xfs_trans *tp;
> int error = 0;
> int log_flushed = 0;
> + unsigned dirty, mask;
>
> trace_xfs_file_fsync(ip);
>
> @@ -132,9 +133,16 @@ xfs_file_fsync(
> * might gets cleared when the inode gets written out via the AIL
> * or xfs_iflush_cluster.
> */
> - if (((inode->i_state & I_DIRTY_DATASYNC) ||
> - ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
> - ip->i_update_core) {
> + spin_lock(&inode_lock);
> + inode_writeback_begin(inode, 1);
> + if (datasync)
> + mask = I_DIRTY_DATASYNC;
> + else
> + mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
> + dirty = inode->i_state & mask;
> + inode->i_state &= ~mask;
> + spin_unlock(&inode_lock);
> + if (dirty && ip->i_update_core) {

It looks to me like the pattern "inode_writeback_begin(); get dirty
state from i_state" repeated for each filesystem is wrong. The
inode_writeback_begin() helper does this:

inode->i_state &= ~I_DIRTY;

which clears all the dirty bits from the i_state, which means the
followup:

dirty = inode->i_state & mask;

will always result in a zero value for dirty. IOWs, this seems to
ensure that ->fsync never sees dirty inodes anymore. This will break
fsync on XFS, and probably on all the other filesystems you modified
to use this pattern as well.

Also, I think the pattern is racy with respect to concurrent page
cache dirtiers. i.e if the inode was dirtied between writeback and
->fsync() in vfs_fsync_range(), then this new code clears the
I_DIRTY_PAGES bit in i_state without writing back the dirty pages.

And FWIW, I'm not sure that we want to be propagating the inode_lock
into every filesystem...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2010-11-24 00:23:13

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 7/7] fs: fix or note I_DIRTY handling bugs in filesystems

On Wed, Nov 24, 2010 at 09:51:48AM +1100, Dave Chinner wrote:
> On Wed, Nov 24, 2010 at 01:06:17AM +1100, [email protected] wrote:
> > Comments?
>
> How did you test the changes?

Not widely as yet, just tested a few filesystems passed deadlock and
bug tests. It's just in RFC state as yet.


> > +++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c 2010-11-24 00:08:03.000000000 +1100
> > @@ -99,6 +99,7 @@ xfs_file_fsync(
> > struct xfs_trans *tp;
> > int error = 0;
> > int log_flushed = 0;
> > + unsigned dirty, mask;
> >
> > trace_xfs_file_fsync(ip);
> >
> > @@ -132,9 +133,16 @@ xfs_file_fsync(
> > * might gets cleared when the inode gets written out via the AIL
> > * or xfs_iflush_cluster.
> > */
> > - if (((inode->i_state & I_DIRTY_DATASYNC) ||
> > - ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
> > - ip->i_update_core) {
> > + spin_lock(&inode_lock);
> > + inode_writeback_begin(inode, 1);
> > + if (datasync)
> > + mask = I_DIRTY_DATASYNC;
> > + else
> > + mask = I_DIRTY_SYNC | I_DIRTY_DATASYNC;
> > + dirty = inode->i_state & mask;
> > + inode->i_state &= ~mask;
> > + spin_unlock(&inode_lock);
> > + if (dirty && ip->i_update_core) {
>
> It looks to me like the pattern "inode_writeback_begin(); get dirty
> state from i_state" repeated for each filesystem is wrong. The
> inode_writeback_begin() helper does this:
>
> inode->i_state &= ~I_DIRTY;
>
> which clears all the dirty bits from the i_state, which means the
> followup:
>
> dirty = inode->i_state & mask;
>
> will always result in a zero value for dirty. IOWs, this seems to
> ensure that ->fsync never sees dirty inodes anymore. This will break
> fsync on XFS, and probably on all the other filesystems you modified
> to use this pattern as well.

Yes, the helper needs to do inode->i_state &= ~I_DIRTY_PAGES. Good
catch, thanks.

I had I_DIRTY there because I was initially going to return the
dirty bits, however some cases want to check/clear bits at different
times (eg. background writeout wants to clear DIRTY_PAGES then do
the pagecache writeback, and then test/clear the metadata dirty bits).


> Also, I think the pattern is racy with respect to concurrent page
> cache dirtiers. i.e if the inode was dirtied between writeback and
> ->fsync() in vfs_fsync_range(), then this new code clears the
> I_DIRTY_PAGES bit in i_state without writing back the dirty pages.

That gets caught in the writeback_end helper, same way as for background
writeout. It's useful to do this for the fsync helper so that the inode
actually gets marked clean if the pagecache writeback cleaned
everything.

>
> And FWIW, I'm not sure that we want to be propagating the inode_lock
> into every filesystem...
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]