2001-04-30 17:04:02

by Marcelo Tosatti

[permalink] [raw]
Subject: [PATCH] fix fsync/fdatasync races


Hi,

The following patch implements a new super_operations "wait_inode"
operation on ext2 to fix the generic_osync_inode/fsync/fdatasync race I
mentioned sometime ago.

We still have to implement the wait_inode operation on _all_ block
filesystems to make them safe.

Comments?

diff --exclude-from=exclude -Nur linux.orig/fs/ext2/fsync.c linux/fs/ext2/fsync.c
--- linux.orig/fs/ext2/fsync.c Mon Apr 30 11:50:22 2001
+++ linux/fs/ext2/fsync.c Mon Apr 30 13:26:22 2001
@@ -44,10 +44,15 @@
int err;

err = fsync_inode_buffers(inode);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY)) {
+ wait_on_inode(inode);
+ wait_inode_bh(inode);
return err;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ }
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ wait_on_inode(inode);
return err;
+ }

err |= ext2_sync_inode(inode);
return err ? -EIO : 0;
diff --exclude-from=exclude -Nur linux.orig/fs/ext2/inode.c linux/fs/ext2/inode.c
--- linux.orig/fs/ext2/inode.c Mon Apr 30 11:50:22 2001
+++ linux/fs/ext2/inode.c Mon Apr 30 13:56:12 2001
@@ -1102,6 +1102,54 @@
return;
}

+int ext2_wait_inode(struct inode * inode)
+{
+ struct buffer_head *bh;
+ unsigned long block_group;
+ unsigned long group_desc;
+ unsigned long desc;
+ unsigned long block;
+ unsigned long offset;
+ struct ext2_group_desc * gdp;
+
+ if ((inode->i_ino != EXT2_ROOT_INO &&
+ inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+ inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+ ext2_error (inode->i_sb, "ext2_wait_inode",
+ "bad inode number: %lu", inode->i_ino);
+ return -EIO;
+ }
+ block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
+ if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) {
+ ext2_error (inode->i_sb, "ext2_wait_inode",
+ "group >= groups count");
+ return -EIO;
+ }
+ group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb);
+ desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1);
+ bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc];
+ if (!bh) {
+ ext2_error (inode->i_sb, "ext2_wait_inode",
+ "Descriptor not loaded");
+ return -EIO;
+ }
+ gdp = (struct ext2_group_desc *) bh->b_data;
+
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
+ EXT2_INODE_SIZE(inode->i_sb);
+ block = le32_to_cpu(gdp[desc].bg_inode_table) +
+ (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
+ bh = get_hash_table(inode->i_dev, block, inode->i_sb->s_blocksize);
+
+ if (bh)
+ wait_on_buffer(bh);
+
+ return 0;
+}
+
static int ext2_update_inode(struct inode * inode, int do_sync)
{
struct buffer_head * bh;
diff --exclude-from=exclude -Nur linux.orig/fs/ext2/super.c linux/fs/ext2/super.c
--- linux.orig/fs/ext2/super.c Mon Apr 30 11:50:22 2001
+++ linux/fs/ext2/super.c Mon Apr 30 13:31:12 2001
@@ -149,6 +149,7 @@
static struct super_operations ext2_sops = {
read_inode: ext2_read_inode,
write_inode: ext2_write_inode,
+ wait_inode: ext2_wait_inode,
put_inode: ext2_put_inode,
delete_inode: ext2_delete_inode,
put_super: ext2_put_super,
diff --exclude-from=exclude -Nur linux.orig/fs/inode.c linux/fs/inode.c
--- linux.orig/fs/inode.c Mon Apr 30 11:50:22 2001
+++ linux/fs/inode.c Mon Apr 30 13:45:52 2001
@@ -155,7 +155,7 @@
spin_unlock(&inode_lock);
}

-static void __wait_on_inode(struct inode * inode)
+void __wait_on_inode(struct inode * inode)
{
DECLARE_WAITQUEUE(wait, current);

@@ -170,13 +170,6 @@
current->state = TASK_RUNNING;
}

-static inline void wait_on_inode(struct inode *inode)
-{
- if (inode->i_state & I_LOCK)
- __wait_on_inode(inode);
-}
-
-
static inline void write_inode(struct inode *inode, int sync)
{
if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
@@ -406,8 +399,20 @@

if (sb) {
spin_lock(&inode_lock);
- while (inode->i_state & I_DIRTY)
+ if (inode->i_state & I_DIRTY)
sync_one(inode, sync);
+ /*
+ * If its a synchronous inode write operation,
+ * and the inode is still locked, we have to wait
+ * on the fs's underlying buffer_head to _guarantee_
+ * that the inode data safely reached disk.
+ */
+ else if ((inode->i_state & I_LOCK) && sync) {
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ wait_inode_bh(inode);
+ return;
+ }
spin_unlock(&inode_lock);
}
else
@@ -453,18 +458,35 @@
#else
err = fsync_inode_buffers(inode);
#endif
-
+
spin_lock(&inode_lock);
- if (!(inode->i_state & I_DIRTY))
- goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ if (!(inode->i_state & I_DIRTY)) {
+ spin_unlock(&inode_lock);
+ /*
+ * Even if the inode is not "dirty", we have to:
+ * 1) wait for it to become unlocked, guaranteeing that all
+ * inode dirty pages have been written to disk. and
+ * 2) wait for the buffer_head which represents the inode metadata
+ * to get unlocked, guaranteeing that dirty inode metadata has been
+ * written to disk.
+ *
+ * - Marcelo
+ */
+ wait_on_inode(inode);
+ wait_inode_bh(inode);
+ return err;
+ }
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ spin_unlock(&inode_lock);
+ /*
+ * Only rule #1 of the comment above is valid because here we
+ * don't care about potential dirty metadata being synced.
+ */
+ wait_on_inode(inode);
+ return err;
+ }
spin_unlock(&inode_lock);
write_inode_now(inode, 1);
- return err;
-
- out:
- spin_unlock(&inode_lock);
return err;
}

diff --exclude-from=exclude -Nur linux.orig/include/linux/ext2_fs.h linux/include/linux/ext2_fs.h
--- linux.orig/include/linux/ext2_fs.h Mon Apr 30 11:50:24 2001
+++ linux/include/linux/ext2_fs.h Mon Apr 30 13:35:46 2001
@@ -577,6 +577,7 @@

extern void ext2_read_inode (struct inode *);
extern void ext2_write_inode (struct inode *, int);
+extern int ext2_wait_inode (struct inode *);
extern void ext2_put_inode (struct inode *);
extern void ext2_delete_inode (struct inode *);
extern int ext2_sync_inode (struct inode *);
diff --exclude-from=exclude -Nur linux.orig/include/linux/fs.h linux/include/linux/fs.h
--- linux.orig/include/linux/fs.h Mon Apr 30 11:50:24 2001
+++ linux/include/linux/fs.h Mon Apr 30 13:32:34 2001
@@ -810,6 +810,7 @@
void (*read_inode2) (struct inode *, void *) ;
void (*dirty_inode) (struct inode *);
void (*write_inode) (struct inode *, int);
+ int (*wait_inode) (struct inode *);
void (*put_inode) (struct inode *);
void (*delete_inode) (struct inode *);
void (*put_super) (struct super_block *);
@@ -831,6 +832,19 @@
#define I_CLEAR 32

#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+
+extern void __wait_on_inode(struct inode * inode);
+static inline void wait_on_inode(struct inode *inode)
+{
+ if (inode->i_state & I_LOCK)
+ __wait_on_inode(inode);
+}
+
+static inline void wait_inode_bh(struct inode *inode)
+{
+ if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->wait_inode)
+ inode->i_sb->s_op->wait_inode(inode);
+}

extern void __mark_inode_dirty(struct inode *, int);
static inline void mark_inode_dirty(struct inode *inode)


2001-04-30 18:14:48

by Alexander Viro

[permalink] [raw]
Subject: Re: [PATCH] fix fsync/fdatasync races



On Mon, 30 Apr 2001, Marcelo Tosatti wrote:

>
> Hi,
>
> The following patch implements a new super_operations "wait_inode"
> operation on ext2 to fix the generic_osync_inode/fsync/fdatasync race I
> mentioned sometime ago.
>
> We still have to implement the wait_inode operation on _all_ block
> filesystems to make them safe.
>
> Comments?

Just one: we already have two copies of "find the on-disk inode by inumber"
code. No need to introduce the third one.

If we do it at all, please consider the patch below. It takes the
inode-loading stuff into struct ext2_inode *ext2_get_inode(sb, ino, bhp)
and does corresponding cleanup of ext2_read_inode() and ext2_update_inode().

Al

diff -urN S4/fs/ext2/inode.c S4-ext2_get_inode/fs/ext2/inode.c
--- S4/fs/ext2/inode.c Sat Apr 28 02:12:56 2001
+++ S4-ext2_get_inode/fs/ext2/inode.c Mon Apr 30 13:44:43 2001
@@ -958,56 +958,60 @@
mark_inode_dirty(inode);
}

-void ext2_read_inode (struct inode * inode)
+static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
+ struct buffer_head **p)
{
struct buffer_head * bh;
- struct ext2_inode * raw_inode;
unsigned long block_group;
- unsigned long group_desc;
- unsigned long desc;
unsigned long block;
unsigned long offset;
struct ext2_group_desc * gdp;

- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
- inode->i_ino != EXT2_ACL_DATA_INO &&
- inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "bad inode number: %lu", inode->i_ino);
- goto bad_inode;
- }
- block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
- if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "group >= groups count");
- goto bad_inode;
- }
- group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb);
- desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1);
- bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc];
- if (!bh) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "Descriptor not loaded");
- goto bad_inode;
- }
-
- gdp = (struct ext2_group_desc *) bh->b_data;
+ *p = NULL;
+ if ((ino != EXT2_ROOT_INO && ino != EXT2_ACL_IDX_INO &&
+ ino != EXT2_ACL_DATA_INO && ino < EXT2_FIRST_INO(sb)) ||
+ ino > le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count))
+ goto Einval;
+
+ block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
+ gdp = ext2_get_group_desc(sb, block_group, &bh);
+ if (!gdp)
+ goto Egdp;
/*
* Figure out the offset within the block group inode table
*/
- offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
- EXT2_INODE_SIZE(inode->i_sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
- (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
- if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
- ext2_error (inode->i_sb, "ext2_read_inode",
- "unable to read inode block - "
- "inode=%lu, block=%lu", inode->i_ino, block);
+ offset = ((ino - 1) % EXT2_INODES_PER_GROUP(sb))* EXT2_INODE_SIZE(sb);
+ block = le32_to_cpu(gdp->bg_inode_table) +
+ (offset >> EXT2_BLOCK_SIZE_BITS(sb));
+ if (!(bh = bread (sb->s_dev, block, sb->s_blocksize)))
+ goto Eio;
+
+ *p = bh;
+ offset &= (EXT2_BLOCK_SIZE(sb) - 1);
+ return (struct ext2_inode *) (bh->b_data + offset);
+
+Einval:
+ ext2_error (sb, "ext2_get_inode", "bad inode number: %lu", ino);
+ return ERR_PTR(-EINVAL);
+Eio:
+ ext2_error (sb, "ext2_get_inode",
+ "unable to read inode block - inode=%lu, block=%lu",
+ ino, block);
+Egdp:
+ return ERR_PTR(-EIO);
+}
+
+void ext2_read_inode (struct inode * inode)
+{
+ struct ext2_inode * raw_inode;
+ struct ext2_inode_info * ei = &inode->u.ext2_i;
+ ino_t ino = inode->i_ino;
+ struct buffer_head * bh;
+ unsigned long block;
+
+ raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
+ if (IS_ERR(raw_inode))
goto bad_inode;
- }
- offset &= (EXT2_BLOCK_SIZE(inode->i_sb) - 1);
- raw_inode = (struct ext2_inode *) (bh->b_data + offset);

inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -1021,13 +1025,13 @@
inode->i_atime = le32_to_cpu(raw_inode->i_atime);
inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
- inode->u.ext2_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
* the test is that same one that e2fsck uses
* NeilBrown 1999oct15
*/
- if (inode->i_nlink == 0 && (inode->i_mode == 0 || inode->u.ext2_i.i_dtime)) {
+ if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
/* this inode is deleted */
brelse (bh);
goto bad_inode;
@@ -1035,30 +1039,29 @@
inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
inode->i_version = ++event;
- inode->u.ext2_i.i_flags = le32_to_cpu(raw_inode->i_flags);
- inode->u.ext2_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
- inode->u.ext2_i.i_frag_no = raw_inode->i_frag;
- inode->u.ext2_i.i_frag_size = raw_inode->i_fsize;
- inode->u.ext2_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+ ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+ ei->i_frag_no = raw_inode->i_frag;
+ ei->i_frag_size = raw_inode->i_fsize;
+ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
if (S_ISDIR(inode->i_mode))
- inode->u.ext2_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
else {
- inode->u.ext2_i.i_high_size = le32_to_cpu(raw_inode->i_size_high);
+ ei->i_high_size = le32_to_cpu(raw_inode->i_size_high);
inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
}
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
- inode->u.ext2_i.i_prealloc_count = 0;
- inode->u.ext2_i.i_block_group = block_group;
+ ei->i_prealloc_count = 0;
+ ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);

/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
*/
for (block = 0; block < EXT2_N_BLOCKS; block++)
- inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+ ei->i_data[block] = raw_inode->i_block[block];

- if (inode->i_ino == EXT2_ACL_IDX_INO ||
- inode->i_ino == EXT2_ACL_DATA_INO)
+ if (ino == EXT2_ACL_IDX_INO || ino == EXT2_ACL_DATA_INO)
/* Nothing to do */ ;
else if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations;
@@ -1106,70 +1109,33 @@
{
struct buffer_head * bh;
struct ext2_inode * raw_inode;
- unsigned long block_group;
- unsigned long group_desc;
- unsigned long desc;
unsigned long block;
- unsigned long offset;
int err = 0;
- struct ext2_group_desc * gdp;
+ uid_t uid = inode->i_uid;
+ gid_t gid = inode->i_gid;

- if ((inode->i_ino != EXT2_ROOT_INO &&
- inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
- inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "bad inode number: %lu", inode->i_ino);
- return -EIO;
- }
- block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
- if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "group >= groups count");
+ raw_inode = ext2_get_inode(inode->i_sb, inode->i_ino, &bh);
+ if (IS_ERR(bh))
return -EIO;
- }
- group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb);
- desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1);
- bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc];
- if (!bh) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "Descriptor not loaded");
- return -EIO;
- }
- gdp = (struct ext2_group_desc *) bh->b_data;
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) *
- EXT2_INODE_SIZE(inode->i_sb);
- block = le32_to_cpu(gdp[desc].bg_inode_table) +
- (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb));
- if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
- ext2_error (inode->i_sb, "ext2_write_inode",
- "unable to read inode block - "
- "inode=%lu, block=%lu", inode->i_ino, block);
- return -EIO;
- }
- offset &= EXT2_BLOCK_SIZE(inode->i_sb) - 1;
- raw_inode = (struct ext2_inode *) (bh->b_data + offset);

raw_inode->i_mode = cpu_to_le16(inode->i_mode);
if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
if(!inode->u.ext2_i.i_dtime) {
- raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid));
- raw_inode->i_gid_high = cpu_to_le16(high_16_bits(inode->i_gid));
+ raw_inode->i_uid_high = cpu_to_le16(high_16_bits(uid));
+ raw_inode->i_gid_high = cpu_to_le16(high_16_bits(gid));
} else {
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
} else {
- raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(inode->i_uid));
- raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(uid));
+ raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(gid));
raw_inode->i_uid_high = 0;
raw_inode->i_gid_high = 0;
}
@@ -1218,8 +1184,7 @@
ll_rw_block (WRITE, 1, &bh);
wait_on_buffer (bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk ("IO error syncing ext2 inode ["
- "%s:%08lx]\n",
+ printk ("IO error syncing ext2 inode [%s:%08lx]\n",
bdevname(inode->i_dev), inode->i_ino);
err = -EIO;
}